{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.15451502053576513,
"eval_steps": 500,
"global_step": 6104,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007847256940708912,
"grad_norm": 26.94572639465332,
"learning_rate": 1.0157273918741808e-06,
"loss": 8.5879,
"step": 31
},
{
"epoch": 0.0015694513881417823,
"grad_norm": 14.633014678955078,
"learning_rate": 2.0314547837483616e-06,
"loss": 7.5048,
"step": 62
},
{
"epoch": 0.002354177082212673,
"grad_norm": 15.984803199768066,
"learning_rate": 3.0471821756225426e-06,
"loss": 6.1391,
"step": 93
},
{
"epoch": 0.0031389027762835646,
"grad_norm": 11.297175407409668,
"learning_rate": 4.062909567496723e-06,
"loss": 4.9299,
"step": 124
},
{
"epoch": 0.003923628470354455,
"grad_norm": 14.864474296569824,
"learning_rate": 5.078636959370905e-06,
"loss": 4.3205,
"step": 155
},
{
"epoch": 0.004708354164425346,
"grad_norm": 11.237608909606934,
"learning_rate": 6.094364351245085e-06,
"loss": 4.0,
"step": 186
},
{
"epoch": 0.005493079858496238,
"grad_norm": 23.79303550720215,
"learning_rate": 7.110091743119267e-06,
"loss": 3.7952,
"step": 217
},
{
"epoch": 0.006277805552567129,
"grad_norm": 15.1551513671875,
"learning_rate": 8.125819134993446e-06,
"loss": 3.689,
"step": 248
},
{
"epoch": 0.00706253124663802,
"grad_norm": 14.605571746826172,
"learning_rate": 9.141546526867629e-06,
"loss": 3.5147,
"step": 279
},
{
"epoch": 0.00784725694070891,
"grad_norm": 16.463390350341797,
"learning_rate": 1.015727391874181e-05,
"loss": 3.3901,
"step": 310
},
{
"epoch": 0.008631982634779801,
"grad_norm": 13.09945011138916,
"learning_rate": 1.117300131061599e-05,
"loss": 3.317,
"step": 341
},
{
"epoch": 0.009416708328850693,
"grad_norm": 11.993067741394043,
"learning_rate": 1.218872870249017e-05,
"loss": 3.2508,
"step": 372
},
{
"epoch": 0.010201434022921584,
"grad_norm": 10.388030052185059,
"learning_rate": 1.3204456094364351e-05,
"loss": 3.1239,
"step": 403
},
{
"epoch": 0.010986159716992476,
"grad_norm": 11.977804183959961,
"learning_rate": 1.4220183486238533e-05,
"loss": 3.0739,
"step": 434
},
{
"epoch": 0.011770885411063367,
"grad_norm": 8.925983428955078,
"learning_rate": 1.5235910878112714e-05,
"loss": 3.0169,
"step": 465
},
{
"epoch": 0.012555611105134258,
"grad_norm": 9.57411003112793,
"learning_rate": 1.6251638269986893e-05,
"loss": 2.959,
"step": 496
},
{
"epoch": 0.01334033679920515,
"grad_norm": 7.380288124084473,
"learning_rate": 1.7267365661861077e-05,
"loss": 2.8921,
"step": 527
},
{
"epoch": 0.01412506249327604,
"grad_norm": 8.812368392944336,
"learning_rate": 1.8283093053735257e-05,
"loss": 2.843,
"step": 558
},
{
"epoch": 0.014909788187346932,
"grad_norm": 8.870095252990723,
"learning_rate": 1.9298820445609438e-05,
"loss": 2.7895,
"step": 589
},
{
"epoch": 0.01569451388141782,
"grad_norm": 9.503872871398926,
"learning_rate": 2.031454783748362e-05,
"loss": 2.7757,
"step": 620
},
{
"epoch": 0.016479239575488712,
"grad_norm": 6.582827568054199,
"learning_rate": 2.13302752293578e-05,
"loss": 2.7099,
"step": 651
},
{
"epoch": 0.017263965269559603,
"grad_norm": 6.266632556915283,
"learning_rate": 2.234600262123198e-05,
"loss": 2.6729,
"step": 682
},
{
"epoch": 0.018048690963630494,
"grad_norm": 6.645415306091309,
"learning_rate": 2.336173001310616e-05,
"loss": 2.6616,
"step": 713
},
{
"epoch": 0.018833416657701385,
"grad_norm": 7.8323073387146,
"learning_rate": 2.437745740498034e-05,
"loss": 2.6291,
"step": 744
},
{
"epoch": 0.019618142351772276,
"grad_norm": 5.577521324157715,
"learning_rate": 2.5393184796854525e-05,
"loss": 2.6072,
"step": 775
},
{
"epoch": 0.020402868045843167,
"grad_norm": 5.603636264801025,
"learning_rate": 2.6408912188728702e-05,
"loss": 2.5787,
"step": 806
},
{
"epoch": 0.021187593739914058,
"grad_norm": 6.945438385009766,
"learning_rate": 2.7424639580602886e-05,
"loss": 2.5198,
"step": 837
},
{
"epoch": 0.021972319433984953,
"grad_norm": 5.6279826164245605,
"learning_rate": 2.8440366972477066e-05,
"loss": 2.5417,
"step": 868
},
{
"epoch": 0.022757045128055844,
"grad_norm": 5.517001628875732,
"learning_rate": 2.9456094364351244e-05,
"loss": 2.4849,
"step": 899
},
{
"epoch": 0.023541770822126735,
"grad_norm": 5.865486145019531,
"learning_rate": 3.0471821756225428e-05,
"loss": 2.5103,
"step": 930
},
{
"epoch": 0.024326496516197626,
"grad_norm": 4.949043273925781,
"learning_rate": 3.148754914809961e-05,
"loss": 2.4581,
"step": 961
},
{
"epoch": 0.025111222210268517,
"grad_norm": 4.701717853546143,
"learning_rate": 3.2503276539973785e-05,
"loss": 2.4315,
"step": 992
},
{
"epoch": 0.025895947904339408,
"grad_norm": 4.533145904541016,
"learning_rate": 3.351900393184797e-05,
"loss": 2.4056,
"step": 1023
},
{
"epoch": 0.0266806735984103,
"grad_norm": 4.724672794342041,
"learning_rate": 3.453473132372215e-05,
"loss": 2.3994,
"step": 1054
},
{
"epoch": 0.02746539929248119,
"grad_norm": 4.745669364929199,
"learning_rate": 3.555045871559633e-05,
"loss": 2.3546,
"step": 1085
},
{
"epoch": 0.02825012498655208,
"grad_norm": 4.4554948806762695,
"learning_rate": 3.6566186107470514e-05,
"loss": 2.3642,
"step": 1116
},
{
"epoch": 0.029034850680622972,
"grad_norm": 4.4792304039001465,
"learning_rate": 3.7581913499344695e-05,
"loss": 2.3296,
"step": 1147
},
{
"epoch": 0.029819576374693863,
"grad_norm": 3.9329679012298584,
"learning_rate": 3.8597640891218876e-05,
"loss": 2.3105,
"step": 1178
},
{
"epoch": 0.030604302068764754,
"grad_norm": 4.338287830352783,
"learning_rate": 3.9613368283093056e-05,
"loss": 2.2811,
"step": 1209
},
{
"epoch": 0.03138902776283564,
"grad_norm": 4.130499839782715,
"learning_rate": 4.062909567496724e-05,
"loss": 2.2898,
"step": 1240
},
{
"epoch": 0.03217375345690653,
"grad_norm": 3.5664470195770264,
"learning_rate": 4.164482306684142e-05,
"loss": 2.2786,
"step": 1271
},
{
"epoch": 0.032958479150977424,
"grad_norm": 3.642627716064453,
"learning_rate": 4.26605504587156e-05,
"loss": 2.2439,
"step": 1302
},
{
"epoch": 0.033743204845048315,
"grad_norm": 3.7562780380249023,
"learning_rate": 4.367627785058978e-05,
"loss": 2.2441,
"step": 1333
},
{
"epoch": 0.034527930539119206,
"grad_norm": 3.3117406368255615,
"learning_rate": 4.469200524246396e-05,
"loss": 2.2604,
"step": 1364
},
{
"epoch": 0.0353126562331901,
"grad_norm": 3.4313724040985107,
"learning_rate": 4.570773263433814e-05,
"loss": 2.2069,
"step": 1395
},
{
"epoch": 0.03609738192726099,
"grad_norm": 3.4720091819763184,
"learning_rate": 4.672346002621232e-05,
"loss": 2.2087,
"step": 1426
},
{
"epoch": 0.03688210762133188,
"grad_norm": 3.491856575012207,
"learning_rate": 4.77391874180865e-05,
"loss": 2.1808,
"step": 1457
},
{
"epoch": 0.03766683331540277,
"grad_norm": 3.3730666637420654,
"learning_rate": 4.875491480996068e-05,
"loss": 2.1907,
"step": 1488
},
{
"epoch": 0.03845155900947366,
"grad_norm": 2.894322395324707,
"learning_rate": 4.977064220183487e-05,
"loss": 2.1689,
"step": 1519
},
{
"epoch": 0.03923628470354455,
"grad_norm": 3.195884943008423,
"learning_rate": 4.9999915451558777e-05,
"loss": 2.194,
"step": 1550
},
{
"epoch": 0.04002101039761544,
"grad_norm": 3.154061794281006,
"learning_rate": 4.999955597496219e-05,
"loss": 2.1409,
"step": 1581
},
{
"epoch": 0.040805736091686334,
"grad_norm": 2.8204188346862793,
"learning_rate": 4.9998914381774255e-05,
"loss": 2.145,
"step": 1612
},
{
"epoch": 0.041590461785757225,
"grad_norm": 2.98260760307312,
"learning_rate": 4.999799067923527e-05,
"loss": 2.1523,
"step": 1643
},
{
"epoch": 0.042375187479828116,
"grad_norm": 2.917949914932251,
"learning_rate": 4.999678487776908e-05,
"loss": 2.1221,
"step": 1674
},
{
"epoch": 0.04315991317389901,
"grad_norm": 2.811469554901123,
"learning_rate": 4.9995296990983006e-05,
"loss": 2.1242,
"step": 1705
},
{
"epoch": 0.043944638867969905,
"grad_norm": 3.067636728286743,
"learning_rate": 4.999352703566763e-05,
"loss": 2.1092,
"step": 1736
},
{
"epoch": 0.044729364562040796,
"grad_norm": 2.6231868267059326,
"learning_rate": 4.999147503179668e-05,
"loss": 2.1018,
"step": 1767
},
{
"epoch": 0.04551409025611169,
"grad_norm": 2.8247616291046143,
"learning_rate": 4.998914100252672e-05,
"loss": 2.074,
"step": 1798
},
{
"epoch": 0.04629881595018258,
"grad_norm": 2.5960075855255127,
"learning_rate": 4.998652497419696e-05,
"loss": 2.0824,
"step": 1829
},
{
"epoch": 0.04708354164425347,
"grad_norm": 2.7796943187713623,
"learning_rate": 4.9983626976328927e-05,
"loss": 2.0998,
"step": 1860
},
{
"epoch": 0.04786826733832436,
"grad_norm": 2.49242901802063,
"learning_rate": 4.998044704162613e-05,
"loss": 2.0893,
"step": 1891
},
{
"epoch": 0.04865299303239525,
"grad_norm": 2.4294378757476807,
"learning_rate": 4.9976985205973705e-05,
"loss": 2.0617,
"step": 1922
},
{
"epoch": 0.04943771872646614,
"grad_norm": 2.553217649459839,
"learning_rate": 4.997324150843799e-05,
"loss": 2.0632,
"step": 1953
},
{
"epoch": 0.050222444420537034,
"grad_norm": 2.6711318492889404,
"learning_rate": 4.99692159912661e-05,
"loss": 2.0445,
"step": 1984
},
{
"epoch": 0.051007170114607925,
"grad_norm": 2.714432716369629,
"learning_rate": 4.996490869988546e-05,
"loss": 2.0185,
"step": 2015
},
{
"epoch": 0.051791895808678816,
"grad_norm": 2.6516053676605225,
"learning_rate": 4.996031968290326e-05,
"loss": 2.057,
"step": 2046
},
{
"epoch": 0.05257662150274971,
"grad_norm": 2.4798831939697266,
"learning_rate": 4.995544899210594e-05,
"loss": 2.0199,
"step": 2077
},
{
"epoch": 0.0533613471968206,
"grad_norm": 2.5150041580200195,
"learning_rate": 4.9950296682458583e-05,
"loss": 2.0264,
"step": 2108
},
{
"epoch": 0.05414607289089149,
"grad_norm": 2.637777805328369,
"learning_rate": 4.994486281210429e-05,
"loss": 2.0233,
"step": 2139
},
{
"epoch": 0.05493079858496238,
"grad_norm": 2.330376148223877,
"learning_rate": 4.9939147442363566e-05,
"loss": 2.0201,
"step": 2170
},
{
"epoch": 0.05571552427903327,
"grad_norm": 2.3436174392700195,
"learning_rate": 4.9933150637733574e-05,
"loss": 1.9865,
"step": 2201
},
{
"epoch": 0.05650024997310416,
"grad_norm": 2.7756845951080322,
"learning_rate": 4.992687246588743e-05,
"loss": 1.9983,
"step": 2232
},
{
"epoch": 0.05728497566717505,
"grad_norm": 2.1725504398345947,
"learning_rate": 4.992031299767347e-05,
"loss": 1.9689,
"step": 2263
},
{
"epoch": 0.058069701361245944,
"grad_norm": 2.2163312435150146,
"learning_rate": 4.9913472307114386e-05,
"loss": 1.9829,
"step": 2294
},
{
"epoch": 0.058854427055316835,
"grad_norm": 2.2829232215881348,
"learning_rate": 4.9906350471406446e-05,
"loss": 2.0142,
"step": 2325
},
{
"epoch": 0.059639152749387726,
"grad_norm": 2.239596366882324,
"learning_rate": 4.989894757091861e-05,
"loss": 1.9697,
"step": 2356
},
{
"epoch": 0.06042387844345862,
"grad_norm": 2.2926037311553955,
"learning_rate": 4.989126368919158e-05,
"loss": 1.9688,
"step": 2387
},
{
"epoch": 0.06120860413752951,
"grad_norm": 10.08767032623291,
"learning_rate": 4.988329891293693e-05,
"loss": 1.9845,
"step": 2418
},
{
"epoch": 0.0619933298316004,
"grad_norm": 2.2427194118499756,
"learning_rate": 4.987505333203608e-05,
"loss": 1.9744,
"step": 2449
},
{
"epoch": 0.06277805552567128,
"grad_norm": 2.5111870765686035,
"learning_rate": 4.9866527039539276e-05,
"loss": 1.9526,
"step": 2480
},
{
"epoch": 0.06356278121974218,
"grad_norm": 2.2100026607513428,
"learning_rate": 4.9857720131664594e-05,
"loss": 1.9826,
"step": 2511
},
{
"epoch": 0.06434750691381307,
"grad_norm": 2.2112088203430176,
"learning_rate": 4.9848632707796773e-05,
"loss": 1.9698,
"step": 2542
},
{
"epoch": 0.06513223260788396,
"grad_norm": 2.404014825820923,
"learning_rate": 4.9839264870486155e-05,
"loss": 1.9628,
"step": 2573
},
{
"epoch": 0.06591695830195485,
"grad_norm": 2.526423692703247,
"learning_rate": 4.9829616725447526e-05,
"loss": 1.9481,
"step": 2604
},
{
"epoch": 0.06670168399602575,
"grad_norm": 2.2506027221679688,
"learning_rate": 4.981968838155888e-05,
"loss": 1.9418,
"step": 2635
},
{
"epoch": 0.06748640969009663,
"grad_norm": 2.4334371089935303,
"learning_rate": 4.980947995086024e-05,
"loss": 1.9423,
"step": 2666
},
{
"epoch": 0.06827113538416753,
"grad_norm": 2.3028314113616943,
"learning_rate": 4.979899154855234e-05,
"loss": 1.9391,
"step": 2697
},
{
"epoch": 0.06905586107823841,
"grad_norm": 2.122143030166626,
"learning_rate": 4.9788223292995386e-05,
"loss": 1.933,
"step": 2728
},
{
"epoch": 0.06984058677230931,
"grad_norm": 2.1335129737854004,
"learning_rate": 4.977717530570768e-05,
"loss": 1.9212,
"step": 2759
},
{
"epoch": 0.0706253124663802,
"grad_norm": 2.198650598526001,
"learning_rate": 4.976584771136425e-05,
"loss": 1.9217,
"step": 2790
},
{
"epoch": 0.07141003816045109,
"grad_norm": 2.4985201358795166,
"learning_rate": 4.975424063779547e-05,
"loss": 1.9277,
"step": 2821
},
{
"epoch": 0.07219476385452198,
"grad_norm": 1.9877598285675049,
"learning_rate": 4.974235421598557e-05,
"loss": 1.9278,
"step": 2852
},
{
"epoch": 0.07297948954859287,
"grad_norm": 3.0082573890686035,
"learning_rate": 4.973018858007122e-05,
"loss": 1.9261,
"step": 2883
},
{
"epoch": 0.07376421524266376,
"grad_norm": 2.139742851257324,
"learning_rate": 4.9717743867339963e-05,
"loss": 1.9168,
"step": 2914
},
{
"epoch": 0.07454894093673466,
"grad_norm": 2.1748037338256836,
"learning_rate": 4.9705020218228695e-05,
"loss": 1.9132,
"step": 2945
},
{
"epoch": 0.07533366663080554,
"grad_norm": 2.0570950508117676,
"learning_rate": 4.969201777632205e-05,
"loss": 1.9177,
"step": 2976
},
{
"epoch": 0.07611839232487644,
"grad_norm": 1.9970216751098633,
"learning_rate": 4.9678736688350846e-05,
"loss": 1.9105,
"step": 3007
},
{
"epoch": 0.07690311801894732,
"grad_norm": 1.9640527963638306,
"learning_rate": 4.966517710419033e-05,
"loss": 1.9084,
"step": 3038
},
{
"epoch": 0.07768784371301822,
"grad_norm": 2.172874927520752,
"learning_rate": 4.965133917685858e-05,
"loss": 1.8995,
"step": 3069
},
{
"epoch": 0.0784725694070891,
"grad_norm": 2.1881916522979736,
"learning_rate": 4.9637223062514714e-05,
"loss": 1.9019,
"step": 3100
},
{
"epoch": 0.07925729510116,
"grad_norm": 1.975496530532837,
"learning_rate": 4.962282892045718e-05,
"loss": 1.8967,
"step": 3131
},
{
"epoch": 0.08004202079523089,
"grad_norm": 2.0970685482025146,
"learning_rate": 4.9608156913121904e-05,
"loss": 1.8867,
"step": 3162
},
{
"epoch": 0.08082674648930178,
"grad_norm": 2.096353769302368,
"learning_rate": 4.959320720608049e-05,
"loss": 1.8967,
"step": 3193
},
{
"epoch": 0.08161147218337267,
"grad_norm": 1.998336911201477,
"learning_rate": 4.9577979968038354e-05,
"loss": 1.8876,
"step": 3224
},
{
"epoch": 0.08239619787744357,
"grad_norm": 2.098055362701416,
"learning_rate": 4.956247537083282e-05,
"loss": 1.9,
"step": 3255
},
{
"epoch": 0.08318092357151445,
"grad_norm": 2.0739505290985107,
"learning_rate": 4.9546693589431145e-05,
"loss": 1.8902,
"step": 3286
},
{
"epoch": 0.08396564926558535,
"grad_norm": 1.9556243419647217,
"learning_rate": 4.9530634801928595e-05,
"loss": 1.888,
"step": 3317
},
{
"epoch": 0.08475037495965623,
"grad_norm": 2.096874952316284,
"learning_rate": 4.9514299189546395e-05,
"loss": 1.8785,
"step": 3348
},
{
"epoch": 0.08553510065372713,
"grad_norm": 1.9407072067260742,
"learning_rate": 4.949768693662973e-05,
"loss": 1.8646,
"step": 3379
},
{
"epoch": 0.08631982634779801,
"grad_norm": 1.9928467273712158,
"learning_rate": 4.948079823064559e-05,
"loss": 1.8751,
"step": 3410
},
{
"epoch": 0.08710455204186891,
"grad_norm": 1.9670037031173706,
"learning_rate": 4.946363326218074e-05,
"loss": 1.8831,
"step": 3441
},
{
"epoch": 0.08788927773593981,
"grad_norm": 1.999193787574768,
"learning_rate": 4.9446192224939525e-05,
"loss": 1.8605,
"step": 3472
},
{
"epoch": 0.0886740034300107,
"grad_norm": 1.9073724746704102,
"learning_rate": 4.942847531574167e-05,
"loss": 1.8576,
"step": 3503
},
{
"epoch": 0.08945872912408159,
"grad_norm": 2.179824113845825,
"learning_rate": 4.941048273452008e-05,
"loss": 1.8682,
"step": 3534
},
{
"epoch": 0.09024345481815248,
"grad_norm": 1.954990029335022,
"learning_rate": 4.9392214684318605e-05,
"loss": 1.8807,
"step": 3565
},
{
"epoch": 0.09102818051222338,
"grad_norm": 1.7695640325546265,
"learning_rate": 4.93736713712897e-05,
"loss": 1.879,
"step": 3596
},
{
"epoch": 0.09181290620629426,
"grad_norm": 1.7708550691604614,
"learning_rate": 4.9354853004692124e-05,
"loss": 1.8677,
"step": 3627
},
{
"epoch": 0.09259763190036516,
"grad_norm": 1.9683934450149536,
"learning_rate": 4.93357597968886e-05,
"loss": 1.8595,
"step": 3658
},
{
"epoch": 0.09338235759443604,
"grad_norm": 2.00441312789917,
"learning_rate": 4.931639196334338e-05,
"loss": 1.8462,
"step": 3689
},
{
"epoch": 0.09416708328850694,
"grad_norm": 1.875543475151062,
"learning_rate": 4.9296749722619826e-05,
"loss": 1.8502,
"step": 3720
},
{
"epoch": 0.09495180898257782,
"grad_norm": 1.932658314704895,
"learning_rate": 4.9276833296377966e-05,
"loss": 1.8457,
"step": 3751
},
{
"epoch": 0.09573653467664872,
"grad_norm": 1.9957045316696167,
"learning_rate": 4.925664290937196e-05,
"loss": 1.843,
"step": 3782
},
{
"epoch": 0.0965212603707196,
"grad_norm": 1.8579176664352417,
"learning_rate": 4.9236178789447576e-05,
"loss": 1.8504,
"step": 3813
},
{
"epoch": 0.0973059860647905,
"grad_norm": 1.9646131992340088,
"learning_rate": 4.921544116753962e-05,
"loss": 1.8512,
"step": 3844
},
{
"epoch": 0.09809071175886139,
"grad_norm": 1.8213136196136475,
"learning_rate": 4.919443027766935e-05,
"loss": 1.8618,
"step": 3875
},
{
"epoch": 0.09887543745293229,
"grad_norm": 2.017280101776123,
"learning_rate": 4.91731463569418e-05,
"loss": 1.863,
"step": 3906
},
{
"epoch": 0.09966016314700317,
"grad_norm": 1.9125665426254272,
"learning_rate": 4.915158964554312e-05,
"loss": 1.8259,
"step": 3937
},
{
"epoch": 0.10044488884107407,
"grad_norm": 2.0414695739746094,
"learning_rate": 4.912976038673786e-05,
"loss": 1.8347,
"step": 3968
},
{
"epoch": 0.10122961453514495,
"grad_norm": 1.7705485820770264,
"learning_rate": 4.9107658826866254e-05,
"loss": 1.8502,
"step": 3999
},
{
"epoch": 0.10201434022921585,
"grad_norm": 1.8961102962493896,
"learning_rate": 4.908528521534139e-05,
"loss": 1.84,
"step": 4030
},
{
"epoch": 0.10279906592328673,
"grad_norm": 1.784387230873108,
"learning_rate": 4.906263980464644e-05,
"loss": 1.842,
"step": 4061
},
{
"epoch": 0.10358379161735763,
"grad_norm": 11.229472160339355,
"learning_rate": 4.903972285033178e-05,
"loss": 1.8476,
"step": 4092
},
{
"epoch": 0.10436851731142852,
"grad_norm": 1.9657154083251953,
"learning_rate": 4.901653461101213e-05,
"loss": 1.8465,
"step": 4123
},
{
"epoch": 0.10515324300549941,
"grad_norm": 1.7702244520187378,
"learning_rate": 4.8993075348363626e-05,
"loss": 1.8249,
"step": 4154
},
{
"epoch": 0.1059379686995703,
"grad_norm": 1.8672112226486206,
"learning_rate": 4.896934532712084e-05,
"loss": 1.8232,
"step": 4185
},
{
"epoch": 0.1067226943936412,
"grad_norm": 1.7806147336959839,
"learning_rate": 4.8945344815073846e-05,
"loss": 1.8256,
"step": 4216
},
{
"epoch": 0.10750742008771208,
"grad_norm": 1.7830456495285034,
"learning_rate": 4.892107408306516e-05,
"loss": 1.8271,
"step": 4247
},
{
"epoch": 0.10829214578178298,
"grad_norm": 1.96640944480896,
"learning_rate": 4.889653340498669e-05,
"loss": 1.82,
"step": 4278
},
{
"epoch": 0.10907687147585386,
"grad_norm": 1.8224470615386963,
"learning_rate": 4.8871723057776664e-05,
"loss": 1.8216,
"step": 4309
},
{
"epoch": 0.10986159716992476,
"grad_norm": 2.5164501667022705,
"learning_rate": 4.8846643321416476e-05,
"loss": 1.8252,
"step": 4340
},
{
"epoch": 0.11064632286399564,
"grad_norm": 1.7248613834381104,
"learning_rate": 4.882129447892753e-05,
"loss": 1.8133,
"step": 4371
},
{
"epoch": 0.11143104855806654,
"grad_norm": 2.060304880142212,
"learning_rate": 4.8795676816368076e-05,
"loss": 1.8282,
"step": 4402
},
{
"epoch": 0.11221577425213743,
"grad_norm": 1.8709039688110352,
"learning_rate": 4.876979062282995e-05,
"loss": 1.8154,
"step": 4433
},
{
"epoch": 0.11300049994620832,
"grad_norm": 1.7444674968719482,
"learning_rate": 4.8743636190435325e-05,
"loss": 1.8173,
"step": 4464
},
{
"epoch": 0.11378522564027921,
"grad_norm": 1.7357319593429565,
"learning_rate": 4.871721381433344e-05,
"loss": 1.8351,
"step": 4495
},
{
"epoch": 0.1145699513343501,
"grad_norm": 1.728070855140686,
"learning_rate": 4.869052379269719e-05,
"loss": 1.8119,
"step": 4526
},
{
"epoch": 0.11535467702842099,
"grad_norm": 1.742035984992981,
"learning_rate": 4.866356642671985e-05,
"loss": 1.7967,
"step": 4557
},
{
"epoch": 0.11613940272249189,
"grad_norm": 1.7010915279388428,
"learning_rate": 4.8636342020611634e-05,
"loss": 1.8004,
"step": 4588
},
{
"epoch": 0.11692412841656277,
"grad_norm": 1.6775914430618286,
"learning_rate": 4.860885088159626e-05,
"loss": 1.8173,
"step": 4619
},
{
"epoch": 0.11770885411063367,
"grad_norm": 1.9107964038848877,
"learning_rate": 4.858109331990751e-05,
"loss": 1.7984,
"step": 4650
},
{
"epoch": 0.11849357980470455,
"grad_norm": 1.713429570198059,
"learning_rate": 4.855306964878567e-05,
"loss": 1.7967,
"step": 4681
},
{
"epoch": 0.11927830549877545,
"grad_norm": 1.9373931884765625,
"learning_rate": 4.8524780184474084e-05,
"loss": 1.8072,
"step": 4712
},
{
"epoch": 0.12006303119284634,
"grad_norm": 1.8975365161895752,
"learning_rate": 4.8496225246215496e-05,
"loss": 1.8121,
"step": 4743
},
{
"epoch": 0.12084775688691723,
"grad_norm": 5.285326957702637,
"learning_rate": 4.8467405156248505e-05,
"loss": 1.8189,
"step": 4774
},
{
"epoch": 0.12163248258098812,
"grad_norm": 1.7155263423919678,
"learning_rate": 4.843832023980392e-05,
"loss": 1.8093,
"step": 4805
},
{
"epoch": 0.12241720827505902,
"grad_norm": 1.726831316947937,
"learning_rate": 4.840897082510106e-05,
"loss": 1.7952,
"step": 4836
},
{
"epoch": 0.1232019339691299,
"grad_norm": 1.739639401435852,
"learning_rate": 4.8379357243344084e-05,
"loss": 1.8103,
"step": 4867
},
{
"epoch": 0.1239866596632008,
"grad_norm": 1.6978296041488647,
"learning_rate": 4.8349479828718236e-05,
"loss": 1.8006,
"step": 4898
},
{
"epoch": 0.12477138535727168,
"grad_norm": 1.7154194116592407,
"learning_rate": 4.8319338918386075e-05,
"loss": 1.7876,
"step": 4929
},
{
"epoch": 0.12555611105134257,
"grad_norm": 1.6323316097259521,
"learning_rate": 4.828893485248369e-05,
"loss": 1.8159,
"step": 4960
},
{
"epoch": 0.12634083674541347,
"grad_norm": 1.641784429550171,
"learning_rate": 4.825826797411682e-05,
"loss": 1.7959,
"step": 4991
},
{
"epoch": 0.12712556243948436,
"grad_norm": 1.6947154998779297,
"learning_rate": 4.822733862935702e-05,
"loss": 1.7895,
"step": 5022
},
{
"epoch": 0.12791028813355526,
"grad_norm": 1.6331220865249634,
"learning_rate": 4.819614716723775e-05,
"loss": 1.7707,
"step": 5053
},
{
"epoch": 0.12869501382762613,
"grad_norm": 1.8207937479019165,
"learning_rate": 4.8164693939750425e-05,
"loss": 1.8123,
"step": 5084
},
{
"epoch": 0.12947973952169703,
"grad_norm": 1.6664263010025024,
"learning_rate": 4.813297930184042e-05,
"loss": 1.8089,
"step": 5115
},
{
"epoch": 0.13026446521576793,
"grad_norm": 1.9931398630142212,
"learning_rate": 4.810100361140314e-05,
"loss": 1.7757,
"step": 5146
},
{
"epoch": 0.13104919090983883,
"grad_norm": 1.839200735092163,
"learning_rate": 4.8068767229279885e-05,
"loss": 1.7969,
"step": 5177
},
{
"epoch": 0.1318339166039097,
"grad_norm": 1.781187653541565,
"learning_rate": 4.8036270519253854e-05,
"loss": 1.7937,
"step": 5208
},
{
"epoch": 0.1326186422979806,
"grad_norm": 1.7144343852996826,
"learning_rate": 4.8003513848046e-05,
"loss": 1.7816,
"step": 5239
},
{
"epoch": 0.1334033679920515,
"grad_norm": 1.6819554567337036,
"learning_rate": 4.79704975853109e-05,
"loss": 1.7851,
"step": 5270
},
{
"epoch": 0.1341880936861224,
"grad_norm": 1.6748546361923218,
"learning_rate": 4.793722210363262e-05,
"loss": 1.7941,
"step": 5301
},
{
"epoch": 0.13497281938019326,
"grad_norm": 1.615569829940796,
"learning_rate": 4.7903687778520414e-05,
"loss": 1.7799,
"step": 5332
},
{
"epoch": 0.13575754507426416,
"grad_norm": 1.7959198951721191,
"learning_rate": 4.7869894988404593e-05,
"loss": 1.7802,
"step": 5363
},
{
"epoch": 0.13654227076833506,
"grad_norm": 1.598946452140808,
"learning_rate": 4.783584411463221e-05,
"loss": 1.7929,
"step": 5394
},
{
"epoch": 0.13732699646240595,
"grad_norm": 1.793511986732483,
"learning_rate": 4.780153554146274e-05,
"loss": 1.7591,
"step": 5425
},
{
"epoch": 0.13811172215647682,
"grad_norm": 1.718671202659607,
"learning_rate": 4.7766969656063766e-05,
"loss": 1.7807,
"step": 5456
},
{
"epoch": 0.13889644785054772,
"grad_norm": 1.6548669338226318,
"learning_rate": 4.773214684850662e-05,
"loss": 1.775,
"step": 5487
},
{
"epoch": 0.13968117354461862,
"grad_norm": 1.6727256774902344,
"learning_rate": 4.769706751176193e-05,
"loss": 1.7756,
"step": 5518
},
{
"epoch": 0.14046589923868952,
"grad_norm": 1.7169344425201416,
"learning_rate": 4.7661732041695264e-05,
"loss": 1.7887,
"step": 5549
},
{
"epoch": 0.1412506249327604,
"grad_norm": 1.6376421451568604,
"learning_rate": 4.762614083706258e-05,
"loss": 1.7939,
"step": 5580
},
{
"epoch": 0.14203535062683129,
"grad_norm": 1.7083207368850708,
"learning_rate": 4.759029429950581e-05,
"loss": 1.7705,
"step": 5611
},
{
"epoch": 0.14282007632090218,
"grad_norm": 1.6359349489212036,
"learning_rate": 4.7554192833548235e-05,
"loss": 1.7732,
"step": 5642
},
{
"epoch": 0.14360480201497308,
"grad_norm": 1.684005618095398,
"learning_rate": 4.751783684659e-05,
"loss": 1.7766,
"step": 5673
},
{
"epoch": 0.14438952770904395,
"grad_norm": 1.7531359195709229,
"learning_rate": 4.748122674890348e-05,
"loss": 1.7815,
"step": 5704
},
{
"epoch": 0.14517425340311485,
"grad_norm": 1.5898247957229614,
"learning_rate": 4.7444362953628654e-05,
"loss": 1.7837,
"step": 5735
},
{
"epoch": 0.14595897909718575,
"grad_norm": 1.6781623363494873,
"learning_rate": 4.7407245876768424e-05,
"loss": 1.7381,
"step": 5766
},
{
"epoch": 0.14674370479125665,
"grad_norm": 1.6126357316970825,
"learning_rate": 4.736987593718397e-05,
"loss": 1.7714,
"step": 5797
},
{
"epoch": 0.14752843048532752,
"grad_norm": 1.6623587608337402,
"learning_rate": 4.733225355658999e-05,
"loss": 1.7625,
"step": 5828
},
{
"epoch": 0.14831315617939841,
"grad_norm": 1.6715524196624756,
"learning_rate": 4.7294379159549926e-05,
"loss": 1.7631,
"step": 5859
},
{
"epoch": 0.1490978818734693,
"grad_norm": 1.6739026308059692,
"learning_rate": 4.725625317347119e-05,
"loss": 1.775,
"step": 5890
},
{
"epoch": 0.1498826075675402,
"grad_norm": 1.8141075372695923,
"learning_rate": 4.7217876028600374e-05,
"loss": 1.7881,
"step": 5921
},
{
"epoch": 0.15066733326161108,
"grad_norm": 1.6842069625854492,
"learning_rate": 4.717924815801832e-05,
"loss": 1.7707,
"step": 5952
},
{
"epoch": 0.15145205895568198,
"grad_norm": 1.7032698392868042,
"learning_rate": 4.714036999763532e-05,
"loss": 1.7631,
"step": 5983
},
{
"epoch": 0.15223678464975288,
"grad_norm": 1.7856013774871826,
"learning_rate": 4.7101241986186116e-05,
"loss": 1.7545,
"step": 6014
},
{
"epoch": 0.15302151034382377,
"grad_norm": 1.679623007774353,
"learning_rate": 4.7061864565225e-05,
"loss": 1.7676,
"step": 6045
},
{
"epoch": 0.15380623603789464,
"grad_norm": 1.626792073249817,
"learning_rate": 4.702223817912081e-05,
"loss": 1.7434,
"step": 6076
}
],
"logging_steps": 31,
"max_steps": 30517,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 3052,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.527890108044542e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}