|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.996935648621042, |
|
"eval_steps": 82, |
|
"global_step": 732, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0040858018386108275, |
|
"grad_norm": 4.75867223739624, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 1.3989, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0040858018386108275, |
|
"eval_loss": 1.7111468315124512, |
|
"eval_runtime": 5.4436, |
|
"eval_samples_per_second": 14.512, |
|
"eval_steps_per_second": 1.837, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008171603677221655, |
|
"grad_norm": 4.975377559661865, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 1.4837, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.012257405515832482, |
|
"grad_norm": 5.219729900360107, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.5181, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.01634320735444331, |
|
"grad_norm": 4.57335901260376, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 1.4106, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.020429009193054137, |
|
"grad_norm": 3.840559720993042, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 1.3763, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.024514811031664963, |
|
"grad_norm": 3.2056212425231934, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.1876, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.028600612870275793, |
|
"grad_norm": 2.6987595558166504, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 1.2154, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.03268641470888662, |
|
"grad_norm": 2.378502130508423, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 1.1594, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03677221654749745, |
|
"grad_norm": 1.7688865661621094, |
|
"learning_rate": 6e-06, |
|
"loss": 0.8435, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.04085801838610827, |
|
"grad_norm": 1.3263744115829468, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.7219, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0449438202247191, |
|
"grad_norm": 1.3509997129440308, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 0.8172, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.049029622063329927, |
|
"grad_norm": 1.4541417360305786, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.7393, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.05311542390194075, |
|
"grad_norm": 1.181699275970459, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 0.664, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.05720122574055159, |
|
"grad_norm": 0.9503294825553894, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 0.6222, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.06128702757916241, |
|
"grad_norm": 0.7614471316337585, |
|
"learning_rate": 1e-05, |
|
"loss": 0.56, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06537282941777324, |
|
"grad_norm": 0.9878801107406616, |
|
"learning_rate": 1.0666666666666667e-05, |
|
"loss": 0.5548, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06945863125638406, |
|
"grad_norm": 0.8131901025772095, |
|
"learning_rate": 1.1333333333333334e-05, |
|
"loss": 0.4878, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0735444330949949, |
|
"grad_norm": 0.7322743535041809, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.5159, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07763023493360573, |
|
"grad_norm": 0.6428759098052979, |
|
"learning_rate": 1.2666666666666667e-05, |
|
"loss": 0.4575, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.08171603677221655, |
|
"grad_norm": 0.562318742275238, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.4571, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08580183861082738, |
|
"grad_norm": 0.5707699060440063, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.4592, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0898876404494382, |
|
"grad_norm": 0.5272228717803955, |
|
"learning_rate": 1.4666666666666666e-05, |
|
"loss": 0.4457, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.09397344228804903, |
|
"grad_norm": 0.5120903253555298, |
|
"learning_rate": 1.5333333333333334e-05, |
|
"loss": 0.4034, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.09805924412665985, |
|
"grad_norm": 0.46359285712242126, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.4037, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.10214504596527069, |
|
"grad_norm": 0.49431198835372925, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.3875, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1062308478038815, |
|
"grad_norm": 0.4450273811817169, |
|
"learning_rate": 1.7333333333333336e-05, |
|
"loss": 0.3797, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.11031664964249234, |
|
"grad_norm": 0.4551868140697479, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.3512, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.11440245148110317, |
|
"grad_norm": 0.5083736777305603, |
|
"learning_rate": 1.866666666666667e-05, |
|
"loss": 0.3906, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.118488253319714, |
|
"grad_norm": 0.47295963764190674, |
|
"learning_rate": 1.9333333333333333e-05, |
|
"loss": 0.3554, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.12257405515832483, |
|
"grad_norm": 0.4848616123199463, |
|
"learning_rate": 2e-05, |
|
"loss": 0.3712, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.12665985699693566, |
|
"grad_norm": 0.4398118555545807, |
|
"learning_rate": 1.999989986294826e-05, |
|
"loss": 0.3694, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.13074565883554648, |
|
"grad_norm": 0.41183602809906006, |
|
"learning_rate": 1.9999599453798523e-05, |
|
"loss": 0.3336, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.1348314606741573, |
|
"grad_norm": 0.492713987827301, |
|
"learning_rate": 1.999909877856721e-05, |
|
"loss": 0.3657, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.13891726251276812, |
|
"grad_norm": 0.4517015516757965, |
|
"learning_rate": 1.9998397847281548e-05, |
|
"loss": 0.367, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.14300306435137897, |
|
"grad_norm": 0.4641965627670288, |
|
"learning_rate": 1.9997496673979375e-05, |
|
"loss": 0.3565, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1470888661899898, |
|
"grad_norm": 0.4812065064907074, |
|
"learning_rate": 1.9996395276708856e-05, |
|
"loss": 0.3773, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.1511746680286006, |
|
"grad_norm": 0.42300987243652344, |
|
"learning_rate": 1.999509367752813e-05, |
|
"loss": 0.3643, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.15526046986721145, |
|
"grad_norm": 0.4512963593006134, |
|
"learning_rate": 1.9993591902504854e-05, |
|
"loss": 0.3409, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.15934627170582227, |
|
"grad_norm": 0.41626426577568054, |
|
"learning_rate": 1.9991889981715696e-05, |
|
"loss": 0.3546, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1634320735444331, |
|
"grad_norm": 0.43549367785453796, |
|
"learning_rate": 1.9989987949245725e-05, |
|
"loss": 0.3091, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1675178753830439, |
|
"grad_norm": 0.4042600393295288, |
|
"learning_rate": 1.9987885843187717e-05, |
|
"loss": 0.3174, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.17160367722165476, |
|
"grad_norm": 0.4394363462924957, |
|
"learning_rate": 1.9985583705641418e-05, |
|
"loss": 0.3601, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.17568947906026558, |
|
"grad_norm": 0.4294170141220093, |
|
"learning_rate": 1.9983081582712684e-05, |
|
"loss": 0.3283, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.1797752808988764, |
|
"grad_norm": 0.44452300667762756, |
|
"learning_rate": 1.998037952451255e-05, |
|
"loss": 0.3367, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.18386108273748722, |
|
"grad_norm": 0.4113090932369232, |
|
"learning_rate": 1.9977477585156252e-05, |
|
"loss": 0.2986, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.18794688457609807, |
|
"grad_norm": 0.44443050026893616, |
|
"learning_rate": 1.9974375822762117e-05, |
|
"loss": 0.3463, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.1920326864147089, |
|
"grad_norm": 0.4303809106349945, |
|
"learning_rate": 1.9971074299450414e-05, |
|
"loss": 0.3281, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1961184882533197, |
|
"grad_norm": 0.4178621470928192, |
|
"learning_rate": 1.9967573081342103e-05, |
|
"loss": 0.3629, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.20020429009193055, |
|
"grad_norm": 0.38657113909721375, |
|
"learning_rate": 1.9963872238557516e-05, |
|
"loss": 0.3225, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.20429009193054137, |
|
"grad_norm": 0.5300270915031433, |
|
"learning_rate": 1.9959971845214953e-05, |
|
"loss": 0.3279, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.2083758937691522, |
|
"grad_norm": 0.4061177968978882, |
|
"learning_rate": 1.9955871979429188e-05, |
|
"loss": 0.3278, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.212461695607763, |
|
"grad_norm": 0.41504785418510437, |
|
"learning_rate": 1.9951572723309918e-05, |
|
"loss": 0.3096, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.21654749744637386, |
|
"grad_norm": 0.4208971858024597, |
|
"learning_rate": 1.9947074162960113e-05, |
|
"loss": 0.3187, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.22063329928498468, |
|
"grad_norm": 0.36819201707839966, |
|
"learning_rate": 1.9942376388474282e-05, |
|
"loss": 0.3167, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.2247191011235955, |
|
"grad_norm": 0.43327596783638, |
|
"learning_rate": 1.993747949393668e-05, |
|
"loss": 0.3188, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.22880490296220635, |
|
"grad_norm": 0.4377865791320801, |
|
"learning_rate": 1.9932383577419432e-05, |
|
"loss": 0.3478, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.23289070480081717, |
|
"grad_norm": 0.43336397409439087, |
|
"learning_rate": 1.992708874098054e-05, |
|
"loss": 0.3025, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.236976506639428, |
|
"grad_norm": 0.4399135410785675, |
|
"learning_rate": 1.9921595090661872e-05, |
|
"loss": 0.3098, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.2410623084780388, |
|
"grad_norm": 0.4253901243209839, |
|
"learning_rate": 1.991590273648702e-05, |
|
"loss": 0.3303, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.24514811031664965, |
|
"grad_norm": 0.39254307746887207, |
|
"learning_rate": 1.9910011792459086e-05, |
|
"loss": 0.3018, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.24923391215526047, |
|
"grad_norm": 0.4217659831047058, |
|
"learning_rate": 1.9903922376558432e-05, |
|
"loss": 0.285, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2533197139938713, |
|
"grad_norm": 0.48558109998703003, |
|
"learning_rate": 1.989763461074029e-05, |
|
"loss": 0.3221, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2574055158324821, |
|
"grad_norm": 0.47454214096069336, |
|
"learning_rate": 1.989114862093232e-05, |
|
"loss": 0.3056, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.26149131767109296, |
|
"grad_norm": 0.4013993442058563, |
|
"learning_rate": 1.9884464537032103e-05, |
|
"loss": 0.3376, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.26557711950970375, |
|
"grad_norm": 0.4264606237411499, |
|
"learning_rate": 1.9877582492904533e-05, |
|
"loss": 0.3158, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2696629213483146, |
|
"grad_norm": 0.5440453886985779, |
|
"learning_rate": 1.9870502626379127e-05, |
|
"loss": 0.3056, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.27374872318692545, |
|
"grad_norm": 0.40003377199172974, |
|
"learning_rate": 1.9863225079247286e-05, |
|
"loss": 0.3357, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.27783452502553624, |
|
"grad_norm": 0.39155763387680054, |
|
"learning_rate": 1.985574999725943e-05, |
|
"loss": 0.2819, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2819203268641471, |
|
"grad_norm": 0.4461009204387665, |
|
"learning_rate": 1.9848077530122083e-05, |
|
"loss": 0.2732, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.28600612870275793, |
|
"grad_norm": 0.38970062136650085, |
|
"learning_rate": 1.9840207831494903e-05, |
|
"loss": 0.2957, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2900919305413687, |
|
"grad_norm": 0.4369664788246155, |
|
"learning_rate": 1.983214105898757e-05, |
|
"loss": 0.3158, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.2941777323799796, |
|
"grad_norm": 0.4734659492969513, |
|
"learning_rate": 1.9823877374156647e-05, |
|
"loss": 0.3054, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2982635342185904, |
|
"grad_norm": 0.3933468461036682, |
|
"learning_rate": 1.9815416942502346e-05, |
|
"loss": 0.286, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.3023493360572012, |
|
"grad_norm": 0.4472273290157318, |
|
"learning_rate": 1.98067599334652e-05, |
|
"loss": 0.3149, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.30643513789581206, |
|
"grad_norm": 0.43143752217292786, |
|
"learning_rate": 1.979790652042268e-05, |
|
"loss": 0.2792, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3105209397344229, |
|
"grad_norm": 0.4325246512889862, |
|
"learning_rate": 1.978885688068572e-05, |
|
"loss": 0.3024, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.3146067415730337, |
|
"grad_norm": 0.48796600103378296, |
|
"learning_rate": 1.9779611195495177e-05, |
|
"loss": 0.3343, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.31869254341164455, |
|
"grad_norm": 0.40505748987197876, |
|
"learning_rate": 1.977016965001817e-05, |
|
"loss": 0.2753, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.32277834525025534, |
|
"grad_norm": 0.40753036737442017, |
|
"learning_rate": 1.976053243334442e-05, |
|
"loss": 0.3073, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.3268641470888662, |
|
"grad_norm": 0.4000149071216583, |
|
"learning_rate": 1.9750699738482403e-05, |
|
"loss": 0.284, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.33094994892747703, |
|
"grad_norm": 0.42099907994270325, |
|
"learning_rate": 1.9740671762355548e-05, |
|
"loss": 0.2881, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3350357507660878, |
|
"grad_norm": 0.4155902564525604, |
|
"learning_rate": 1.973044870579824e-05, |
|
"loss": 0.2969, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3350357507660878, |
|
"eval_loss": 0.31923907995224, |
|
"eval_runtime": 5.81, |
|
"eval_samples_per_second": 13.597, |
|
"eval_steps_per_second": 1.721, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3391215526046987, |
|
"grad_norm": 0.39282551407814026, |
|
"learning_rate": 1.972003077355183e-05, |
|
"loss": 0.2948, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.3432073544433095, |
|
"grad_norm": 0.4381943643093109, |
|
"learning_rate": 1.9709418174260523e-05, |
|
"loss": 0.3454, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.3472931562819203, |
|
"grad_norm": 0.4093382954597473, |
|
"learning_rate": 1.9698611120467196e-05, |
|
"loss": 0.2962, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.35137895812053116, |
|
"grad_norm": 0.450135737657547, |
|
"learning_rate": 1.9687609828609156e-05, |
|
"loss": 0.3243, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.355464759959142, |
|
"grad_norm": 0.4139018654823303, |
|
"learning_rate": 1.9676414519013782e-05, |
|
"loss": 0.2996, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.3595505617977528, |
|
"grad_norm": 0.40026575326919556, |
|
"learning_rate": 1.966502541589414e-05, |
|
"loss": 0.2788, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 0.36627820134162903, |
|
"learning_rate": 1.965344274734447e-05, |
|
"loss": 0.2857, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.36772216547497444, |
|
"grad_norm": 0.42685478925704956, |
|
"learning_rate": 1.9641666745335626e-05, |
|
"loss": 0.2995, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.3718079673135853, |
|
"grad_norm": 0.374288946390152, |
|
"learning_rate": 1.9629697645710432e-05, |
|
"loss": 0.3056, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.37589376915219613, |
|
"grad_norm": 0.3649786114692688, |
|
"learning_rate": 1.961753568817896e-05, |
|
"loss": 0.2854, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.3799795709908069, |
|
"grad_norm": 0.38573023676872253, |
|
"learning_rate": 1.9605181116313725e-05, |
|
"loss": 0.2667, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.3840653728294178, |
|
"grad_norm": 0.37577807903289795, |
|
"learning_rate": 1.9592634177544803e-05, |
|
"loss": 0.2815, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.3881511746680286, |
|
"grad_norm": 0.4320047199726105, |
|
"learning_rate": 1.957989512315489e-05, |
|
"loss": 0.3094, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3922369765066394, |
|
"grad_norm": 0.3816889524459839, |
|
"learning_rate": 1.9566964208274254e-05, |
|
"loss": 0.292, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.39632277834525026, |
|
"grad_norm": 0.3946669399738312, |
|
"learning_rate": 1.9553841691875632e-05, |
|
"loss": 0.3002, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.4004085801838611, |
|
"grad_norm": 0.36885613203048706, |
|
"learning_rate": 1.9540527836769047e-05, |
|
"loss": 0.2583, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.4044943820224719, |
|
"grad_norm": 0.37865176796913147, |
|
"learning_rate": 1.9527022909596537e-05, |
|
"loss": 0.2787, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.40858018386108275, |
|
"grad_norm": 0.4429585337638855, |
|
"learning_rate": 1.951332718082682e-05, |
|
"loss": 0.3226, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.41266598569969354, |
|
"grad_norm": 0.3926009237766266, |
|
"learning_rate": 1.9499440924749878e-05, |
|
"loss": 0.2914, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.4167517875383044, |
|
"grad_norm": 0.3467339277267456, |
|
"learning_rate": 1.9485364419471454e-05, |
|
"loss": 0.266, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.42083758937691523, |
|
"grad_norm": 0.4126642644405365, |
|
"learning_rate": 1.9471097946907506e-05, |
|
"loss": 0.2775, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.424923391215526, |
|
"grad_norm": 0.44586020708084106, |
|
"learning_rate": 1.9456641792778527e-05, |
|
"loss": 0.2884, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.4290091930541369, |
|
"grad_norm": 0.3969588279724121, |
|
"learning_rate": 1.9441996246603848e-05, |
|
"loss": 0.2835, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4330949948927477, |
|
"grad_norm": 0.38928356766700745, |
|
"learning_rate": 1.9427161601695833e-05, |
|
"loss": 0.2826, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.4371807967313585, |
|
"grad_norm": 0.4089799225330353, |
|
"learning_rate": 1.9412138155154e-05, |
|
"loss": 0.2817, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.44126659856996936, |
|
"grad_norm": 0.375505656003952, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 0.2882, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.4453524004085802, |
|
"grad_norm": 0.406118780374527, |
|
"learning_rate": 1.9381526064466995e-05, |
|
"loss": 0.2861, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.449438202247191, |
|
"grad_norm": 0.3882409334182739, |
|
"learning_rate": 1.9365938033402715e-05, |
|
"loss": 0.261, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.45352400408580185, |
|
"grad_norm": 0.4351583421230316, |
|
"learning_rate": 1.9350162426854152e-05, |
|
"loss": 0.3014, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.4576098059244127, |
|
"grad_norm": 0.3621097505092621, |
|
"learning_rate": 1.933419956076584e-05, |
|
"loss": 0.2728, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.4616956077630235, |
|
"grad_norm": 0.3881032466888428, |
|
"learning_rate": 1.9318049754832656e-05, |
|
"loss": 0.2736, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.46578140960163433, |
|
"grad_norm": 0.37627285718917847, |
|
"learning_rate": 1.9301713332493386e-05, |
|
"loss": 0.2707, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.4698672114402451, |
|
"grad_norm": 0.4285913109779358, |
|
"learning_rate": 1.9285190620924267e-05, |
|
"loss": 0.2815, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.473953013278856, |
|
"grad_norm": 0.35718926787376404, |
|
"learning_rate": 1.926848195103242e-05, |
|
"loss": 0.2621, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.4780388151174668, |
|
"grad_norm": 0.3852044641971588, |
|
"learning_rate": 1.925158765744924e-05, |
|
"loss": 0.283, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.4821246169560776, |
|
"grad_norm": 0.3884032368659973, |
|
"learning_rate": 1.923450807852367e-05, |
|
"loss": 0.2711, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.48621041879468846, |
|
"grad_norm": 0.4398249685764313, |
|
"learning_rate": 1.9217243556315445e-05, |
|
"loss": 0.2757, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.4902962206332993, |
|
"grad_norm": 0.36689624190330505, |
|
"learning_rate": 1.9199794436588244e-05, |
|
"loss": 0.2669, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4943820224719101, |
|
"grad_norm": 0.46398666501045227, |
|
"learning_rate": 1.9182161068802742e-05, |
|
"loss": 0.2683, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.49846782431052095, |
|
"grad_norm": 0.40020987391471863, |
|
"learning_rate": 1.916434380610963e-05, |
|
"loss": 0.2927, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.5025536261491318, |
|
"grad_norm": 0.4032459259033203, |
|
"learning_rate": 1.9146343005342546e-05, |
|
"loss": 0.31, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.5066394279877426, |
|
"grad_norm": 0.44166550040245056, |
|
"learning_rate": 1.912815902701091e-05, |
|
"loss": 0.2842, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.5107252298263534, |
|
"grad_norm": 0.39895153045654297, |
|
"learning_rate": 1.9109792235292715e-05, |
|
"loss": 0.2766, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5148110316649642, |
|
"grad_norm": 0.3415013253688812, |
|
"learning_rate": 1.909124299802724e-05, |
|
"loss": 0.2761, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.5188968335035751, |
|
"grad_norm": 0.3837663531303406, |
|
"learning_rate": 1.9072511686707663e-05, |
|
"loss": 0.2797, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.5229826353421859, |
|
"grad_norm": 0.4030819833278656, |
|
"learning_rate": 1.9053598676473656e-05, |
|
"loss": 0.2932, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.5270684371807968, |
|
"grad_norm": 0.40120938420295715, |
|
"learning_rate": 1.9034504346103825e-05, |
|
"loss": 0.2698, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.5311542390194075, |
|
"grad_norm": 0.3621327579021454, |
|
"learning_rate": 1.9015229078008163e-05, |
|
"loss": 0.298, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5352400408580184, |
|
"grad_norm": 0.33476150035858154, |
|
"learning_rate": 1.8995773258220374e-05, |
|
"loss": 0.2612, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.5393258426966292, |
|
"grad_norm": 0.3523140549659729, |
|
"learning_rate": 1.8976137276390145e-05, |
|
"loss": 0.2671, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.54341164453524, |
|
"grad_norm": 0.3624558746814728, |
|
"learning_rate": 1.8956321525775337e-05, |
|
"loss": 0.2687, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.5474974463738509, |
|
"grad_norm": 0.35892072319984436, |
|
"learning_rate": 1.8936326403234125e-05, |
|
"loss": 0.2755, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5515832482124617, |
|
"grad_norm": 0.3678256869316101, |
|
"learning_rate": 1.891615230921703e-05, |
|
"loss": 0.278, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5556690500510725, |
|
"grad_norm": 0.38125160336494446, |
|
"learning_rate": 1.8895799647758912e-05, |
|
"loss": 0.2765, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.5597548518896833, |
|
"grad_norm": 0.40152257680892944, |
|
"learning_rate": 1.8875268826470875e-05, |
|
"loss": 0.3239, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5638406537282942, |
|
"grad_norm": 0.3935178816318512, |
|
"learning_rate": 1.8854560256532098e-05, |
|
"loss": 0.2956, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.567926455566905, |
|
"grad_norm": 0.4389478266239166, |
|
"learning_rate": 1.8833674352681613e-05, |
|
"loss": 0.2968, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.5720122574055159, |
|
"grad_norm": 0.3884355127811432, |
|
"learning_rate": 1.881261153320999e-05, |
|
"loss": 0.3074, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5760980592441267, |
|
"grad_norm": 0.4054373502731323, |
|
"learning_rate": 1.879137221995095e-05, |
|
"loss": 0.2996, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.5801838610827375, |
|
"grad_norm": 0.4423893690109253, |
|
"learning_rate": 1.8769956838272937e-05, |
|
"loss": 0.3082, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.5842696629213483, |
|
"grad_norm": 0.42978307604789734, |
|
"learning_rate": 1.8748365817070586e-05, |
|
"loss": 0.2878, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.5883554647599591, |
|
"grad_norm": 0.38182228803634644, |
|
"learning_rate": 1.8726599588756144e-05, |
|
"loss": 0.2649, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.59244126659857, |
|
"grad_norm": 0.43477413058280945, |
|
"learning_rate": 1.8704658589250795e-05, |
|
"loss": 0.271, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5965270684371808, |
|
"grad_norm": 0.3876926898956299, |
|
"learning_rate": 1.868254325797594e-05, |
|
"loss": 0.2804, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.6006128702757916, |
|
"grad_norm": 0.39310601353645325, |
|
"learning_rate": 1.866025403784439e-05, |
|
"loss": 0.2767, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.6046986721144024, |
|
"grad_norm": 0.421290785074234, |
|
"learning_rate": 1.8637791375251505e-05, |
|
"loss": 0.2668, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.6087844739530133, |
|
"grad_norm": 0.450023353099823, |
|
"learning_rate": 1.8615155720066247e-05, |
|
"loss": 0.2888, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.6128702757916241, |
|
"grad_norm": 0.3645341396331787, |
|
"learning_rate": 1.859234752562217e-05, |
|
"loss": 0.2828, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.616956077630235, |
|
"grad_norm": 0.41853606700897217, |
|
"learning_rate": 1.8569367248708343e-05, |
|
"loss": 0.284, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.6210418794688458, |
|
"grad_norm": 0.3675737679004669, |
|
"learning_rate": 1.8546215349560204e-05, |
|
"loss": 0.2933, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.6251276813074566, |
|
"grad_norm": 0.3668256998062134, |
|
"learning_rate": 1.8522892291850335e-05, |
|
"loss": 0.2729, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.6292134831460674, |
|
"grad_norm": 0.34576019644737244, |
|
"learning_rate": 1.849939854267919e-05, |
|
"loss": 0.2612, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.6332992849846782, |
|
"grad_norm": 0.41370126605033875, |
|
"learning_rate": 1.847573457256571e-05, |
|
"loss": 0.2693, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.6373850868232891, |
|
"grad_norm": 0.4205566644668579, |
|
"learning_rate": 1.845190085543795e-05, |
|
"loss": 0.2746, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.6414708886618999, |
|
"grad_norm": 0.3997614085674286, |
|
"learning_rate": 1.8427897868623535e-05, |
|
"loss": 0.2813, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.6455566905005107, |
|
"grad_norm": 0.41005200147628784, |
|
"learning_rate": 1.840372609284013e-05, |
|
"loss": 0.2647, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.6496424923391215, |
|
"grad_norm": 0.4547550678253174, |
|
"learning_rate": 1.8379386012185813e-05, |
|
"loss": 0.2791, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.6537282941777324, |
|
"grad_norm": 0.4075047969818115, |
|
"learning_rate": 1.8354878114129368e-05, |
|
"loss": 0.2769, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.6578140960163432, |
|
"grad_norm": 0.37060046195983887, |
|
"learning_rate": 1.8330202889500518e-05, |
|
"loss": 0.3028, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.6618998978549541, |
|
"grad_norm": 0.35541340708732605, |
|
"learning_rate": 1.8305360832480118e-05, |
|
"loss": 0.2981, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.6659856996935649, |
|
"grad_norm": 0.3970625400543213, |
|
"learning_rate": 1.8280352440590236e-05, |
|
"loss": 0.2634, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.6700715015321757, |
|
"grad_norm": 0.4075865149497986, |
|
"learning_rate": 1.82551782146842e-05, |
|
"loss": 0.3027, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.6700715015321757, |
|
"eval_loss": 0.291363924741745, |
|
"eval_runtime": 5.7936, |
|
"eval_samples_per_second": 13.636, |
|
"eval_steps_per_second": 1.726, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.6741573033707865, |
|
"grad_norm": 0.34390076994895935, |
|
"learning_rate": 1.8229838658936566e-05, |
|
"loss": 0.2536, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6782431052093973, |
|
"grad_norm": 0.3729197084903717, |
|
"learning_rate": 1.8204334280833005e-05, |
|
"loss": 0.2739, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.6823289070480082, |
|
"grad_norm": 0.3974601924419403, |
|
"learning_rate": 1.817866559116017e-05, |
|
"loss": 0.2858, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.686414708886619, |
|
"grad_norm": 0.3424644470214844, |
|
"learning_rate": 1.8152833103995443e-05, |
|
"loss": 0.2305, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.6905005107252298, |
|
"grad_norm": 0.4293709397315979, |
|
"learning_rate": 1.8126837336696645e-05, |
|
"loss": 0.3179, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.6945863125638406, |
|
"grad_norm": 0.3259459435939789, |
|
"learning_rate": 1.8100678809891668e-05, |
|
"loss": 0.2589, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6986721144024515, |
|
"grad_norm": 0.40771302580833435, |
|
"learning_rate": 1.807435804746807e-05, |
|
"loss": 0.2637, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.7027579162410623, |
|
"grad_norm": 0.3847212493419647, |
|
"learning_rate": 1.8047875576562556e-05, |
|
"loss": 0.2782, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.7068437180796732, |
|
"grad_norm": 0.35547974705696106, |
|
"learning_rate": 1.802123192755044e-05, |
|
"loss": 0.2695, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.710929519918284, |
|
"grad_norm": 0.3954298198223114, |
|
"learning_rate": 1.7994427634035016e-05, |
|
"loss": 0.3005, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.7150153217568948, |
|
"grad_norm": 0.3506409525871277, |
|
"learning_rate": 1.796746323283686e-05, |
|
"loss": 0.2716, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7191011235955056, |
|
"grad_norm": 0.42227277159690857, |
|
"learning_rate": 1.7940339263983112e-05, |
|
"loss": 0.2915, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.7231869254341164, |
|
"grad_norm": 0.3948259949684143, |
|
"learning_rate": 1.791305627069662e-05, |
|
"loss": 0.2883, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.3580792248249054, |
|
"learning_rate": 1.7885614799385086e-05, |
|
"loss": 0.2782, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.7313585291113381, |
|
"grad_norm": 0.39698660373687744, |
|
"learning_rate": 1.785801539963012e-05, |
|
"loss": 0.2657, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.7354443309499489, |
|
"grad_norm": 0.3663792610168457, |
|
"learning_rate": 1.7830258624176224e-05, |
|
"loss": 0.2686, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.7395301327885597, |
|
"grad_norm": 0.38216930627822876, |
|
"learning_rate": 1.7802345028919728e-05, |
|
"loss": 0.2706, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.7436159346271706, |
|
"grad_norm": 0.4187450706958771, |
|
"learning_rate": 1.777427517289766e-05, |
|
"loss": 0.2573, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.7477017364657814, |
|
"grad_norm": 0.34619036316871643, |
|
"learning_rate": 1.7746049618276545e-05, |
|
"loss": 0.269, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.7517875383043923, |
|
"grad_norm": 0.35370582342147827, |
|
"learning_rate": 1.7717668930341152e-05, |
|
"loss": 0.2552, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.7558733401430031, |
|
"grad_norm": 0.4264880418777466, |
|
"learning_rate": 1.768913367748316e-05, |
|
"loss": 0.2952, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.7599591419816139, |
|
"grad_norm": 0.39135676622390747, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 0.2661, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.7640449438202247, |
|
"grad_norm": 0.39061596989631653, |
|
"learning_rate": 1.7631601766032337e-05, |
|
"loss": 0.2737, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.7681307456588355, |
|
"grad_norm": 0.3799816966056824, |
|
"learning_rate": 1.7602606259654704e-05, |
|
"loss": 0.2767, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.7722165474974464, |
|
"grad_norm": 0.3592148721218109, |
|
"learning_rate": 1.7573458492761802e-05, |
|
"loss": 0.2448, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.7763023493360572, |
|
"grad_norm": 0.39084604382514954, |
|
"learning_rate": 1.7544159049107902e-05, |
|
"loss": 0.275, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.780388151174668, |
|
"grad_norm": 0.36443451046943665, |
|
"learning_rate": 1.7514708515485002e-05, |
|
"loss": 0.2645, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.7844739530132788, |
|
"grad_norm": 0.4001200497150421, |
|
"learning_rate": 1.7485107481711014e-05, |
|
"loss": 0.2724, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.7885597548518897, |
|
"grad_norm": 0.39093396067619324, |
|
"learning_rate": 1.7455356540617988e-05, |
|
"loss": 0.2712, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.7926455566905005, |
|
"grad_norm": 0.3430577218532562, |
|
"learning_rate": 1.7425456288040236e-05, |
|
"loss": 0.2489, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.7967313585291114, |
|
"grad_norm": 0.3573733866214752, |
|
"learning_rate": 1.7395407322802374e-05, |
|
"loss": 0.2696, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.8008171603677222, |
|
"grad_norm": 0.38158077001571655, |
|
"learning_rate": 1.736521024670737e-05, |
|
"loss": 0.2814, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.804902962206333, |
|
"grad_norm": 0.366470068693161, |
|
"learning_rate": 1.733486566452446e-05, |
|
"loss": 0.2529, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.8089887640449438, |
|
"grad_norm": 0.3718278408050537, |
|
"learning_rate": 1.7304374183977032e-05, |
|
"loss": 0.2747, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.8130745658835546, |
|
"grad_norm": 0.3395809233188629, |
|
"learning_rate": 1.7273736415730488e-05, |
|
"loss": 0.2693, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.8171603677221655, |
|
"grad_norm": 0.307731032371521, |
|
"learning_rate": 1.7242952973379983e-05, |
|
"loss": 0.2081, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.8212461695607763, |
|
"grad_norm": 0.3522433936595917, |
|
"learning_rate": 1.7212024473438145e-05, |
|
"loss": 0.2495, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.8253319713993871, |
|
"grad_norm": 0.35946980118751526, |
|
"learning_rate": 1.7180951535322742e-05, |
|
"loss": 0.2702, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.8294177732379979, |
|
"grad_norm": 0.3933047950267792, |
|
"learning_rate": 1.7149734781344247e-05, |
|
"loss": 0.2629, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.8335035750766088, |
|
"grad_norm": 0.3658384084701538, |
|
"learning_rate": 1.7118374836693407e-05, |
|
"loss": 0.2538, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.8375893769152196, |
|
"grad_norm": 0.3532220423221588, |
|
"learning_rate": 1.7086872329428702e-05, |
|
"loss": 0.2587, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.8416751787538305, |
|
"grad_norm": 0.3619686961174011, |
|
"learning_rate": 1.705522789046377e-05, |
|
"loss": 0.2658, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.8457609805924413, |
|
"grad_norm": 0.4083801209926605, |
|
"learning_rate": 1.7023442153554776e-05, |
|
"loss": 0.2614, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.849846782431052, |
|
"grad_norm": 0.3868924081325531, |
|
"learning_rate": 1.6991515755287715e-05, |
|
"loss": 0.2831, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.8539325842696629, |
|
"grad_norm": 0.38413897156715393, |
|
"learning_rate": 1.695944933506567e-05, |
|
"loss": 0.2596, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.8580183861082737, |
|
"grad_norm": 0.34999531507492065, |
|
"learning_rate": 1.6927243535095995e-05, |
|
"loss": 0.2842, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.8621041879468846, |
|
"grad_norm": 0.328204482793808, |
|
"learning_rate": 1.6894899000377462e-05, |
|
"loss": 0.2332, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.8661899897854954, |
|
"grad_norm": 0.3802552819252014, |
|
"learning_rate": 1.686241637868734e-05, |
|
"loss": 0.2709, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.8702757916241062, |
|
"grad_norm": 0.35758858919143677, |
|
"learning_rate": 1.6829796320568416e-05, |
|
"loss": 0.279, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.874361593462717, |
|
"grad_norm": 0.3561984896659851, |
|
"learning_rate": 1.6797039479315994e-05, |
|
"loss": 0.2868, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.8784473953013279, |
|
"grad_norm": 0.32591065764427185, |
|
"learning_rate": 1.6764146510964762e-05, |
|
"loss": 0.2485, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8825331971399387, |
|
"grad_norm": 0.36409640312194824, |
|
"learning_rate": 1.67311180742757e-05, |
|
"loss": 0.2577, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.8866189989785496, |
|
"grad_norm": 0.34685492515563965, |
|
"learning_rate": 1.669795483072287e-05, |
|
"loss": 0.247, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.8907048008171604, |
|
"grad_norm": 0.3445712625980377, |
|
"learning_rate": 1.6664657444480145e-05, |
|
"loss": 0.2565, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.8947906026557712, |
|
"grad_norm": 0.34710460901260376, |
|
"learning_rate": 1.6631226582407954e-05, |
|
"loss": 0.2363, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.898876404494382, |
|
"grad_norm": 0.33726766705513, |
|
"learning_rate": 1.6597662914039885e-05, |
|
"loss": 0.2483, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9029622063329928, |
|
"grad_norm": 0.34024032950401306, |
|
"learning_rate": 1.65639671115693e-05, |
|
"loss": 0.2474, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.9070480081716037, |
|
"grad_norm": 0.38807395100593567, |
|
"learning_rate": 1.653013984983585e-05, |
|
"loss": 0.2726, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.9111338100102145, |
|
"grad_norm": 0.36375290155410767, |
|
"learning_rate": 1.6496181806312005e-05, |
|
"loss": 0.2726, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.9152196118488254, |
|
"grad_norm": 0.36927178502082825, |
|
"learning_rate": 1.6462093661089432e-05, |
|
"loss": 0.2518, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.9193054136874361, |
|
"grad_norm": 0.3809269070625305, |
|
"learning_rate": 1.6427876096865394e-05, |
|
"loss": 0.2449, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.923391215526047, |
|
"grad_norm": 0.34634968638420105, |
|
"learning_rate": 1.6393529798929103e-05, |
|
"loss": 0.2575, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.9274770173646578, |
|
"grad_norm": 0.33054831624031067, |
|
"learning_rate": 1.635905545514795e-05, |
|
"loss": 0.2639, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.9315628192032687, |
|
"grad_norm": 0.35482174158096313, |
|
"learning_rate": 1.6324453755953772e-05, |
|
"loss": 0.2667, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.9356486210418795, |
|
"grad_norm": 0.3657509684562683, |
|
"learning_rate": 1.6289725394328998e-05, |
|
"loss": 0.255, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.9397344228804902, |
|
"grad_norm": 0.3343275785446167, |
|
"learning_rate": 1.6254871065792776e-05, |
|
"loss": 0.2336, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.9438202247191011, |
|
"grad_norm": 0.3493170142173767, |
|
"learning_rate": 1.621989146838704e-05, |
|
"loss": 0.2649, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.947906026557712, |
|
"grad_norm": 0.3305867612361908, |
|
"learning_rate": 1.618478730266255e-05, |
|
"loss": 0.2767, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.9519918283963228, |
|
"grad_norm": 0.35817259550094604, |
|
"learning_rate": 1.6149559271664835e-05, |
|
"loss": 0.2817, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.9560776302349336, |
|
"grad_norm": 0.37733370065689087, |
|
"learning_rate": 1.6114208080920125e-05, |
|
"loss": 0.2809, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.9601634320735445, |
|
"grad_norm": 0.3227766156196594, |
|
"learning_rate": 1.607873443842122e-05, |
|
"loss": 0.2545, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.9642492339121552, |
|
"grad_norm": 0.3445710241794586, |
|
"learning_rate": 1.6043139054613326e-05, |
|
"loss": 0.2476, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.9683350357507661, |
|
"grad_norm": 0.3375508785247803, |
|
"learning_rate": 1.600742264237979e-05, |
|
"loss": 0.2502, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.9724208375893769, |
|
"grad_norm": 0.356039434671402, |
|
"learning_rate": 1.5971585917027864e-05, |
|
"loss": 0.268, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.9765066394279878, |
|
"grad_norm": 0.34852373600006104, |
|
"learning_rate": 1.5935629596274345e-05, |
|
"loss": 0.2605, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.9805924412665986, |
|
"grad_norm": 0.3376101851463318, |
|
"learning_rate": 1.5899554400231233e-05, |
|
"loss": 0.2567, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.9846782431052093, |
|
"grad_norm": 0.32361170649528503, |
|
"learning_rate": 1.586336105139127e-05, |
|
"loss": 0.2481, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.9887640449438202, |
|
"grad_norm": 0.35558903217315674, |
|
"learning_rate": 1.5827050274613512e-05, |
|
"loss": 0.2514, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.992849846782431, |
|
"grad_norm": 0.31636619567871094, |
|
"learning_rate": 1.579062279710879e-05, |
|
"loss": 0.2237, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.9969356486210419, |
|
"grad_norm": 0.3540779948234558, |
|
"learning_rate": 1.5754079348425137e-05, |
|
"loss": 0.2381, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.0040858018386107, |
|
"grad_norm": 0.7127255201339722, |
|
"learning_rate": 1.57174206604332e-05, |
|
"loss": 0.4477, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.0081716036772217, |
|
"grad_norm": 0.21768411993980408, |
|
"learning_rate": 1.568064746731156e-05, |
|
"loss": 0.177, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.0081716036772217, |
|
"eval_loss": 0.2854033410549164, |
|
"eval_runtime": 5.5756, |
|
"eval_samples_per_second": 14.169, |
|
"eval_steps_per_second": 1.794, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.0122574055158324, |
|
"grad_norm": 0.24506381154060364, |
|
"learning_rate": 1.564376050553205e-05, |
|
"loss": 0.1647, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.0163432073544434, |
|
"grad_norm": 0.24179627001285553, |
|
"learning_rate": 1.560676051384499e-05, |
|
"loss": 0.1908, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.0204290091930541, |
|
"grad_norm": 0.2527990937232971, |
|
"learning_rate": 1.5569648233264395e-05, |
|
"loss": 0.1728, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.0245148110316649, |
|
"grad_norm": 0.28597134351730347, |
|
"learning_rate": 1.553242440705314e-05, |
|
"loss": 0.1854, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.0286006128702758, |
|
"grad_norm": 0.2613103985786438, |
|
"learning_rate": 1.5495089780708062e-05, |
|
"loss": 0.1762, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.0326864147088866, |
|
"grad_norm": 0.2806336581707001, |
|
"learning_rate": 1.5457645101945046e-05, |
|
"loss": 0.1801, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.0367722165474975, |
|
"grad_norm": 0.29933255910873413, |
|
"learning_rate": 1.5420091120684042e-05, |
|
"loss": 0.1869, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.0408580183861083, |
|
"grad_norm": 0.2678683400154114, |
|
"learning_rate": 1.538242858903404e-05, |
|
"loss": 0.1684, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.0449438202247192, |
|
"grad_norm": 0.27515852451324463, |
|
"learning_rate": 1.5344658261278013e-05, |
|
"loss": 0.1859, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.04902962206333, |
|
"grad_norm": 0.2876634895801544, |
|
"learning_rate": 1.530678089385782e-05, |
|
"loss": 0.1705, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.0531154239019407, |
|
"grad_norm": 0.2911262810230255, |
|
"learning_rate": 1.5268797245359035e-05, |
|
"loss": 0.1937, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.0572012257405516, |
|
"grad_norm": 0.3048553466796875, |
|
"learning_rate": 1.5230708076495777e-05, |
|
"loss": 0.1882, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.0612870275791624, |
|
"grad_norm": 0.28508955240249634, |
|
"learning_rate": 1.519251415009546e-05, |
|
"loss": 0.1767, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.0653728294177733, |
|
"grad_norm": 0.266313374042511, |
|
"learning_rate": 1.5154216231083522e-05, |
|
"loss": 0.1647, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.069458631256384, |
|
"grad_norm": 0.2724918723106384, |
|
"learning_rate": 1.5115815086468103e-05, |
|
"loss": 0.1685, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.0735444330949948, |
|
"grad_norm": 0.2324502021074295, |
|
"learning_rate": 1.507731148532468e-05, |
|
"loss": 0.1632, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.0776302349336058, |
|
"grad_norm": 0.26865899562835693, |
|
"learning_rate": 1.5038706198780673e-05, |
|
"loss": 0.1802, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.0817160367722165, |
|
"grad_norm": 0.29491883516311646, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.1803, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.0858018386108275, |
|
"grad_norm": 0.28987348079681396, |
|
"learning_rate": 1.496119366416759e-05, |
|
"loss": 0.1862, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.0898876404494382, |
|
"grad_norm": 0.27755048871040344, |
|
"learning_rate": 1.492228796847385e-05, |
|
"loss": 0.1741, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.093973442288049, |
|
"grad_norm": 0.2608552873134613, |
|
"learning_rate": 1.4883283692099114e-05, |
|
"loss": 0.1693, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.09805924412666, |
|
"grad_norm": 0.27284783124923706, |
|
"learning_rate": 1.4844181616198028e-05, |
|
"loss": 0.1878, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.1021450459652706, |
|
"grad_norm": 0.24481667578220367, |
|
"learning_rate": 1.4804982523883915e-05, |
|
"loss": 0.1589, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.1062308478038816, |
|
"grad_norm": 0.2996629774570465, |
|
"learning_rate": 1.4765687200213079e-05, |
|
"loss": 0.1823, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.1103166496424923, |
|
"grad_norm": 0.2922385632991791, |
|
"learning_rate": 1.4726296432169095e-05, |
|
"loss": 0.1769, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.1144024514811033, |
|
"grad_norm": 0.3046974241733551, |
|
"learning_rate": 1.4686811008647037e-05, |
|
"loss": 0.1823, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.118488253319714, |
|
"grad_norm": 0.2792796790599823, |
|
"learning_rate": 1.4647231720437687e-05, |
|
"loss": 0.1717, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.1225740551583248, |
|
"grad_norm": 0.27251774072647095, |
|
"learning_rate": 1.4607559360211688e-05, |
|
"loss": 0.1652, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.1266598569969357, |
|
"grad_norm": 0.2751109302043915, |
|
"learning_rate": 1.456779472250368e-05, |
|
"loss": 0.1713, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.1307456588355465, |
|
"grad_norm": 0.2737586796283722, |
|
"learning_rate": 1.4527938603696376e-05, |
|
"loss": 0.162, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.1348314606741572, |
|
"grad_norm": 0.24653682112693787, |
|
"learning_rate": 1.4487991802004625e-05, |
|
"loss": 0.1626, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.1389172625127681, |
|
"grad_norm": 0.46106576919555664, |
|
"learning_rate": 1.4447955117459414e-05, |
|
"loss": 0.1609, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.1430030643513789, |
|
"grad_norm": 0.27714091539382935, |
|
"learning_rate": 1.4407829351891858e-05, |
|
"loss": 0.1759, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.1470888661899898, |
|
"grad_norm": 0.2678029537200928, |
|
"learning_rate": 1.436761530891713e-05, |
|
"loss": 0.1753, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.1511746680286006, |
|
"grad_norm": 0.2559642791748047, |
|
"learning_rate": 1.4327313793918362e-05, |
|
"loss": 0.1778, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.1552604698672115, |
|
"grad_norm": 0.3033258616924286, |
|
"learning_rate": 1.4286925614030542e-05, |
|
"loss": 0.1871, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.1593462717058223, |
|
"grad_norm": 0.2658158540725708, |
|
"learning_rate": 1.4246451578124321e-05, |
|
"loss": 0.1782, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.163432073544433, |
|
"grad_norm": 0.2901168465614319, |
|
"learning_rate": 1.4205892496789816e-05, |
|
"loss": 0.174, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.167517875383044, |
|
"grad_norm": 0.23054322600364685, |
|
"learning_rate": 1.4165249182320401e-05, |
|
"loss": 0.1553, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.1716036772216547, |
|
"grad_norm": 0.267805278301239, |
|
"learning_rate": 1.4124522448696407e-05, |
|
"loss": 0.168, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.1756894790602657, |
|
"grad_norm": 0.26580214500427246, |
|
"learning_rate": 1.4083713111568841e-05, |
|
"loss": 0.167, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.1797752808988764, |
|
"grad_norm": 0.2736794948577881, |
|
"learning_rate": 1.404282198824305e-05, |
|
"loss": 0.1623, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.1838610827374871, |
|
"grad_norm": 0.25851017236709595, |
|
"learning_rate": 1.4001849897662337e-05, |
|
"loss": 0.1646, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.187946884576098, |
|
"grad_norm": 0.26858997344970703, |
|
"learning_rate": 1.396079766039157e-05, |
|
"loss": 0.1768, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.1920326864147088, |
|
"grad_norm": 0.2878361940383911, |
|
"learning_rate": 1.3919666098600753e-05, |
|
"loss": 0.1712, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.1961184882533198, |
|
"grad_norm": 0.23014627397060394, |
|
"learning_rate": 1.387845603604855e-05, |
|
"loss": 0.1595, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.2002042900919305, |
|
"grad_norm": 0.27550917863845825, |
|
"learning_rate": 1.3837168298065798e-05, |
|
"loss": 0.1639, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.2042900919305413, |
|
"grad_norm": 0.2697204053401947, |
|
"learning_rate": 1.3795803711538966e-05, |
|
"loss": 0.1619, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.2083758937691522, |
|
"grad_norm": 0.29666051268577576, |
|
"learning_rate": 1.37543631048936e-05, |
|
"loss": 0.1815, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.212461695607763, |
|
"grad_norm": 0.25596365332603455, |
|
"learning_rate": 1.3712847308077737e-05, |
|
"loss": 0.1629, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.216547497446374, |
|
"grad_norm": 0.25550931692123413, |
|
"learning_rate": 1.3671257152545277e-05, |
|
"loss": 0.1635, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.2206332992849847, |
|
"grad_norm": 0.2615107297897339, |
|
"learning_rate": 1.3629593471239328e-05, |
|
"loss": 0.1547, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.2247191011235956, |
|
"grad_norm": 0.2814185917377472, |
|
"learning_rate": 1.3587857098575534e-05, |
|
"loss": 0.1713, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.2288049029622063, |
|
"grad_norm": 0.2644117772579193, |
|
"learning_rate": 1.3546048870425356e-05, |
|
"loss": 0.1703, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.232890704800817, |
|
"grad_norm": 0.2645355463027954, |
|
"learning_rate": 1.350416962409934e-05, |
|
"loss": 0.159, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.236976506639428, |
|
"grad_norm": 0.2637065351009369, |
|
"learning_rate": 1.346222019833033e-05, |
|
"loss": 0.1647, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.2410623084780388, |
|
"grad_norm": 0.24007368087768555, |
|
"learning_rate": 1.342020143325669e-05, |
|
"loss": 0.1569, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.2451481103166497, |
|
"grad_norm": 0.2273741364479065, |
|
"learning_rate": 1.3378114170405473e-05, |
|
"loss": 0.1645, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.2492339121552605, |
|
"grad_norm": 0.2602927088737488, |
|
"learning_rate": 1.3335959252675566e-05, |
|
"loss": 0.1723, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.2533197139938714, |
|
"grad_norm": 0.28329333662986755, |
|
"learning_rate": 1.3293737524320798e-05, |
|
"loss": 0.1704, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.2574055158324822, |
|
"grad_norm": 0.270916610956192, |
|
"learning_rate": 1.3251449830933052e-05, |
|
"loss": 0.1621, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.261491317671093, |
|
"grad_norm": 0.268443763256073, |
|
"learning_rate": 1.3209097019425317e-05, |
|
"loss": 0.177, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.2655771195097039, |
|
"grad_norm": 0.2811964750289917, |
|
"learning_rate": 1.3166679938014728e-05, |
|
"loss": 0.1581, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.2696629213483146, |
|
"grad_norm": 0.2809509038925171, |
|
"learning_rate": 1.3124199436205575e-05, |
|
"loss": 0.1625, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.2737487231869253, |
|
"grad_norm": 0.27429160475730896, |
|
"learning_rate": 1.3081656364772308e-05, |
|
"loss": 0.1796, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.2778345250255363, |
|
"grad_norm": 0.2557787299156189, |
|
"learning_rate": 1.303905157574247e-05, |
|
"loss": 0.1664, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.281920326864147, |
|
"grad_norm": 0.3070502281188965, |
|
"learning_rate": 1.2996385922379657e-05, |
|
"loss": 0.1884, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.286006128702758, |
|
"grad_norm": 0.2685239315032959, |
|
"learning_rate": 1.2953660259166413e-05, |
|
"loss": 0.1728, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.2900919305413687, |
|
"grad_norm": 0.2761296331882477, |
|
"learning_rate": 1.291087544178713e-05, |
|
"loss": 0.1754, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.2941777323799797, |
|
"grad_norm": 0.29421859979629517, |
|
"learning_rate": 1.2868032327110904e-05, |
|
"loss": 0.1566, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.2982635342185904, |
|
"grad_norm": 0.2753983736038208, |
|
"learning_rate": 1.2825131773174371e-05, |
|
"loss": 0.1722, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.3023493360572012, |
|
"grad_norm": 0.280300498008728, |
|
"learning_rate": 1.2782174639164528e-05, |
|
"loss": 0.1743, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.3064351378958121, |
|
"grad_norm": 0.28724053502082825, |
|
"learning_rate": 1.2739161785401525e-05, |
|
"loss": 0.1727, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.3105209397344229, |
|
"grad_norm": 0.24978399276733398, |
|
"learning_rate": 1.269609407332144e-05, |
|
"loss": 0.1654, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.3146067415730336, |
|
"grad_norm": 0.2458401620388031, |
|
"learning_rate": 1.2652972365459008e-05, |
|
"loss": 0.1558, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.3186925434116445, |
|
"grad_norm": 0.29217007756233215, |
|
"learning_rate": 1.2609797525430374e-05, |
|
"loss": 0.1749, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.3227783452502553, |
|
"grad_norm": 0.2738885283470154, |
|
"learning_rate": 1.2566570417915769e-05, |
|
"loss": 0.1598, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.3268641470888662, |
|
"grad_norm": 0.23460422456264496, |
|
"learning_rate": 1.2523291908642219e-05, |
|
"loss": 0.1586, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.330949948927477, |
|
"grad_norm": 0.2899508476257324, |
|
"learning_rate": 1.2479962864366186e-05, |
|
"loss": 0.1698, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.335035750766088, |
|
"grad_norm": 0.2744244933128357, |
|
"learning_rate": 1.243658415285622e-05, |
|
"loss": 0.167, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.3391215526046987, |
|
"grad_norm": 0.3147677183151245, |
|
"learning_rate": 1.2393156642875579e-05, |
|
"loss": 0.1592, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.3432073544433094, |
|
"grad_norm": 0.26883426308631897, |
|
"learning_rate": 1.2349681204164823e-05, |
|
"loss": 0.1735, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.3432073544433094, |
|
"eval_loss": 0.2857210040092468, |
|
"eval_runtime": 5.8046, |
|
"eval_samples_per_second": 13.61, |
|
"eval_steps_per_second": 1.723, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.3472931562819204, |
|
"grad_norm": 0.26572638750076294, |
|
"learning_rate": 1.2306158707424402e-05, |
|
"loss": 0.172, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.351378958120531, |
|
"grad_norm": 0.3158324062824249, |
|
"learning_rate": 1.2262590024297226e-05, |
|
"loss": 0.184, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.355464759959142, |
|
"grad_norm": 0.2606561779975891, |
|
"learning_rate": 1.2218976027351177e-05, |
|
"loss": 0.1681, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.3595505617977528, |
|
"grad_norm": 0.2860865592956543, |
|
"learning_rate": 1.2175317590061676e-05, |
|
"loss": 0.1768, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.2928154766559601, |
|
"learning_rate": 1.2131615586794162e-05, |
|
"loss": 0.1654, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.3677221654749745, |
|
"grad_norm": 0.2754892110824585, |
|
"learning_rate": 1.2087870892786588e-05, |
|
"loss": 0.1679, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.3718079673135852, |
|
"grad_norm": 0.25418567657470703, |
|
"learning_rate": 1.2044084384131891e-05, |
|
"loss": 0.1692, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.3758937691521962, |
|
"grad_norm": 0.29680415987968445, |
|
"learning_rate": 1.2000256937760446e-05, |
|
"loss": 0.1835, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.379979570990807, |
|
"grad_norm": 0.25421565771102905, |
|
"learning_rate": 1.1956389431422508e-05, |
|
"loss": 0.1628, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.3840653728294177, |
|
"grad_norm": 0.26102015376091003, |
|
"learning_rate": 1.1912482743670624e-05, |
|
"loss": 0.1587, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.3881511746680286, |
|
"grad_norm": 0.2658519744873047, |
|
"learning_rate": 1.1868537753842052e-05, |
|
"loss": 0.1622, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.3922369765066394, |
|
"grad_norm": 0.25693395733833313, |
|
"learning_rate": 1.1824555342041129e-05, |
|
"loss": 0.1611, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.3963227783452503, |
|
"grad_norm": 0.24095548689365387, |
|
"learning_rate": 1.1780536389121668e-05, |
|
"loss": 0.1566, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.400408580183861, |
|
"grad_norm": 0.25440356135368347, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 0.1646, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.404494382022472, |
|
"grad_norm": 0.23900751769542694, |
|
"learning_rate": 1.1692392386983837e-05, |
|
"loss": 0.1567, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.4085801838610827, |
|
"grad_norm": 0.2516697645187378, |
|
"learning_rate": 1.1648269103061567e-05, |
|
"loss": 0.1693, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.4126659856996935, |
|
"grad_norm": 0.23285552859306335, |
|
"learning_rate": 1.1604112808577603e-05, |
|
"loss": 0.1565, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.4167517875383044, |
|
"grad_norm": 0.22535811364650726, |
|
"learning_rate": 1.155992438786818e-05, |
|
"loss": 0.1519, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.4208375893769152, |
|
"grad_norm": 0.2757152020931244, |
|
"learning_rate": 1.1515704725912926e-05, |
|
"loss": 0.1824, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.424923391215526, |
|
"grad_norm": 0.25517934560775757, |
|
"learning_rate": 1.1471454708317163e-05, |
|
"loss": 0.1524, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.4290091930541369, |
|
"grad_norm": 0.26882752776145935, |
|
"learning_rate": 1.1427175221294145e-05, |
|
"loss": 0.1653, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.4330949948927478, |
|
"grad_norm": 0.2248525470495224, |
|
"learning_rate": 1.1382867151647333e-05, |
|
"loss": 0.1458, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.4371807967313586, |
|
"grad_norm": 0.2648623585700989, |
|
"learning_rate": 1.1338531386752618e-05, |
|
"loss": 0.1663, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.4412665985699693, |
|
"grad_norm": 0.2239081859588623, |
|
"learning_rate": 1.1294168814540554e-05, |
|
"loss": 0.1488, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.4453524004085803, |
|
"grad_norm": 0.2529364824295044, |
|
"learning_rate": 1.1249780323478585e-05, |
|
"loss": 0.1633, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.449438202247191, |
|
"grad_norm": 0.22921797633171082, |
|
"learning_rate": 1.1205366802553231e-05, |
|
"loss": 0.1648, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.4535240040858017, |
|
"grad_norm": 0.29341360926628113, |
|
"learning_rate": 1.1160929141252303e-05, |
|
"loss": 0.1657, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.4576098059244127, |
|
"grad_norm": 0.2699342966079712, |
|
"learning_rate": 1.1116468229547079e-05, |
|
"loss": 0.1726, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.4616956077630234, |
|
"grad_norm": 0.22347010672092438, |
|
"learning_rate": 1.107198495787448e-05, |
|
"loss": 0.1549, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.4657814096016344, |
|
"grad_norm": 0.2765299677848816, |
|
"learning_rate": 1.1027480217119245e-05, |
|
"loss": 0.1567, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.4698672114402451, |
|
"grad_norm": 0.2796229422092438, |
|
"learning_rate": 1.0982954898596072e-05, |
|
"loss": 0.1673, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.473953013278856, |
|
"grad_norm": 0.2708180546760559, |
|
"learning_rate": 1.0938409894031793e-05, |
|
"loss": 0.1608, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.4780388151174668, |
|
"grad_norm": 0.26708030700683594, |
|
"learning_rate": 1.0893846095547493e-05, |
|
"loss": 0.1672, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.4821246169560776, |
|
"grad_norm": 0.25234729051589966, |
|
"learning_rate": 1.084926439564065e-05, |
|
"loss": 0.1695, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.4862104187946885, |
|
"grad_norm": 0.23701204359531403, |
|
"learning_rate": 1.0804665687167262e-05, |
|
"loss": 0.1478, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.4902962206332993, |
|
"grad_norm": 0.23572878539562225, |
|
"learning_rate": 1.0760050863323961e-05, |
|
"loss": 0.1518, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.49438202247191, |
|
"grad_norm": 0.26712414622306824, |
|
"learning_rate": 1.0715420817630137e-05, |
|
"loss": 0.1641, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.498467824310521, |
|
"grad_norm": 0.2618795931339264, |
|
"learning_rate": 1.0670776443910024e-05, |
|
"loss": 0.1584, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.502553626149132, |
|
"grad_norm": 0.24355687201023102, |
|
"learning_rate": 1.062611863627482e-05, |
|
"loss": 0.155, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.5066394279877426, |
|
"grad_norm": 0.28303593397140503, |
|
"learning_rate": 1.0581448289104759e-05, |
|
"loss": 0.1699, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.5107252298263534, |
|
"grad_norm": 0.2682429254055023, |
|
"learning_rate": 1.0536766297031216e-05, |
|
"loss": 0.1638, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.5148110316649643, |
|
"grad_norm": 0.2611052095890045, |
|
"learning_rate": 1.0492073554918782e-05, |
|
"loss": 0.162, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.518896833503575, |
|
"grad_norm": 0.2545654773712158, |
|
"learning_rate": 1.0447370957847343e-05, |
|
"loss": 0.171, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.5229826353421858, |
|
"grad_norm": 0.2540684640407562, |
|
"learning_rate": 1.0402659401094154e-05, |
|
"loss": 0.1609, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.5270684371807968, |
|
"grad_norm": 0.29473230242729187, |
|
"learning_rate": 1.0357939780115906e-05, |
|
"loss": 0.1739, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.5311542390194075, |
|
"grad_norm": 0.23088738322257996, |
|
"learning_rate": 1.0313212990530804e-05, |
|
"loss": 0.1396, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.5352400408580182, |
|
"grad_norm": 0.2865520119667053, |
|
"learning_rate": 1.0268479928100615e-05, |
|
"loss": 0.1587, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.5393258426966292, |
|
"grad_norm": 0.26724815368652344, |
|
"learning_rate": 1.0223741488712732e-05, |
|
"loss": 0.1643, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.5434116445352402, |
|
"grad_norm": 0.2568652033805847, |
|
"learning_rate": 1.0178998568362243e-05, |
|
"loss": 0.1502, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.547497446373851, |
|
"grad_norm": 0.25489166378974915, |
|
"learning_rate": 1.0134252063133976e-05, |
|
"loss": 0.1551, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.5515832482124616, |
|
"grad_norm": 0.2938600480556488, |
|
"learning_rate": 1.0089502869184549e-05, |
|
"loss": 0.1721, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.5556690500510726, |
|
"grad_norm": 0.2571638822555542, |
|
"learning_rate": 1.0044751882724436e-05, |
|
"loss": 0.1596, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.5597548518896833, |
|
"grad_norm": 0.2504737079143524, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1652, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.563840653728294, |
|
"grad_norm": 0.25643548369407654, |
|
"learning_rate": 9.955248117275566e-06, |
|
"loss": 0.1646, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.567926455566905, |
|
"grad_norm": 0.24690495431423187, |
|
"learning_rate": 9.910497130815454e-06, |
|
"loss": 0.1692, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.572012257405516, |
|
"grad_norm": 0.23503315448760986, |
|
"learning_rate": 9.865747936866027e-06, |
|
"loss": 0.1614, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.5760980592441267, |
|
"grad_norm": 0.2600212097167969, |
|
"learning_rate": 9.821001431637759e-06, |
|
"loss": 0.1843, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.5801838610827375, |
|
"grad_norm": 0.24049755930900574, |
|
"learning_rate": 9.776258511287271e-06, |
|
"loss": 0.1939, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.5842696629213484, |
|
"grad_norm": 0.26995447278022766, |
|
"learning_rate": 9.73152007189939e-06, |
|
"loss": 0.1608, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.5883554647599591, |
|
"grad_norm": 0.25705352425575256, |
|
"learning_rate": 9.6867870094692e-06, |
|
"loss": 0.1503, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.5924412665985699, |
|
"grad_norm": 0.2591187059879303, |
|
"learning_rate": 9.642060219884096e-06, |
|
"loss": 0.1601, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.5965270684371808, |
|
"grad_norm": 0.26638317108154297, |
|
"learning_rate": 9.597340598905851e-06, |
|
"loss": 0.1525, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.6006128702757916, |
|
"grad_norm": 0.27399975061416626, |
|
"learning_rate": 9.55262904215266e-06, |
|
"loss": 0.1571, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.6046986721144023, |
|
"grad_norm": 0.298513263463974, |
|
"learning_rate": 9.50792644508122e-06, |
|
"loss": 0.1734, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.6087844739530133, |
|
"grad_norm": 0.2932952344417572, |
|
"learning_rate": 9.463233702968784e-06, |
|
"loss": 0.1595, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.6128702757916242, |
|
"grad_norm": 0.2699350118637085, |
|
"learning_rate": 9.418551710895243e-06, |
|
"loss": 0.1513, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.616956077630235, |
|
"grad_norm": 0.2710689902305603, |
|
"learning_rate": 9.373881363725182e-06, |
|
"loss": 0.1558, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.6210418794688457, |
|
"grad_norm": 0.26967060565948486, |
|
"learning_rate": 9.329223556089976e-06, |
|
"loss": 0.1532, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.6251276813074567, |
|
"grad_norm": 0.26783767342567444, |
|
"learning_rate": 9.284579182369868e-06, |
|
"loss": 0.167, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.6292134831460674, |
|
"grad_norm": 0.2573103606700897, |
|
"learning_rate": 9.239949136676042e-06, |
|
"loss": 0.1675, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.6332992849846781, |
|
"grad_norm": 0.2554529905319214, |
|
"learning_rate": 9.195334312832742e-06, |
|
"loss": 0.1653, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.637385086823289, |
|
"grad_norm": 0.2697620391845703, |
|
"learning_rate": 9.15073560435935e-06, |
|
"loss": 0.1754, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.6414708886619, |
|
"grad_norm": 0.2908802032470703, |
|
"learning_rate": 9.10615390445251e-06, |
|
"loss": 0.1694, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.6455566905005106, |
|
"grad_norm": 0.28988802433013916, |
|
"learning_rate": 9.061590105968208e-06, |
|
"loss": 0.1596, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.6496424923391215, |
|
"grad_norm": 0.27670571208000183, |
|
"learning_rate": 9.01704510140393e-06, |
|
"loss": 0.1486, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.6537282941777325, |
|
"grad_norm": 0.29919058084487915, |
|
"learning_rate": 8.97251978288076e-06, |
|
"loss": 0.1668, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.6578140960163432, |
|
"grad_norm": 0.2605692446231842, |
|
"learning_rate": 8.928015042125523e-06, |
|
"loss": 0.1533, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.661899897854954, |
|
"grad_norm": 0.27188801765441895, |
|
"learning_rate": 8.883531770452924e-06, |
|
"loss": 0.1591, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.665985699693565, |
|
"grad_norm": 0.2607693374156952, |
|
"learning_rate": 8.839070858747697e-06, |
|
"loss": 0.1631, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.6700715015321757, |
|
"grad_norm": 0.26251208782196045, |
|
"learning_rate": 8.79463319744677e-06, |
|
"loss": 0.1669, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.6741573033707864, |
|
"grad_norm": 0.27655109763145447, |
|
"learning_rate": 8.750219676521417e-06, |
|
"loss": 0.1797, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.6782431052093973, |
|
"grad_norm": 0.2489909827709198, |
|
"learning_rate": 8.705831185459446e-06, |
|
"loss": 0.1684, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.6782431052093973, |
|
"eval_loss": 0.2804652154445648, |
|
"eval_runtime": 5.3248, |
|
"eval_samples_per_second": 14.836, |
|
"eval_steps_per_second": 1.878, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.6823289070480083, |
|
"grad_norm": 0.2541872560977936, |
|
"learning_rate": 8.661468613247387e-06, |
|
"loss": 0.1738, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.686414708886619, |
|
"grad_norm": 0.26432761549949646, |
|
"learning_rate": 8.617132848352672e-06, |
|
"loss": 0.1523, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.6905005107252298, |
|
"grad_norm": 0.24682320654392242, |
|
"learning_rate": 8.572824778705858e-06, |
|
"loss": 0.1685, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.6945863125638407, |
|
"grad_norm": 0.255575567483902, |
|
"learning_rate": 8.528545291682839e-06, |
|
"loss": 0.1603, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.6986721144024515, |
|
"grad_norm": 0.27255284786224365, |
|
"learning_rate": 8.484295274087077e-06, |
|
"loss": 0.1649, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.7027579162410622, |
|
"grad_norm": 0.2935710549354553, |
|
"learning_rate": 8.440075612131823e-06, |
|
"loss": 0.1824, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.7068437180796732, |
|
"grad_norm": 0.28145232796669006, |
|
"learning_rate": 8.395887191422397e-06, |
|
"loss": 0.1664, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.7109295199182841, |
|
"grad_norm": 0.2540966272354126, |
|
"learning_rate": 8.351730896938438e-06, |
|
"loss": 0.139, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.7150153217568946, |
|
"grad_norm": 0.2761797606945038, |
|
"learning_rate": 8.307607613016166e-06, |
|
"loss": 0.1468, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.7191011235955056, |
|
"grad_norm": 0.26004406809806824, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 0.1791, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.7231869254341166, |
|
"grad_norm": 0.26706498861312866, |
|
"learning_rate": 8.219463610878336e-06, |
|
"loss": 0.1767, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.7272727272727273, |
|
"grad_norm": 0.25433361530303955, |
|
"learning_rate": 8.175444657958875e-06, |
|
"loss": 0.1641, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.731358529111338, |
|
"grad_norm": 0.28011849522590637, |
|
"learning_rate": 8.131462246157953e-06, |
|
"loss": 0.1667, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.735444330949949, |
|
"grad_norm": 0.24411511421203613, |
|
"learning_rate": 8.087517256329376e-06, |
|
"loss": 0.1484, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.7395301327885597, |
|
"grad_norm": 0.2515384554862976, |
|
"learning_rate": 8.043610568577497e-06, |
|
"loss": 0.149, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.7436159346271705, |
|
"grad_norm": 0.28085580468177795, |
|
"learning_rate": 7.999743062239557e-06, |
|
"loss": 0.1758, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.7477017364657814, |
|
"grad_norm": 0.2542356848716736, |
|
"learning_rate": 7.95591561586811e-06, |
|
"loss": 0.1526, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.7517875383043924, |
|
"grad_norm": 0.2624610960483551, |
|
"learning_rate": 7.912129107213417e-06, |
|
"loss": 0.1669, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.7558733401430031, |
|
"grad_norm": 0.2531009316444397, |
|
"learning_rate": 7.868384413205842e-06, |
|
"loss": 0.1728, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.7599591419816139, |
|
"grad_norm": 0.26832813024520874, |
|
"learning_rate": 7.824682409938328e-06, |
|
"loss": 0.1689, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.7640449438202248, |
|
"grad_norm": 0.26647037267684937, |
|
"learning_rate": 7.781023972648826e-06, |
|
"loss": 0.1566, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.7681307456588355, |
|
"grad_norm": 0.2441844940185547, |
|
"learning_rate": 7.73740997570278e-06, |
|
"loss": 0.1475, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.7722165474974463, |
|
"grad_norm": 0.26222023367881775, |
|
"learning_rate": 7.6938412925756e-06, |
|
"loss": 0.1627, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.7763023493360572, |
|
"grad_norm": 0.27849847078323364, |
|
"learning_rate": 7.650318795835179e-06, |
|
"loss": 0.1692, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.780388151174668, |
|
"grad_norm": 0.23362480103969574, |
|
"learning_rate": 7.606843357124426e-06, |
|
"loss": 0.1486, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.7844739530132787, |
|
"grad_norm": 0.25098103284835815, |
|
"learning_rate": 7.563415847143782e-06, |
|
"loss": 0.1586, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.7885597548518897, |
|
"grad_norm": 0.2666711211204529, |
|
"learning_rate": 7.520037135633817e-06, |
|
"loss": 0.1631, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.7926455566905006, |
|
"grad_norm": 0.25154757499694824, |
|
"learning_rate": 7.476708091357783e-06, |
|
"loss": 0.1496, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.7967313585291114, |
|
"grad_norm": 0.2870493233203888, |
|
"learning_rate": 7.433429582084233e-06, |
|
"loss": 0.1718, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.800817160367722, |
|
"grad_norm": 0.2450946867465973, |
|
"learning_rate": 7.39020247456963e-06, |
|
"loss": 0.1551, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.804902962206333, |
|
"grad_norm": 0.2701391577720642, |
|
"learning_rate": 7.347027634540993e-06, |
|
"loss": 0.1611, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.8089887640449438, |
|
"grad_norm": 0.25652557611465454, |
|
"learning_rate": 7.303905926678565e-06, |
|
"loss": 0.1571, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.8130745658835545, |
|
"grad_norm": 0.24130114912986755, |
|
"learning_rate": 7.260838214598475e-06, |
|
"loss": 0.1525, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.8171603677221655, |
|
"grad_norm": 0.2391010969877243, |
|
"learning_rate": 7.217825360835475e-06, |
|
"loss": 0.1478, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.8212461695607765, |
|
"grad_norm": 0.24808183312416077, |
|
"learning_rate": 7.174868226825631e-06, |
|
"loss": 0.1449, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.825331971399387, |
|
"grad_norm": 0.24367845058441162, |
|
"learning_rate": 7.131967672889101e-06, |
|
"loss": 0.1527, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.829417773237998, |
|
"grad_norm": 0.24614740908145905, |
|
"learning_rate": 7.089124558212872e-06, |
|
"loss": 0.1473, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.8335035750766089, |
|
"grad_norm": 0.23732498288154602, |
|
"learning_rate": 7.04633974083359e-06, |
|
"loss": 0.1676, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.8375893769152196, |
|
"grad_norm": 0.26191797852516174, |
|
"learning_rate": 7.003614077620348e-06, |
|
"loss": 0.1625, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.8416751787538304, |
|
"grad_norm": 0.22175060212612152, |
|
"learning_rate": 6.960948424257532e-06, |
|
"loss": 0.1417, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.8457609805924413, |
|
"grad_norm": 0.2599637806415558, |
|
"learning_rate": 6.918343635227694e-06, |
|
"loss": 0.1542, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.849846782431052, |
|
"grad_norm": 0.2902531325817108, |
|
"learning_rate": 6.8758005637944245e-06, |
|
"loss": 0.1673, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.8539325842696628, |
|
"grad_norm": 0.26200827956199646, |
|
"learning_rate": 6.833320061985278e-06, |
|
"loss": 0.1507, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.8580183861082737, |
|
"grad_norm": 0.22496499121189117, |
|
"learning_rate": 6.7909029805746855e-06, |
|
"loss": 0.1563, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.8621041879468847, |
|
"grad_norm": 0.26499348878860474, |
|
"learning_rate": 6.7485501690669495e-06, |
|
"loss": 0.1588, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.8661899897854954, |
|
"grad_norm": 0.21678292751312256, |
|
"learning_rate": 6.706262475679205e-06, |
|
"loss": 0.1446, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.8702757916241062, |
|
"grad_norm": 0.249608114361763, |
|
"learning_rate": 6.664040747324437e-06, |
|
"loss": 0.1574, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.8743615934627171, |
|
"grad_norm": 0.27170929312705994, |
|
"learning_rate": 6.62188582959453e-06, |
|
"loss": 0.1714, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.8784473953013279, |
|
"grad_norm": 0.26091060042381287, |
|
"learning_rate": 6.579798566743314e-06, |
|
"loss": 0.153, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.8825331971399386, |
|
"grad_norm": 0.2784002125263214, |
|
"learning_rate": 6.537779801669677e-06, |
|
"loss": 0.1594, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.8866189989785496, |
|
"grad_norm": 0.2827843427658081, |
|
"learning_rate": 6.495830375900665e-06, |
|
"loss": 0.1713, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.8907048008171605, |
|
"grad_norm": 0.24465838074684143, |
|
"learning_rate": 6.453951129574644e-06, |
|
"loss": 0.1398, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.894790602655771, |
|
"grad_norm": 0.24695105850696564, |
|
"learning_rate": 6.41214290142447e-06, |
|
"loss": 0.1569, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.898876404494382, |
|
"grad_norm": 0.23522843420505524, |
|
"learning_rate": 6.370406528760675e-06, |
|
"loss": 0.1572, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.902962206332993, |
|
"grad_norm": 0.28958627581596375, |
|
"learning_rate": 6.3287428474547256e-06, |
|
"loss": 0.1576, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.9070480081716037, |
|
"grad_norm": 0.22417336702346802, |
|
"learning_rate": 6.287152691922264e-06, |
|
"loss": 0.151, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.9111338100102144, |
|
"grad_norm": 0.24010370671749115, |
|
"learning_rate": 6.245636895106403e-06, |
|
"loss": 0.1422, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.9152196118488254, |
|
"grad_norm": 0.257285475730896, |
|
"learning_rate": 6.204196288461037e-06, |
|
"loss": 0.1541, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.9193054136874361, |
|
"grad_norm": 0.2468208223581314, |
|
"learning_rate": 6.162831701934203e-06, |
|
"loss": 0.1618, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.9233912155260469, |
|
"grad_norm": 0.2693644165992737, |
|
"learning_rate": 6.121543963951453e-06, |
|
"loss": 0.1597, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.9274770173646578, |
|
"grad_norm": 0.22864265739917755, |
|
"learning_rate": 6.080333901399252e-06, |
|
"loss": 0.1447, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.9315628192032688, |
|
"grad_norm": 0.2744729518890381, |
|
"learning_rate": 6.039202339608432e-06, |
|
"loss": 0.1649, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.9356486210418795, |
|
"grad_norm": 0.2626800537109375, |
|
"learning_rate": 5.998150102337665e-06, |
|
"loss": 0.1465, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.9397344228804902, |
|
"grad_norm": 0.24998779594898224, |
|
"learning_rate": 5.957178011756952e-06, |
|
"loss": 0.1314, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.9438202247191012, |
|
"grad_norm": 0.25133228302001953, |
|
"learning_rate": 5.9162868884311596e-06, |
|
"loss": 0.1541, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.947906026557712, |
|
"grad_norm": 0.27924278378486633, |
|
"learning_rate": 5.875477551303596e-06, |
|
"loss": 0.1588, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.9519918283963227, |
|
"grad_norm": 0.23838290572166443, |
|
"learning_rate": 5.834750817679606e-06, |
|
"loss": 0.1559, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.9560776302349336, |
|
"grad_norm": 0.20889320969581604, |
|
"learning_rate": 5.794107503210187e-06, |
|
"loss": 0.1376, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.9601634320735446, |
|
"grad_norm": 0.24007071554660797, |
|
"learning_rate": 5.753548421875686e-06, |
|
"loss": 0.1641, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.9642492339121551, |
|
"grad_norm": 0.25776174664497375, |
|
"learning_rate": 5.713074385969457e-06, |
|
"loss": 0.1486, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.968335035750766, |
|
"grad_norm": 0.24709415435791016, |
|
"learning_rate": 5.672686206081638e-06, |
|
"loss": 0.1647, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.972420837589377, |
|
"grad_norm": 0.2545711398124695, |
|
"learning_rate": 5.632384691082874e-06, |
|
"loss": 0.1558, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.9765066394279878, |
|
"grad_norm": 0.25180289149284363, |
|
"learning_rate": 5.5921706481081405e-06, |
|
"loss": 0.1405, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.9805924412665985, |
|
"grad_norm": 0.2353358417749405, |
|
"learning_rate": 5.55204488254059e-06, |
|
"loss": 0.1496, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.9846782431052095, |
|
"grad_norm": 0.25672510266304016, |
|
"learning_rate": 5.512008197995379e-06, |
|
"loss": 0.1557, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.9887640449438202, |
|
"grad_norm": 0.24256597459316254, |
|
"learning_rate": 5.47206139630363e-06, |
|
"loss": 0.1366, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.992849846782431, |
|
"grad_norm": 0.2704496681690216, |
|
"learning_rate": 5.432205277496327e-06, |
|
"loss": 0.1492, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.996935648621042, |
|
"grad_norm": 0.24868719279766083, |
|
"learning_rate": 5.3924406397883174e-06, |
|
"loss": 0.1632, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 2.004085801838611, |
|
"grad_norm": 0.5672191977500916, |
|
"learning_rate": 5.352768279562315e-06, |
|
"loss": 0.2733, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 2.0081716036772215, |
|
"grad_norm": 0.15086303651332855, |
|
"learning_rate": 5.313188991352964e-06, |
|
"loss": 0.1187, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.0122574055158324, |
|
"grad_norm": 0.14860980212688446, |
|
"learning_rate": 5.273703567830908e-06, |
|
"loss": 0.1086, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 2.0163432073544434, |
|
"grad_norm": 0.16813896596431732, |
|
"learning_rate": 5.234312799786921e-06, |
|
"loss": 0.1109, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 2.0163432073544434, |
|
"eval_loss": 0.27407562732696533, |
|
"eval_runtime": 5.3937, |
|
"eval_samples_per_second": 14.647, |
|
"eval_steps_per_second": 1.854, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 2.0204290091930543, |
|
"grad_norm": 0.16929540038108826, |
|
"learning_rate": 5.195017476116089e-06, |
|
"loss": 0.1202, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 2.024514811031665, |
|
"grad_norm": 0.17897042632102966, |
|
"learning_rate": 5.155818383801976e-06, |
|
"loss": 0.1236, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 2.028600612870276, |
|
"grad_norm": 0.1761506348848343, |
|
"learning_rate": 5.116716307900893e-06, |
|
"loss": 0.117, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.032686414708887, |
|
"grad_norm": 0.22216179966926575, |
|
"learning_rate": 5.077712031526153e-06, |
|
"loss": 0.1182, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 2.0367722165474973, |
|
"grad_norm": 0.16491776704788208, |
|
"learning_rate": 5.038806335832414e-06, |
|
"loss": 0.1146, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 2.0408580183861083, |
|
"grad_norm": 0.1395263373851776, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 0.1119, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 2.044943820224719, |
|
"grad_norm": 0.15789788961410522, |
|
"learning_rate": 4.961293801219328e-06, |
|
"loss": 0.1224, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 2.0490296220633297, |
|
"grad_norm": 0.16785749793052673, |
|
"learning_rate": 4.922688514675325e-06, |
|
"loss": 0.1087, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0531154239019407, |
|
"grad_norm": 0.1611642688512802, |
|
"learning_rate": 4.8841849135319015e-06, |
|
"loss": 0.1107, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 2.0572012257405516, |
|
"grad_norm": 0.1986808031797409, |
|
"learning_rate": 4.845783768916482e-06, |
|
"loss": 0.118, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 2.0612870275791626, |
|
"grad_norm": 0.19198960065841675, |
|
"learning_rate": 4.8074858499045405e-06, |
|
"loss": 0.132, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 2.065372829417773, |
|
"grad_norm": 0.2015804499387741, |
|
"learning_rate": 4.769291923504226e-06, |
|
"loss": 0.1133, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 2.069458631256384, |
|
"grad_norm": 0.17852617800235748, |
|
"learning_rate": 4.731202754640969e-06, |
|
"loss": 0.106, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.073544433094995, |
|
"grad_norm": 0.1829053908586502, |
|
"learning_rate": 4.693219106142186e-06, |
|
"loss": 0.114, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 2.0776302349336055, |
|
"grad_norm": 0.1654207408428192, |
|
"learning_rate": 4.655341738721989e-06, |
|
"loss": 0.11, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 2.0817160367722165, |
|
"grad_norm": 0.1808289736509323, |
|
"learning_rate": 4.617571410965964e-06, |
|
"loss": 0.123, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 2.0858018386108275, |
|
"grad_norm": 0.18115544319152832, |
|
"learning_rate": 4.579908879315962e-06, |
|
"loss": 0.1143, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 2.0898876404494384, |
|
"grad_norm": 0.18940667808055878, |
|
"learning_rate": 4.542354898054953e-06, |
|
"loss": 0.1233, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.093973442288049, |
|
"grad_norm": 0.1968260258436203, |
|
"learning_rate": 4.504910219291941e-06, |
|
"loss": 0.1086, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 2.09805924412666, |
|
"grad_norm": 0.1746767908334732, |
|
"learning_rate": 4.467575592946865e-06, |
|
"loss": 0.1211, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 2.102145045965271, |
|
"grad_norm": 0.16945403814315796, |
|
"learning_rate": 4.430351766735609e-06, |
|
"loss": 0.1103, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 2.1062308478038814, |
|
"grad_norm": 0.16376259922981262, |
|
"learning_rate": 4.393239486155011e-06, |
|
"loss": 0.1058, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 2.1103166496424923, |
|
"grad_norm": 0.19992703199386597, |
|
"learning_rate": 4.356239494467952e-06, |
|
"loss": 0.115, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.1144024514811033, |
|
"grad_norm": 0.16445453464984894, |
|
"learning_rate": 4.319352532688444e-06, |
|
"loss": 0.0995, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 2.118488253319714, |
|
"grad_norm": 0.16085082292556763, |
|
"learning_rate": 4.282579339566802e-06, |
|
"loss": 0.1064, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 2.1225740551583248, |
|
"grad_norm": 0.18840906023979187, |
|
"learning_rate": 4.245920651574864e-06, |
|
"loss": 0.1204, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 2.1266598569969357, |
|
"grad_norm": 0.1801845282316208, |
|
"learning_rate": 4.209377202891212e-06, |
|
"loss": 0.1099, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 2.1307456588355467, |
|
"grad_norm": 0.19049790501594543, |
|
"learning_rate": 4.172949725386488e-06, |
|
"loss": 0.1075, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.134831460674157, |
|
"grad_norm": 0.14935700595378876, |
|
"learning_rate": 4.13663894860873e-06, |
|
"loss": 0.0983, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 2.138917262512768, |
|
"grad_norm": 0.1842905879020691, |
|
"learning_rate": 4.100445599768774e-06, |
|
"loss": 0.1108, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 2.143003064351379, |
|
"grad_norm": 0.16270151734352112, |
|
"learning_rate": 4.0643704037256556e-06, |
|
"loss": 0.1088, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 2.1470888661899896, |
|
"grad_norm": 0.16617348790168762, |
|
"learning_rate": 4.028414082972141e-06, |
|
"loss": 0.1065, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 2.1511746680286006, |
|
"grad_norm": 0.19406820833683014, |
|
"learning_rate": 3.99257735762021e-06, |
|
"loss": 0.112, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.1552604698672115, |
|
"grad_norm": 0.1523066610097885, |
|
"learning_rate": 3.956860945386677e-06, |
|
"loss": 0.102, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 2.1593462717058225, |
|
"grad_norm": 0.1776452362537384, |
|
"learning_rate": 3.921265561578781e-06, |
|
"loss": 0.1113, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 2.163432073544433, |
|
"grad_norm": 0.1429518759250641, |
|
"learning_rate": 3.885791919079878e-06, |
|
"loss": 0.0951, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 2.167517875383044, |
|
"grad_norm": 0.15642093122005463, |
|
"learning_rate": 3.850440728335171e-06, |
|
"loss": 0.0996, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 2.171603677221655, |
|
"grad_norm": 0.16918523609638214, |
|
"learning_rate": 3.815212697337451e-06, |
|
"loss": 0.1014, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.1756894790602654, |
|
"grad_norm": 0.1890415996313095, |
|
"learning_rate": 3.7801085316129615e-06, |
|
"loss": 0.1195, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 2.1797752808988764, |
|
"grad_norm": 0.1914844810962677, |
|
"learning_rate": 3.745128934207225e-06, |
|
"loss": 0.1112, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 2.1838610827374874, |
|
"grad_norm": 0.18005676567554474, |
|
"learning_rate": 3.7102746056710025e-06, |
|
"loss": 0.1103, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 2.187946884576098, |
|
"grad_norm": 0.18145185708999634, |
|
"learning_rate": 3.6755462440462288e-06, |
|
"loss": 0.1044, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 2.192032686414709, |
|
"grad_norm": 0.18146003782749176, |
|
"learning_rate": 3.6409445448520533e-06, |
|
"loss": 0.1033, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.19611848825332, |
|
"grad_norm": 0.15457145869731903, |
|
"learning_rate": 3.606470201070904e-06, |
|
"loss": 0.1044, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 2.2002042900919307, |
|
"grad_norm": 0.16790038347244263, |
|
"learning_rate": 3.5721239031346067e-06, |
|
"loss": 0.1148, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 2.2042900919305413, |
|
"grad_norm": 0.18558472394943237, |
|
"learning_rate": 3.5379063389105727e-06, |
|
"loss": 0.1085, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 2.208375893769152, |
|
"grad_norm": 0.1862039566040039, |
|
"learning_rate": 3.5038181936879932e-06, |
|
"loss": 0.1138, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 2.212461695607763, |
|
"grad_norm": 0.20844437181949615, |
|
"learning_rate": 3.4698601501641517e-06, |
|
"loss": 0.1072, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.2165474974463737, |
|
"grad_norm": 0.17171606421470642, |
|
"learning_rate": 3.4360328884307058e-06, |
|
"loss": 0.0999, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 2.2206332992849847, |
|
"grad_norm": 0.1774645447731018, |
|
"learning_rate": 3.4023370859601192e-06, |
|
"loss": 0.1039, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 2.2247191011235956, |
|
"grad_norm": 0.18054482340812683, |
|
"learning_rate": 3.3687734175920505e-06, |
|
"loss": 0.1057, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 2.2288049029622066, |
|
"grad_norm": 0.21771778166294098, |
|
"learning_rate": 3.335342555519855e-06, |
|
"loss": 0.1208, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 2.232890704800817, |
|
"grad_norm": 0.17016687989234924, |
|
"learning_rate": 3.3020451692771337e-06, |
|
"loss": 0.1124, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.236976506639428, |
|
"grad_norm": 0.15365496277809143, |
|
"learning_rate": 3.2688819257242963e-06, |
|
"loss": 0.1004, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 2.241062308478039, |
|
"grad_norm": 0.17153044044971466, |
|
"learning_rate": 3.235853489035241e-06, |
|
"loss": 0.1103, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 2.2451481103166495, |
|
"grad_norm": 0.19271762669086456, |
|
"learning_rate": 3.2029605206840088e-06, |
|
"loss": 0.117, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 2.2492339121552605, |
|
"grad_norm": 0.1731119304895401, |
|
"learning_rate": 3.1702036794315837e-06, |
|
"loss": 0.1027, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 2.2533197139938714, |
|
"grad_norm": 0.17215265333652496, |
|
"learning_rate": 3.1375836213126653e-06, |
|
"loss": 0.1125, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.257405515832482, |
|
"grad_norm": 0.17249596118927002, |
|
"learning_rate": 3.1051009996225434e-06, |
|
"loss": 0.1021, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 2.261491317671093, |
|
"grad_norm": 0.1713964343070984, |
|
"learning_rate": 3.0727564649040066e-06, |
|
"loss": 0.1141, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 2.265577119509704, |
|
"grad_norm": 0.15855085849761963, |
|
"learning_rate": 3.040550664934332e-06, |
|
"loss": 0.1144, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 2.2696629213483144, |
|
"grad_norm": 0.15687625110149384, |
|
"learning_rate": 3.008484244712286e-06, |
|
"loss": 0.0936, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 2.2737487231869253, |
|
"grad_norm": 0.17462803423404694, |
|
"learning_rate": 2.976557846445225e-06, |
|
"loss": 0.1106, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.2778345250255363, |
|
"grad_norm": 0.1664649397134781, |
|
"learning_rate": 2.9447721095362325e-06, |
|
"loss": 0.1086, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 2.2819203268641473, |
|
"grad_norm": 0.18073546886444092, |
|
"learning_rate": 2.9131276705713008e-06, |
|
"loss": 0.1138, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 2.2860061287027578, |
|
"grad_norm": 0.18023085594177246, |
|
"learning_rate": 2.8816251633065963e-06, |
|
"loss": 0.1002, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 2.2900919305413687, |
|
"grad_norm": 0.16031518578529358, |
|
"learning_rate": 2.8502652186557546e-06, |
|
"loss": 0.0979, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 2.2941777323799797, |
|
"grad_norm": 0.16551966965198517, |
|
"learning_rate": 2.819048464677261e-06, |
|
"loss": 0.1077, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.2982635342185906, |
|
"grad_norm": 0.20203644037246704, |
|
"learning_rate": 2.7879755265618558e-06, |
|
"loss": 0.1244, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 2.302349336057201, |
|
"grad_norm": 0.1538315862417221, |
|
"learning_rate": 2.7570470266200177e-06, |
|
"loss": 0.108, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 2.306435137895812, |
|
"grad_norm": 0.17754122614860535, |
|
"learning_rate": 2.726263584269513e-06, |
|
"loss": 0.1062, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 2.310520939734423, |
|
"grad_norm": 0.20123209059238434, |
|
"learning_rate": 2.69562581602297e-06, |
|
"loss": 0.1158, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 2.3146067415730336, |
|
"grad_norm": 0.22771477699279785, |
|
"learning_rate": 2.6651343354755453e-06, |
|
"loss": 0.1081, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.3186925434116445, |
|
"grad_norm": 0.20241940021514893, |
|
"learning_rate": 2.6347897532926293e-06, |
|
"loss": 0.1159, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 2.3227783452502555, |
|
"grad_norm": 0.17595934867858887, |
|
"learning_rate": 2.6045926771976306e-06, |
|
"loss": 0.1155, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 2.326864147088866, |
|
"grad_norm": 0.1674242913722992, |
|
"learning_rate": 2.5745437119597704e-06, |
|
"loss": 0.1057, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 2.330949948927477, |
|
"grad_norm": 0.1701403558254242, |
|
"learning_rate": 2.5446434593820156e-06, |
|
"loss": 0.1, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 2.335035750766088, |
|
"grad_norm": 0.18844358623027802, |
|
"learning_rate": 2.514892518288988e-06, |
|
"loss": 0.111, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.3391215526046985, |
|
"grad_norm": 0.1788492351770401, |
|
"learning_rate": 2.485291484515e-06, |
|
"loss": 0.1065, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 2.3432073544433094, |
|
"grad_norm": 0.15636250376701355, |
|
"learning_rate": 2.455840950892099e-06, |
|
"loss": 0.0929, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 2.3472931562819204, |
|
"grad_norm": 0.18517901003360748, |
|
"learning_rate": 2.4265415072382016e-06, |
|
"loss": 0.1069, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 2.3513789581205313, |
|
"grad_norm": 0.1569896936416626, |
|
"learning_rate": 2.3973937403452983e-06, |
|
"loss": 0.0946, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 2.3513789581205313, |
|
"eval_loss": 0.2827538251876831, |
|
"eval_runtime": 5.5006, |
|
"eval_samples_per_second": 14.362, |
|
"eval_steps_per_second": 1.818, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 2.355464759959142, |
|
"grad_norm": 0.1794838309288025, |
|
"learning_rate": 2.368398233967668e-06, |
|
"loss": 0.1243, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.359550561797753, |
|
"grad_norm": 0.16749253869056702, |
|
"learning_rate": 2.339555568810221e-06, |
|
"loss": 0.1163, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 2.3636363636363638, |
|
"grad_norm": 0.17650862038135529, |
|
"learning_rate": 2.3108663225168436e-06, |
|
"loss": 0.1056, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 2.3677221654749743, |
|
"grad_norm": 0.15966176986694336, |
|
"learning_rate": 2.28233106965885e-06, |
|
"loss": 0.0985, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 2.3718079673135852, |
|
"grad_norm": 0.16680054366588593, |
|
"learning_rate": 2.2539503817234553e-06, |
|
"loss": 0.1158, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 2.375893769152196, |
|
"grad_norm": 0.1764342486858368, |
|
"learning_rate": 2.2257248271023424e-06, |
|
"loss": 0.1114, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.379979570990807, |
|
"grad_norm": 0.1928333342075348, |
|
"learning_rate": 2.1976549710802754e-06, |
|
"loss": 0.1047, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 2.3840653728294177, |
|
"grad_norm": 0.1916080117225647, |
|
"learning_rate": 2.1697413758237785e-06, |
|
"loss": 0.1049, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 2.3881511746680286, |
|
"grad_norm": 0.17298974096775055, |
|
"learning_rate": 2.141984600369882e-06, |
|
"loss": 0.1056, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 2.3922369765066396, |
|
"grad_norm": 0.1809270679950714, |
|
"learning_rate": 2.114385200614912e-06, |
|
"loss": 0.1029, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 2.39632277834525, |
|
"grad_norm": 0.17488853633403778, |
|
"learning_rate": 2.0869437293033835e-06, |
|
"loss": 0.1103, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.400408580183861, |
|
"grad_norm": 0.20704808831214905, |
|
"learning_rate": 2.0596607360168897e-06, |
|
"loss": 0.1138, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 2.404494382022472, |
|
"grad_norm": 0.18963633477687836, |
|
"learning_rate": 2.032536767163141e-06, |
|
"loss": 0.1109, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 2.4085801838610825, |
|
"grad_norm": 0.1389179229736328, |
|
"learning_rate": 2.0055723659649907e-06, |
|
"loss": 0.0881, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 2.4126659856996935, |
|
"grad_norm": 0.1745290607213974, |
|
"learning_rate": 1.9787680724495617e-06, |
|
"loss": 0.0973, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 2.4167517875383044, |
|
"grad_norm": 0.1597016453742981, |
|
"learning_rate": 1.952124423437447e-06, |
|
"loss": 0.0938, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.4208375893769154, |
|
"grad_norm": 0.17606374621391296, |
|
"learning_rate": 1.9256419525319316e-06, |
|
"loss": 0.0996, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 2.424923391215526, |
|
"grad_norm": 0.18109583854675293, |
|
"learning_rate": 1.8993211901083353e-06, |
|
"loss": 0.0999, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 2.429009193054137, |
|
"grad_norm": 0.1948283165693283, |
|
"learning_rate": 1.8731626633033573e-06, |
|
"loss": 0.1066, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 2.433094994892748, |
|
"grad_norm": 0.16293251514434814, |
|
"learning_rate": 1.8471668960045575e-06, |
|
"loss": 0.1001, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 2.4371807967313583, |
|
"grad_norm": 0.18315255641937256, |
|
"learning_rate": 1.82133440883983e-06, |
|
"loss": 0.1194, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.4412665985699693, |
|
"grad_norm": 0.20331861078739166, |
|
"learning_rate": 1.7956657191669969e-06, |
|
"loss": 0.1224, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 2.4453524004085803, |
|
"grad_norm": 0.19248269498348236, |
|
"learning_rate": 1.7701613410634367e-06, |
|
"loss": 0.1131, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 2.449438202247191, |
|
"grad_norm": 0.16477394104003906, |
|
"learning_rate": 1.7448217853158e-06, |
|
"loss": 0.1011, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 2.4535240040858017, |
|
"grad_norm": 0.1652829796075821, |
|
"learning_rate": 1.719647559409765e-06, |
|
"loss": 0.0964, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 2.4576098059244127, |
|
"grad_norm": 0.1879151612520218, |
|
"learning_rate": 1.6946391675198838e-06, |
|
"loss": 0.1083, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.4616956077630237, |
|
"grad_norm": 0.17156468331813812, |
|
"learning_rate": 1.6697971104994847e-06, |
|
"loss": 0.117, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 2.465781409601634, |
|
"grad_norm": 0.32563164830207825, |
|
"learning_rate": 1.6451218858706374e-06, |
|
"loss": 0.1413, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 2.469867211440245, |
|
"grad_norm": 0.1915392279624939, |
|
"learning_rate": 1.620613987814189e-06, |
|
"loss": 0.1158, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 2.473953013278856, |
|
"grad_norm": 0.2216257005929947, |
|
"learning_rate": 1.5962739071598709e-06, |
|
"loss": 0.1145, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 2.4780388151174666, |
|
"grad_norm": 0.19583038985729218, |
|
"learning_rate": 1.5721021313764684e-06, |
|
"loss": 0.1139, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.4821246169560776, |
|
"grad_norm": 0.18090856075286865, |
|
"learning_rate": 1.5480991445620541e-06, |
|
"loss": 0.1005, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 2.4862104187946885, |
|
"grad_norm": 0.16902458667755127, |
|
"learning_rate": 1.5242654274342895e-06, |
|
"loss": 0.1077, |
|
"step": 607 |
|
}, |
|
{ |
|
"epoch": 2.4902962206332995, |
|
"grad_norm": 0.20209279656410217, |
|
"learning_rate": 1.500601457320814e-06, |
|
"loss": 0.112, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 2.49438202247191, |
|
"grad_norm": 0.19891391694545746, |
|
"learning_rate": 1.4771077081496654e-06, |
|
"loss": 0.123, |
|
"step": 609 |
|
}, |
|
{ |
|
"epoch": 2.498467824310521, |
|
"grad_norm": 0.16217868030071259, |
|
"learning_rate": 1.453784650439798e-06, |
|
"loss": 0.1023, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.502553626149132, |
|
"grad_norm": 0.18772533535957336, |
|
"learning_rate": 1.4306327512916574e-06, |
|
"loss": 0.1089, |
|
"step": 611 |
|
}, |
|
{ |
|
"epoch": 2.506639427987743, |
|
"grad_norm": 0.17145603895187378, |
|
"learning_rate": 1.407652474377832e-06, |
|
"loss": 0.0982, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 2.5107252298263534, |
|
"grad_norm": 0.15923067927360535, |
|
"learning_rate": 1.384844279933757e-06, |
|
"loss": 0.0982, |
|
"step": 613 |
|
}, |
|
{ |
|
"epoch": 2.5148110316649643, |
|
"grad_norm": 0.1522497981786728, |
|
"learning_rate": 1.3622086247484989e-06, |
|
"loss": 0.1147, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 2.5188968335035753, |
|
"grad_norm": 0.1731443852186203, |
|
"learning_rate": 1.339745962155613e-06, |
|
"loss": 0.096, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.522982635342186, |
|
"grad_norm": 0.2923002243041992, |
|
"learning_rate": 1.3174567420240647e-06, |
|
"loss": 0.1157, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 2.5270684371807968, |
|
"grad_norm": 0.17199784517288208, |
|
"learning_rate": 1.295341410749208e-06, |
|
"loss": 0.1045, |
|
"step": 617 |
|
}, |
|
{ |
|
"epoch": 2.5311542390194077, |
|
"grad_norm": 0.1833561211824417, |
|
"learning_rate": 1.273400411243857e-06, |
|
"loss": 0.1105, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 2.5352400408580182, |
|
"grad_norm": 0.20159518718719482, |
|
"learning_rate": 1.2516341829294155e-06, |
|
"loss": 0.1117, |
|
"step": 619 |
|
}, |
|
{ |
|
"epoch": 2.539325842696629, |
|
"grad_norm": 0.18234069645404816, |
|
"learning_rate": 1.2300431617270669e-06, |
|
"loss": 0.1156, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.54341164453524, |
|
"grad_norm": 0.17154446244239807, |
|
"learning_rate": 1.2086277800490554e-06, |
|
"loss": 0.1128, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 2.5474974463738507, |
|
"grad_norm": 0.18493688106536865, |
|
"learning_rate": 1.1873884667900125e-06, |
|
"loss": 0.1196, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 2.5515832482124616, |
|
"grad_norm": 0.16059179604053497, |
|
"learning_rate": 1.1663256473183858e-06, |
|
"loss": 0.0966, |
|
"step": 623 |
|
}, |
|
{ |
|
"epoch": 2.5556690500510726, |
|
"grad_norm": 0.17189505696296692, |
|
"learning_rate": 1.1454397434679022e-06, |
|
"loss": 0.0964, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 2.559754851889683, |
|
"grad_norm": 0.14766576886177063, |
|
"learning_rate": 1.1247311735291255e-06, |
|
"loss": 0.0972, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.563840653728294, |
|
"grad_norm": 0.18592408299446106, |
|
"learning_rate": 1.1042003522410882e-06, |
|
"loss": 0.1171, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 2.567926455566905, |
|
"grad_norm": 0.1564607173204422, |
|
"learning_rate": 1.083847690782972e-06, |
|
"loss": 0.1054, |
|
"step": 627 |
|
}, |
|
{ |
|
"epoch": 2.572012257405516, |
|
"grad_norm": 0.1658332198858261, |
|
"learning_rate": 1.0636735967658785e-06, |
|
"loss": 0.1115, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 2.576098059244127, |
|
"grad_norm": 0.15802547335624695, |
|
"learning_rate": 1.0436784742246652e-06, |
|
"loss": 0.0945, |
|
"step": 629 |
|
}, |
|
{ |
|
"epoch": 2.5801838610827375, |
|
"grad_norm": 0.16254453361034393, |
|
"learning_rate": 1.0238627236098619e-06, |
|
"loss": 0.0969, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.5842696629213484, |
|
"grad_norm": 0.16888748109340668, |
|
"learning_rate": 1.0042267417796292e-06, |
|
"loss": 0.1015, |
|
"step": 631 |
|
}, |
|
{ |
|
"epoch": 2.5883554647599594, |
|
"grad_norm": 0.1927052140235901, |
|
"learning_rate": 9.8477092199184e-07, |
|
"loss": 0.1129, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 2.59244126659857, |
|
"grad_norm": 0.19136874377727509, |
|
"learning_rate": 9.65495653896179e-07, |
|
"loss": 0.1086, |
|
"step": 633 |
|
}, |
|
{ |
|
"epoch": 2.596527068437181, |
|
"grad_norm": 0.18852423131465912, |
|
"learning_rate": 9.464013235263458e-07, |
|
"loss": 0.1075, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 2.600612870275792, |
|
"grad_norm": 0.17885924875736237, |
|
"learning_rate": 9.274883132923362e-07, |
|
"loss": 0.1023, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.6046986721144023, |
|
"grad_norm": 0.17020930349826813, |
|
"learning_rate": 9.08757001972762e-07, |
|
"loss": 0.1054, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.6087844739530133, |
|
"grad_norm": 0.162260502576828, |
|
"learning_rate": 8.902077647072883e-07, |
|
"loss": 0.0956, |
|
"step": 637 |
|
}, |
|
{ |
|
"epoch": 2.6128702757916242, |
|
"grad_norm": 0.18622389435768127, |
|
"learning_rate": 8.71840972989092e-07, |
|
"loss": 0.1156, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 2.6169560776302347, |
|
"grad_norm": 0.16960519552230835, |
|
"learning_rate": 8.536569946574546e-07, |
|
"loss": 0.1061, |
|
"step": 639 |
|
}, |
|
{ |
|
"epoch": 2.6210418794688457, |
|
"grad_norm": 0.1904251128435135, |
|
"learning_rate": 8.356561938903707e-07, |
|
"loss": 0.0937, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.6251276813074567, |
|
"grad_norm": 0.19458377361297607, |
|
"learning_rate": 8.178389311972612e-07, |
|
"loss": 0.1094, |
|
"step": 641 |
|
}, |
|
{ |
|
"epoch": 2.629213483146067, |
|
"grad_norm": 0.16130903363227844, |
|
"learning_rate": 8.002055634117578e-07, |
|
"loss": 0.0978, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 2.633299284984678, |
|
"grad_norm": 0.3152656853199005, |
|
"learning_rate": 7.827564436845569e-07, |
|
"loss": 0.1257, |
|
"step": 643 |
|
}, |
|
{ |
|
"epoch": 2.637385086823289, |
|
"grad_norm": 0.15360267460346222, |
|
"learning_rate": 7.654919214763357e-07, |
|
"loss": 0.0991, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 2.6414708886619, |
|
"grad_norm": 0.1769319772720337, |
|
"learning_rate": 7.48412342550765e-07, |
|
"loss": 0.1308, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.6455566905005106, |
|
"grad_norm": 0.16229580342769623, |
|
"learning_rate": 7.315180489675822e-07, |
|
"loss": 0.105, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 2.6496424923391215, |
|
"grad_norm": 0.19041888415813446, |
|
"learning_rate": 7.148093790757371e-07, |
|
"loss": 0.1044, |
|
"step": 647 |
|
}, |
|
{ |
|
"epoch": 2.6537282941777325, |
|
"grad_norm": 0.1719420999288559, |
|
"learning_rate": 6.98286667506618e-07, |
|
"loss": 0.103, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 2.6578140960163434, |
|
"grad_norm": 0.17871129512786865, |
|
"learning_rate": 6.819502451673477e-07, |
|
"loss": 0.1089, |
|
"step": 649 |
|
}, |
|
{ |
|
"epoch": 2.661899897854954, |
|
"grad_norm": 0.1983802616596222, |
|
"learning_rate": 6.658004392341633e-07, |
|
"loss": 0.1226, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.665985699693565, |
|
"grad_norm": 0.18126778304576874, |
|
"learning_rate": 6.498375731458529e-07, |
|
"loss": 0.0942, |
|
"step": 651 |
|
}, |
|
{ |
|
"epoch": 2.670071501532176, |
|
"grad_norm": 0.22269214689731598, |
|
"learning_rate": 6.340619665972847e-07, |
|
"loss": 0.12, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 2.6741573033707864, |
|
"grad_norm": 0.17433296144008636, |
|
"learning_rate": 6.184739355330083e-07, |
|
"loss": 0.0908, |
|
"step": 653 |
|
}, |
|
{ |
|
"epoch": 2.6782431052093973, |
|
"grad_norm": 0.1892101764678955, |
|
"learning_rate": 6.030737921409169e-07, |
|
"loss": 0.1206, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 2.6823289070480083, |
|
"grad_norm": 0.15882223844528198, |
|
"learning_rate": 5.878618448460005e-07, |
|
"loss": 0.1025, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.686414708886619, |
|
"grad_norm": 0.16614261269569397, |
|
"learning_rate": 5.728383983041696e-07, |
|
"loss": 0.0968, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 2.686414708886619, |
|
"eval_loss": 0.27833399176597595, |
|
"eval_runtime": 5.2873, |
|
"eval_samples_per_second": 14.941, |
|
"eval_steps_per_second": 1.891, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 2.6905005107252298, |
|
"grad_norm": 0.1896563172340393, |
|
"learning_rate": 5.580037533961546e-07, |
|
"loss": 0.0972, |
|
"step": 657 |
|
}, |
|
{ |
|
"epoch": 2.6945863125638407, |
|
"grad_norm": 0.14636199176311493, |
|
"learning_rate": 5.43358207221476e-07, |
|
"loss": 0.0963, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 2.6986721144024512, |
|
"grad_norm": 0.15723063051700592, |
|
"learning_rate": 5.28902053092496e-07, |
|
"loss": 0.1, |
|
"step": 659 |
|
}, |
|
{ |
|
"epoch": 2.702757916241062, |
|
"grad_norm": 0.17722751200199127, |
|
"learning_rate": 5.146355805285452e-07, |
|
"loss": 0.1079, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.706843718079673, |
|
"grad_norm": 0.19270357489585876, |
|
"learning_rate": 5.005590752501244e-07, |
|
"loss": 0.1004, |
|
"step": 661 |
|
}, |
|
{ |
|
"epoch": 2.710929519918284, |
|
"grad_norm": 0.16365192830562592, |
|
"learning_rate": 4.866728191731829e-07, |
|
"loss": 0.0966, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 2.7150153217568946, |
|
"grad_norm": 0.16443882882595062, |
|
"learning_rate": 4.7297709040346474e-07, |
|
"loss": 0.0995, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 2.7191011235955056, |
|
"grad_norm": 0.18089786171913147, |
|
"learning_rate": 4.594721632309551e-07, |
|
"loss": 0.1028, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 2.7231869254341166, |
|
"grad_norm": 0.16443659365177155, |
|
"learning_rate": 4.4615830812437035e-07, |
|
"loss": 0.1073, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.16640505194664001, |
|
"learning_rate": 4.3303579172574884e-07, |
|
"loss": 0.0955, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 2.731358529111338, |
|
"grad_norm": 0.14559435844421387, |
|
"learning_rate": 4.2010487684511105e-07, |
|
"loss": 0.0885, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 2.735444330949949, |
|
"grad_norm": 0.17073681950569153, |
|
"learning_rate": 4.0736582245519795e-07, |
|
"loss": 0.0995, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 2.73953013278856, |
|
"grad_norm": 0.15490970015525818, |
|
"learning_rate": 3.9481888368627764e-07, |
|
"loss": 0.1093, |
|
"step": 669 |
|
}, |
|
{ |
|
"epoch": 2.7436159346271705, |
|
"grad_norm": 0.1694924384355545, |
|
"learning_rate": 3.824643118210403e-07, |
|
"loss": 0.1058, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.7477017364657814, |
|
"grad_norm": 0.17682881653308868, |
|
"learning_rate": 3.7030235428956895e-07, |
|
"loss": 0.0963, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 2.7517875383043924, |
|
"grad_norm": 0.1799306720495224, |
|
"learning_rate": 3.5833325466437697e-07, |
|
"loss": 0.106, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 2.755873340143003, |
|
"grad_norm": 0.15657733380794525, |
|
"learning_rate": 3.4655725265553276e-07, |
|
"loss": 0.0995, |
|
"step": 673 |
|
}, |
|
{ |
|
"epoch": 2.759959141981614, |
|
"grad_norm": 0.1477828323841095, |
|
"learning_rate": 3.349745841058605e-07, |
|
"loss": 0.0975, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 2.764044943820225, |
|
"grad_norm": 0.162074476480484, |
|
"learning_rate": 3.235854809862193e-07, |
|
"loss": 0.1012, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.7681307456588353, |
|
"grad_norm": 0.15835687518119812, |
|
"learning_rate": 3.1239017139084725e-07, |
|
"loss": 0.0981, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 2.7722165474974463, |
|
"grad_norm": 0.1847173422574997, |
|
"learning_rate": 3.0138887953280573e-07, |
|
"loss": 0.1121, |
|
"step": 677 |
|
}, |
|
{ |
|
"epoch": 2.7763023493360572, |
|
"grad_norm": 0.17560510337352753, |
|
"learning_rate": 2.905818257394799e-07, |
|
"loss": 0.0903, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 2.7803881511746678, |
|
"grad_norm": 0.1589270681142807, |
|
"learning_rate": 2.7996922644817126e-07, |
|
"loss": 0.0931, |
|
"step": 679 |
|
}, |
|
{ |
|
"epoch": 2.7844739530132787, |
|
"grad_norm": 0.20804014801979065, |
|
"learning_rate": 2.6955129420176193e-07, |
|
"loss": 0.1128, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.7885597548518897, |
|
"grad_norm": 0.16977344453334808, |
|
"learning_rate": 2.593282376444539e-07, |
|
"loss": 0.0988, |
|
"step": 681 |
|
}, |
|
{ |
|
"epoch": 2.7926455566905006, |
|
"grad_norm": 0.20647379755973816, |
|
"learning_rate": 2.493002615175977e-07, |
|
"loss": 0.1138, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 2.7967313585291116, |
|
"grad_norm": 0.17715241014957428, |
|
"learning_rate": 2.3946756665558457e-07, |
|
"loss": 0.1117, |
|
"step": 683 |
|
}, |
|
{ |
|
"epoch": 2.800817160367722, |
|
"grad_norm": 0.16735659539699554, |
|
"learning_rate": 2.2983034998182997e-07, |
|
"loss": 0.0986, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 2.804902962206333, |
|
"grad_norm": 0.1884794533252716, |
|
"learning_rate": 2.2038880450482635e-07, |
|
"loss": 0.1075, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.808988764044944, |
|
"grad_norm": 0.18061450123786926, |
|
"learning_rate": 2.11143119314281e-07, |
|
"loss": 0.1187, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 2.8130745658835545, |
|
"grad_norm": 0.18973694741725922, |
|
"learning_rate": 2.0209347957732328e-07, |
|
"loss": 0.0976, |
|
"step": 687 |
|
}, |
|
{ |
|
"epoch": 2.8171603677221655, |
|
"grad_norm": 0.16097040474414825, |
|
"learning_rate": 1.9324006653480332e-07, |
|
"loss": 0.1105, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 2.8212461695607765, |
|
"grad_norm": 0.18477506935596466, |
|
"learning_rate": 1.845830574976548e-07, |
|
"loss": 0.109, |
|
"step": 689 |
|
}, |
|
{ |
|
"epoch": 2.825331971399387, |
|
"grad_norm": 0.18649916350841522, |
|
"learning_rate": 1.761226258433524e-07, |
|
"loss": 0.1098, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.829417773237998, |
|
"grad_norm": 0.1677372008562088, |
|
"learning_rate": 1.6785894101243205e-07, |
|
"loss": 0.0956, |
|
"step": 691 |
|
}, |
|
{ |
|
"epoch": 2.833503575076609, |
|
"grad_norm": 0.18176457285881042, |
|
"learning_rate": 1.5979216850509848e-07, |
|
"loss": 0.1085, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 2.8375893769152194, |
|
"grad_norm": 0.18912231922149658, |
|
"learning_rate": 1.519224698779198e-07, |
|
"loss": 0.1212, |
|
"step": 693 |
|
}, |
|
{ |
|
"epoch": 2.8416751787538304, |
|
"grad_norm": 0.158539280295372, |
|
"learning_rate": 1.4425000274057577e-07, |
|
"loss": 0.1033, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 2.8457609805924413, |
|
"grad_norm": 0.18345937132835388, |
|
"learning_rate": 1.367749207527147e-07, |
|
"loss": 0.112, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.849846782431052, |
|
"grad_norm": 0.1726057231426239, |
|
"learning_rate": 1.2949737362087156e-07, |
|
"loss": 0.1051, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 2.853932584269663, |
|
"grad_norm": 0.15934613347053528, |
|
"learning_rate": 1.2241750709546918e-07, |
|
"loss": 0.1102, |
|
"step": 697 |
|
}, |
|
{ |
|
"epoch": 2.8580183861082737, |
|
"grad_norm": 0.1927506923675537, |
|
"learning_rate": 1.1553546296789952e-07, |
|
"loss": 0.1037, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 2.8621041879468847, |
|
"grad_norm": 0.18815335631370544, |
|
"learning_rate": 1.0885137906768373e-07, |
|
"loss": 0.1076, |
|
"step": 699 |
|
}, |
|
{ |
|
"epoch": 2.8661899897854957, |
|
"grad_norm": 0.16629058122634888, |
|
"learning_rate": 1.0236538925971429e-07, |
|
"loss": 0.0988, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.870275791624106, |
|
"grad_norm": 0.18162870407104492, |
|
"learning_rate": 9.607762344156946e-08, |
|
"loss": 0.1088, |
|
"step": 701 |
|
}, |
|
{ |
|
"epoch": 2.874361593462717, |
|
"grad_norm": 0.1731279492378235, |
|
"learning_rate": 8.99882075409153e-08, |
|
"loss": 0.1004, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 2.878447395301328, |
|
"grad_norm": 0.18259696662425995, |
|
"learning_rate": 8.409726351298441e-08, |
|
"loss": 0.0976, |
|
"step": 703 |
|
}, |
|
{ |
|
"epoch": 2.8825331971399386, |
|
"grad_norm": 0.16938042640686035, |
|
"learning_rate": 7.840490933812783e-08, |
|
"loss": 0.0924, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 2.8866189989785496, |
|
"grad_norm": 0.19586586952209473, |
|
"learning_rate": 7.291125901946027e-08, |
|
"loss": 0.104, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.8907048008171605, |
|
"grad_norm": 0.17584678530693054, |
|
"learning_rate": 6.761642258056977e-08, |
|
"loss": 0.1156, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 2.894790602655771, |
|
"grad_norm": 0.19378376007080078, |
|
"learning_rate": 6.252050606332049e-08, |
|
"loss": 0.0969, |
|
"step": 707 |
|
}, |
|
{ |
|
"epoch": 2.898876404494382, |
|
"grad_norm": 0.1698794811964035, |
|
"learning_rate": 5.7623611525721155e-08, |
|
"loss": 0.1018, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 2.902962206332993, |
|
"grad_norm": 0.18257030844688416, |
|
"learning_rate": 5.292583703988885e-08, |
|
"loss": 0.1136, |
|
"step": 709 |
|
}, |
|
{ |
|
"epoch": 2.9070480081716035, |
|
"grad_norm": 0.17587602138519287, |
|
"learning_rate": 4.8427276690081735e-08, |
|
"loss": 0.0981, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.9111338100102144, |
|
"grad_norm": 0.1458127200603485, |
|
"learning_rate": 4.412802057081278e-08, |
|
"loss": 0.0896, |
|
"step": 711 |
|
}, |
|
{ |
|
"epoch": 2.9152196118488254, |
|
"grad_norm": 0.20564356446266174, |
|
"learning_rate": 4.002815478505007e-08, |
|
"loss": 0.1078, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 2.919305413687436, |
|
"grad_norm": 0.18535220623016357, |
|
"learning_rate": 3.612776144248597e-08, |
|
"loss": 0.1122, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 2.923391215526047, |
|
"grad_norm": 0.17741656303405762, |
|
"learning_rate": 3.242691865790071e-08, |
|
"loss": 0.094, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 2.927477017364658, |
|
"grad_norm": 0.1449773907661438, |
|
"learning_rate": 2.8925700549589096e-08, |
|
"loss": 0.0972, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.9315628192032688, |
|
"grad_norm": 0.17832088470458984, |
|
"learning_rate": 2.5624177237884017e-08, |
|
"loss": 0.097, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 2.9356486210418797, |
|
"grad_norm": 0.17102226614952087, |
|
"learning_rate": 2.2522414843748618e-08, |
|
"loss": 0.1049, |
|
"step": 717 |
|
}, |
|
{ |
|
"epoch": 2.9397344228804902, |
|
"grad_norm": 0.17965158820152283, |
|
"learning_rate": 1.962047548744961e-08, |
|
"loss": 0.0871, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 2.943820224719101, |
|
"grad_norm": 0.18163391947746277, |
|
"learning_rate": 1.6918417287318245e-08, |
|
"loss": 0.1123, |
|
"step": 719 |
|
}, |
|
{ |
|
"epoch": 2.947906026557712, |
|
"grad_norm": 0.16020391881465912, |
|
"learning_rate": 1.4416294358582383e-08, |
|
"loss": 0.0958, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.9519918283963227, |
|
"grad_norm": 0.15003512799739838, |
|
"learning_rate": 1.2114156812284006e-08, |
|
"loss": 0.088, |
|
"step": 721 |
|
}, |
|
{ |
|
"epoch": 2.9560776302349336, |
|
"grad_norm": 0.15326683223247528, |
|
"learning_rate": 1.0012050754277802e-08, |
|
"loss": 0.0928, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 2.9601634320735446, |
|
"grad_norm": 0.1993480920791626, |
|
"learning_rate": 8.110018284304132e-09, |
|
"loss": 0.108, |
|
"step": 723 |
|
}, |
|
{ |
|
"epoch": 2.964249233912155, |
|
"grad_norm": 0.15941934287548065, |
|
"learning_rate": 6.40809749514637e-09, |
|
"loss": 0.1031, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 2.968335035750766, |
|
"grad_norm": 0.17054963111877441, |
|
"learning_rate": 4.9063224718726154e-09, |
|
"loss": 0.1164, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.972420837589377, |
|
"grad_norm": 0.3544883728027344, |
|
"learning_rate": 3.6047232911462506e-09, |
|
"loss": 0.1313, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 2.9765066394279875, |
|
"grad_norm": 0.1857762336730957, |
|
"learning_rate": 2.5033260206275277e-09, |
|
"loss": 0.1108, |
|
"step": 727 |
|
}, |
|
{ |
|
"epoch": 2.9805924412665985, |
|
"grad_norm": 0.18837085366249084, |
|
"learning_rate": 1.6021527184528761e-09, |
|
"loss": 0.1058, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 2.9846782431052095, |
|
"grad_norm": 0.16174080967903137, |
|
"learning_rate": 9.012214327897006e-10, |
|
"loss": 0.1195, |
|
"step": 729 |
|
}, |
|
{ |
|
"epoch": 2.98876404494382, |
|
"grad_norm": 0.18156123161315918, |
|
"learning_rate": 4.005462014766703e-10, |
|
"loss": 0.1092, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.992849846782431, |
|
"grad_norm": 0.1630842536687851, |
|
"learning_rate": 1.0013705174061195e-10, |
|
"loss": 0.1045, |
|
"step": 731 |
|
}, |
|
{ |
|
"epoch": 2.996935648621042, |
|
"grad_norm": 0.1864880621433258, |
|
"learning_rate": 0.0, |
|
"loss": 0.1053, |
|
"step": 732 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 732, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 244, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.237981338905084e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|