0x1202's picture
Training in progress, epoch 0, checkpoint
edb7f2f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.17012227538543329,
"eval_steps": 100,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004253056884635832,
"eval_loss": 1.0832202434539795,
"eval_runtime": 32.6752,
"eval_samples_per_second": 30.298,
"eval_steps_per_second": 15.149,
"step": 1
},
{
"epoch": 0.002126528442317916,
"grad_norm": 0.7841684222221375,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.7602,
"step": 5
},
{
"epoch": 0.004253056884635832,
"grad_norm": 0.921779453754425,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.8528,
"step": 10
},
{
"epoch": 0.006379585326953748,
"grad_norm": 0.9533646106719971,
"learning_rate": 5e-05,
"loss": 1.0032,
"step": 15
},
{
"epoch": 0.008506113769271665,
"grad_norm": 0.9230514764785767,
"learning_rate": 6.666666666666667e-05,
"loss": 0.9854,
"step": 20
},
{
"epoch": 0.01063264221158958,
"grad_norm": 0.907342255115509,
"learning_rate": 8.333333333333334e-05,
"loss": 0.9975,
"step": 25
},
{
"epoch": 0.012759170653907496,
"grad_norm": 1.0756146907806396,
"learning_rate": 0.0001,
"loss": 1.0462,
"step": 30
},
{
"epoch": 0.014885699096225412,
"grad_norm": 1.1117802858352661,
"learning_rate": 9.995494831023409e-05,
"loss": 1.0375,
"step": 35
},
{
"epoch": 0.01701222753854333,
"grad_norm": 1.5911064147949219,
"learning_rate": 9.981987442712633e-05,
"loss": 0.8992,
"step": 40
},
{
"epoch": 0.019138755980861243,
"grad_norm": 1.1971668004989624,
"learning_rate": 9.959502176294383e-05,
"loss": 0.7314,
"step": 45
},
{
"epoch": 0.02126528442317916,
"grad_norm": 1.0723843574523926,
"learning_rate": 9.928079551738543e-05,
"loss": 0.7988,
"step": 50
},
{
"epoch": 0.023391812865497075,
"grad_norm": 0.5400782227516174,
"learning_rate": 9.887776194738432e-05,
"loss": 0.7173,
"step": 55
},
{
"epoch": 0.025518341307814992,
"grad_norm": 0.5201585292816162,
"learning_rate": 9.838664734667495e-05,
"loss": 0.7165,
"step": 60
},
{
"epoch": 0.02764486975013291,
"grad_norm": 0.5986294150352478,
"learning_rate": 9.780833673696254e-05,
"loss": 0.7515,
"step": 65
},
{
"epoch": 0.029771398192450824,
"grad_norm": 0.6949173808097839,
"learning_rate": 9.714387227305422e-05,
"loss": 0.8674,
"step": 70
},
{
"epoch": 0.03189792663476874,
"grad_norm": 0.7511739730834961,
"learning_rate": 9.639445136482548e-05,
"loss": 0.8676,
"step": 75
},
{
"epoch": 0.03402445507708666,
"grad_norm": 0.8884425759315491,
"learning_rate": 9.55614245194068e-05,
"loss": 0.9426,
"step": 80
},
{
"epoch": 0.03615098351940457,
"grad_norm": 1.1355093717575073,
"learning_rate": 9.464629290747842e-05,
"loss": 0.9845,
"step": 85
},
{
"epoch": 0.03827751196172249,
"grad_norm": 0.7691839933395386,
"learning_rate": 9.365070565805941e-05,
"loss": 0.7586,
"step": 90
},
{
"epoch": 0.04040404040404041,
"grad_norm": 0.9200711846351624,
"learning_rate": 9.257645688666556e-05,
"loss": 0.7443,
"step": 95
},
{
"epoch": 0.04253056884635832,
"grad_norm": 1.6259804964065552,
"learning_rate": 9.142548246219212e-05,
"loss": 0.7642,
"step": 100
},
{
"epoch": 0.04253056884635832,
"eval_loss": 0.7883314490318298,
"eval_runtime": 32.4892,
"eval_samples_per_second": 30.472,
"eval_steps_per_second": 15.236,
"step": 100
},
{
"epoch": 0.044657097288676235,
"grad_norm": 0.45502185821533203,
"learning_rate": 9.019985651834703e-05,
"loss": 0.6729,
"step": 105
},
{
"epoch": 0.04678362573099415,
"grad_norm": 0.5358193516731262,
"learning_rate": 8.890178771592199e-05,
"loss": 0.7389,
"step": 110
},
{
"epoch": 0.04891015417331207,
"grad_norm": 0.6835756301879883,
"learning_rate": 8.753361526263621e-05,
"loss": 0.7012,
"step": 115
},
{
"epoch": 0.051036682615629984,
"grad_norm": 0.7369337677955627,
"learning_rate": 8.609780469772623e-05,
"loss": 0.7961,
"step": 120
},
{
"epoch": 0.0531632110579479,
"grad_norm": 0.7770721316337585,
"learning_rate": 8.459694344887732e-05,
"loss": 0.8449,
"step": 125
},
{
"epoch": 0.05528973950026582,
"grad_norm": 0.9152151346206665,
"learning_rate": 8.303373616950408e-05,
"loss": 0.7941,
"step": 130
},
{
"epoch": 0.05741626794258373,
"grad_norm": 1.1904630661010742,
"learning_rate": 8.141099986478212e-05,
"loss": 0.8666,
"step": 135
},
{
"epoch": 0.05954279638490165,
"grad_norm": 1.1595726013183594,
"learning_rate": 7.973165881521434e-05,
"loss": 0.8341,
"step": 140
},
{
"epoch": 0.06166932482721956,
"grad_norm": 0.9822093844413757,
"learning_rate": 7.799873930687978e-05,
"loss": 0.8322,
"step": 145
},
{
"epoch": 0.06379585326953748,
"grad_norm": 1.2074886560440063,
"learning_rate": 7.621536417786159e-05,
"loss": 0.6697,
"step": 150
},
{
"epoch": 0.0659223817118554,
"grad_norm": 0.5012809038162231,
"learning_rate": 7.438474719068173e-05,
"loss": 0.6677,
"step": 155
},
{
"epoch": 0.06804891015417332,
"grad_norm": 0.588683545589447,
"learning_rate": 7.251018724088367e-05,
"loss": 0.6848,
"step": 160
},
{
"epoch": 0.07017543859649122,
"grad_norm": 0.6733798980712891,
"learning_rate": 7.059506241219965e-05,
"loss": 0.6562,
"step": 165
},
{
"epoch": 0.07230196703880915,
"grad_norm": 0.6261290311813354,
"learning_rate": 6.864282388901544e-05,
"loss": 0.731,
"step": 170
},
{
"epoch": 0.07442849548112707,
"grad_norm": 0.9433246850967407,
"learning_rate": 6.665698973710288e-05,
"loss": 0.7953,
"step": 175
},
{
"epoch": 0.07655502392344497,
"grad_norm": 0.8979871273040771,
"learning_rate": 6.464113856382752e-05,
"loss": 0.8661,
"step": 180
},
{
"epoch": 0.0786815523657629,
"grad_norm": 0.8001134395599365,
"learning_rate": 6.259890306925627e-05,
"loss": 0.8672,
"step": 185
},
{
"epoch": 0.08080808080808081,
"grad_norm": 0.9000585675239563,
"learning_rate": 6.0533963499786314e-05,
"loss": 0.7966,
"step": 190
},
{
"epoch": 0.08293460925039872,
"grad_norm": 0.9679039120674133,
"learning_rate": 5.8450041016092464e-05,
"loss": 0.7287,
"step": 195
},
{
"epoch": 0.08506113769271664,
"grad_norm": 0.9612869024276733,
"learning_rate": 5.6350890987343944e-05,
"loss": 0.6869,
"step": 200
},
{
"epoch": 0.08506113769271664,
"eval_loss": 0.74885493516922,
"eval_runtime": 32.4345,
"eval_samples_per_second": 30.523,
"eval_steps_per_second": 15.262,
"step": 200
},
{
"epoch": 0.08718766613503455,
"grad_norm": 0.45820483565330505,
"learning_rate": 5.4240296223775465e-05,
"loss": 0.5357,
"step": 205
},
{
"epoch": 0.08931419457735247,
"grad_norm": 0.6561435461044312,
"learning_rate": 5.212206015980742e-05,
"loss": 0.7261,
"step": 210
},
{
"epoch": 0.09144072301967039,
"grad_norm": 0.706652045249939,
"learning_rate": 5e-05,
"loss": 0.7772,
"step": 215
},
{
"epoch": 0.0935672514619883,
"grad_norm": 0.7001965641975403,
"learning_rate": 4.78779398401926e-05,
"loss": 0.829,
"step": 220
},
{
"epoch": 0.09569377990430622,
"grad_norm": 0.9561212658882141,
"learning_rate": 4.575970377622456e-05,
"loss": 0.8341,
"step": 225
},
{
"epoch": 0.09782030834662414,
"grad_norm": 1.039799690246582,
"learning_rate": 4.364910901265606e-05,
"loss": 0.796,
"step": 230
},
{
"epoch": 0.09994683678894205,
"grad_norm": 1.2402150630950928,
"learning_rate": 4.1549958983907555e-05,
"loss": 0.8059,
"step": 235
},
{
"epoch": 0.10207336523125997,
"grad_norm": 0.7450068593025208,
"learning_rate": 3.94660365002137e-05,
"loss": 0.7875,
"step": 240
},
{
"epoch": 0.10419989367357789,
"grad_norm": 0.8528944253921509,
"learning_rate": 3.740109693074375e-05,
"loss": 0.7362,
"step": 245
},
{
"epoch": 0.1063264221158958,
"grad_norm": 1.327515721321106,
"learning_rate": 3.5358861436172485e-05,
"loss": 0.8017,
"step": 250
},
{
"epoch": 0.10845295055821372,
"grad_norm": 0.5836326479911804,
"learning_rate": 3.334301026289712e-05,
"loss": 0.6268,
"step": 255
},
{
"epoch": 0.11057947900053164,
"grad_norm": 0.5506888031959534,
"learning_rate": 3.135717611098458e-05,
"loss": 0.6116,
"step": 260
},
{
"epoch": 0.11270600744284955,
"grad_norm": 0.7588187456130981,
"learning_rate": 2.9404937587800375e-05,
"loss": 0.6762,
"step": 265
},
{
"epoch": 0.11483253588516747,
"grad_norm": 0.8159733414649963,
"learning_rate": 2.748981275911633e-05,
"loss": 0.8094,
"step": 270
},
{
"epoch": 0.11695906432748537,
"grad_norm": 0.815102756023407,
"learning_rate": 2.5615252809318284e-05,
"loss": 0.6845,
"step": 275
},
{
"epoch": 0.1190855927698033,
"grad_norm": 0.990524172782898,
"learning_rate": 2.3784635822138424e-05,
"loss": 0.7858,
"step": 280
},
{
"epoch": 0.12121212121212122,
"grad_norm": 1.0764540433883667,
"learning_rate": 2.2001260693120233e-05,
"loss": 0.8093,
"step": 285
},
{
"epoch": 0.12333864965443912,
"grad_norm": 0.8942254185676575,
"learning_rate": 2.026834118478567e-05,
"loss": 0.698,
"step": 290
},
{
"epoch": 0.12546517809675706,
"grad_norm": 1.1617945432662964,
"learning_rate": 1.858900013521788e-05,
"loss": 0.8188,
"step": 295
},
{
"epoch": 0.12759170653907495,
"grad_norm": 1.6578449010849,
"learning_rate": 1.6966263830495936e-05,
"loss": 0.6237,
"step": 300
},
{
"epoch": 0.12759170653907495,
"eval_loss": 0.7355306148529053,
"eval_runtime": 32.3471,
"eval_samples_per_second": 30.606,
"eval_steps_per_second": 15.303,
"step": 300
},
{
"epoch": 0.12971823498139287,
"grad_norm": 0.5733609795570374,
"learning_rate": 1.5403056551122697e-05,
"loss": 0.6604,
"step": 305
},
{
"epoch": 0.1318447634237108,
"grad_norm": 0.611932635307312,
"learning_rate": 1.3902195302273779e-05,
"loss": 0.6755,
"step": 310
},
{
"epoch": 0.1339712918660287,
"grad_norm": 0.6192579865455627,
"learning_rate": 1.246638473736378e-05,
"loss": 0.7273,
"step": 315
},
{
"epoch": 0.13609782030834663,
"grad_norm": 2.9742281436920166,
"learning_rate": 1.1098212284078036e-05,
"loss": 0.7241,
"step": 320
},
{
"epoch": 0.13822434875066453,
"grad_norm": 0.8530343174934387,
"learning_rate": 9.800143481652979e-06,
"loss": 0.784,
"step": 325
},
{
"epoch": 0.14035087719298245,
"grad_norm": 1.0411911010742188,
"learning_rate": 8.574517537807897e-06,
"loss": 0.7906,
"step": 330
},
{
"epoch": 0.14247740563530037,
"grad_norm": 1.3315730094909668,
"learning_rate": 7.423543113334436e-06,
"loss": 0.7863,
"step": 335
},
{
"epoch": 0.1446039340776183,
"grad_norm": 0.773786187171936,
"learning_rate": 6.349294341940593e-06,
"loss": 0.7532,
"step": 340
},
{
"epoch": 0.1467304625199362,
"grad_norm": 0.8631799817085266,
"learning_rate": 5.353707092521582e-06,
"loss": 0.7101,
"step": 345
},
{
"epoch": 0.14885699096225413,
"grad_norm": 1.1954724788665771,
"learning_rate": 4.43857548059321e-06,
"loss": 0.8269,
"step": 350
},
{
"epoch": 0.15098351940457203,
"grad_norm": 0.5093288421630859,
"learning_rate": 3.605548635174533e-06,
"loss": 0.6281,
"step": 355
},
{
"epoch": 0.15311004784688995,
"grad_norm": 0.5996906161308289,
"learning_rate": 2.85612772694579e-06,
"loss": 0.6559,
"step": 360
},
{
"epoch": 0.15523657628920787,
"grad_norm": 0.7968803644180298,
"learning_rate": 2.191663263037458e-06,
"loss": 0.7888,
"step": 365
},
{
"epoch": 0.1573631047315258,
"grad_norm": 0.6822494864463806,
"learning_rate": 1.6133526533250565e-06,
"loss": 0.7492,
"step": 370
},
{
"epoch": 0.1594896331738437,
"grad_norm": 0.9301332235336304,
"learning_rate": 1.1222380526156928e-06,
"loss": 0.7315,
"step": 375
},
{
"epoch": 0.16161616161616163,
"grad_norm": 0.9796470999717712,
"learning_rate": 7.192044826145771e-07,
"loss": 0.8134,
"step": 380
},
{
"epoch": 0.16374269005847952,
"grad_norm": 1.4449524879455566,
"learning_rate": 4.049782370561583e-07,
"loss": 0.8235,
"step": 385
},
{
"epoch": 0.16586921850079744,
"grad_norm": 0.774215817451477,
"learning_rate": 1.8012557287367392e-07,
"loss": 0.6907,
"step": 390
},
{
"epoch": 0.16799574694311536,
"grad_norm": 0.9537460207939148,
"learning_rate": 4.5051689765929214e-08,
"loss": 0.6446,
"step": 395
},
{
"epoch": 0.17012227538543329,
"grad_norm": 1.345268726348877,
"learning_rate": 0.0,
"loss": 0.5913,
"step": 400
},
{
"epoch": 0.17012227538543329,
"eval_loss": 0.7333823442459106,
"eval_runtime": 32.4093,
"eval_samples_per_second": 30.547,
"eval_steps_per_second": 15.273,
"step": 400
}
],
"logging_steps": 5,
"max_steps": 400,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2523217739055104e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}