furmaniak committed on
Commit
3d53b59
·
verified ·
1 Parent(s): 261dafd

End of training

Files changed (5)
  1. README.md +1 -1
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +1425 -11
  5. training_loss.png +0 -0
README.md CHANGED
@@ -16,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  # qwen2.5-32b-openalex
18
 
19
- This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on an unknown dataset.
20
 
21
  ## Model description
22
 
 
16
 
17
  # qwen2.5-32b-openalex
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on the openalex dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.9977164939399262,
3
- "total_flos": 4998655006212096.0,
4
- "train_loss": 1.5558027747651222,
5
- "train_runtime": 142154.3689,
6
- "train_samples_per_second": 0.641,
7
- "train_steps_per_second": 0.002
8
  }
 
1
  {
2
+ "epoch": 0.9991031390134529,
3
+ "total_flos": 7841700554735616.0,
4
+ "train_loss": 0.4275342236729456,
5
+ "train_runtime": 63728.3403,
6
+ "train_samples_per_second": 2.239,
7
+ "train_steps_per_second": 0.009
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.9977164939399262,
3
- "total_flos": 4998655006212096.0,
4
- "train_loss": 1.5558027747651222,
5
- "train_runtime": 142154.3689,
6
- "train_samples_per_second": 0.641,
7
- "train_steps_per_second": 0.002
8
  }
 
1
  {
2
+ "epoch": 0.9991031390134529,
3
+ "total_flos": 7841700554735616.0,
4
+ "train_loss": 0.4275342236729456,
5
+ "train_runtime": 63728.3403,
6
+ "train_samples_per_second": 2.239,
7
+ "train_steps_per_second": 0.009
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9977164939399262,
5
  "eval_steps": 500,
6
- "global_step": 355,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2494,17 +2494,1431 @@
2494
  "step": 355
2495
  },
2496
  {
2497
- "epoch": 0.9977164939399262,
2498
- "step": 355,
2499
- "total_flos": 4998655006212096.0,
2500
- "train_loss": 1.5558027747651222,
2501
- "train_runtime": 142154.3689,
2502
- "train_samples_per_second": 0.641,
2503
- "train_steps_per_second": 0.002
2504
  }
2505
  ],
2506
  "logging_steps": 1,
2507
- "max_steps": 355,
2508
  "num_input_tokens_seen": 0,
2509
  "num_train_epochs": 1,
2510
  "save_steps": 100,
@@ -2520,7 +3934,7 @@
2520
  "attributes": {}
2521
  }
2522
  },
2523
- "total_flos": 4998655006212096.0,
2524
  "train_batch_size": 2,
2525
  "trial_name": null,
2526
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9991031390134529,
5
  "eval_steps": 500,
6
+ "global_step": 557,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2494
  "step": 355
2495
  },
2496
  {
2497
+ "epoch": 0.6385650224215247,
2498
+ "grad_norm": 0.010235507041215897,
2499
+ "learning_rate": 2e-05,
2500
+ "loss": 1.5054,
2501
+ "step": 356
2502
+ },
2503
+ {
2504
+ "epoch": 0.6403587443946188,
2505
+ "grad_norm": 0.010560178197920322,
2506
+ "learning_rate": 2e-05,
2507
+ "loss": 1.5118,
2508
+ "step": 357
2509
+ },
2510
+ {
2511
+ "epoch": 0.6421524663677131,
2512
+ "grad_norm": 0.010353959165513515,
2513
+ "learning_rate": 2e-05,
2514
+ "loss": 1.5226,
2515
+ "step": 358
2516
+ },
2517
+ {
2518
+ "epoch": 0.6439461883408072,
2519
+ "grad_norm": 0.010382940992712975,
2520
+ "learning_rate": 2e-05,
2521
+ "loss": 1.5078,
2522
+ "step": 359
2523
+ },
2524
+ {
2525
+ "epoch": 0.6457399103139013,
2526
+ "grad_norm": 0.009856803342700005,
2527
+ "learning_rate": 2e-05,
2528
+ "loss": 1.5167,
2529
+ "step": 360
2530
+ },
2531
+ {
2532
+ "epoch": 0.6475336322869956,
2533
+ "grad_norm": 0.010195410810410976,
2534
+ "learning_rate": 2e-05,
2535
+ "loss": 1.5142,
2536
+ "step": 361
2537
+ },
2538
+ {
2539
+ "epoch": 0.6493273542600897,
2540
+ "grad_norm": 0.010302864946424961,
2541
+ "learning_rate": 2e-05,
2542
+ "loss": 1.5136,
2543
+ "step": 362
2544
+ },
2545
+ {
2546
+ "epoch": 0.6511210762331838,
2547
+ "grad_norm": 0.010046405717730522,
2548
+ "learning_rate": 2e-05,
2549
+ "loss": 1.5112,
2550
+ "step": 363
2551
+ },
2552
+ {
2553
+ "epoch": 0.6529147982062781,
2554
+ "grad_norm": 0.010849208571016788,
2555
+ "learning_rate": 2e-05,
2556
+ "loss": 1.5114,
2557
+ "step": 364
2558
+ },
2559
+ {
2560
+ "epoch": 0.6547085201793722,
2561
+ "grad_norm": 0.010421674698591232,
2562
+ "learning_rate": 2e-05,
2563
+ "loss": 1.5173,
2564
+ "step": 365
2565
+ },
2566
+ {
2567
+ "epoch": 0.6565022421524663,
2568
+ "grad_norm": 0.00989589188247919,
2569
+ "learning_rate": 2e-05,
2570
+ "loss": 1.5063,
2571
+ "step": 366
2572
+ },
2573
+ {
2574
+ "epoch": 0.6582959641255606,
2575
+ "grad_norm": 0.010465629398822784,
2576
+ "learning_rate": 2e-05,
2577
+ "loss": 1.5031,
2578
+ "step": 367
2579
+ },
2580
+ {
2581
+ "epoch": 0.6600896860986547,
2582
+ "grad_norm": 0.009964341297745705,
2583
+ "learning_rate": 2e-05,
2584
+ "loss": 1.5207,
2585
+ "step": 368
2586
+ },
2587
+ {
2588
+ "epoch": 0.6618834080717488,
2589
+ "grad_norm": 0.01189314667135477,
2590
+ "learning_rate": 2e-05,
2591
+ "loss": 1.5361,
2592
+ "step": 369
2593
+ },
2594
+ {
2595
+ "epoch": 0.6636771300448431,
2596
+ "grad_norm": 0.01012677513062954,
2597
+ "learning_rate": 2e-05,
2598
+ "loss": 1.5215,
2599
+ "step": 370
2600
+ },
2601
+ {
2602
+ "epoch": 0.6654708520179372,
2603
+ "grad_norm": 0.009877102449536324,
2604
+ "learning_rate": 2e-05,
2605
+ "loss": 1.5262,
2606
+ "step": 371
2607
+ },
2608
+ {
2609
+ "epoch": 0.6672645739910313,
2610
+ "grad_norm": 0.01000463031232357,
2611
+ "learning_rate": 2e-05,
2612
+ "loss": 1.5183,
2613
+ "step": 372
2614
+ },
2615
+ {
2616
+ "epoch": 0.6690582959641256,
2617
+ "grad_norm": 0.010188892483711243,
2618
+ "learning_rate": 2e-05,
2619
+ "loss": 1.5183,
2620
+ "step": 373
2621
+ },
2622
+ {
2623
+ "epoch": 0.6708520179372197,
2624
+ "grad_norm": 0.010129815898835659,
2625
+ "learning_rate": 2e-05,
2626
+ "loss": 1.5245,
2627
+ "step": 374
2628
+ },
2629
+ {
2630
+ "epoch": 0.672645739910314,
2631
+ "grad_norm": 0.010608335956931114,
2632
+ "learning_rate": 2e-05,
2633
+ "loss": 1.5169,
2634
+ "step": 375
2635
+ },
2636
+ {
2637
+ "epoch": 0.6744394618834081,
2638
+ "grad_norm": 0.010223207995295525,
2639
+ "learning_rate": 2e-05,
2640
+ "loss": 1.5185,
2641
+ "step": 376
2642
+ },
2643
+ {
2644
+ "epoch": 0.6762331838565022,
2645
+ "grad_norm": 0.010141369886696339,
2646
+ "learning_rate": 2e-05,
2647
+ "loss": 1.5161,
2648
+ "step": 377
2649
+ },
2650
+ {
2651
+ "epoch": 0.6780269058295965,
2652
+ "grad_norm": 0.01027351152151823,
2653
+ "learning_rate": 2e-05,
2654
+ "loss": 1.5119,
2655
+ "step": 378
2656
+ },
2657
+ {
2658
+ "epoch": 0.6798206278026906,
2659
+ "grad_norm": 0.010362266562879086,
2660
+ "learning_rate": 2e-05,
2661
+ "loss": 1.5126,
2662
+ "step": 379
2663
+ },
2664
+ {
2665
+ "epoch": 0.6816143497757847,
2666
+ "grad_norm": 0.010336722247302532,
2667
+ "learning_rate": 2e-05,
2668
+ "loss": 1.5173,
2669
+ "step": 380
2670
+ },
2671
+ {
2672
+ "epoch": 0.683408071748879,
2673
+ "grad_norm": 0.01007298193871975,
2674
+ "learning_rate": 2e-05,
2675
+ "loss": 1.5075,
2676
+ "step": 381
2677
+ },
2678
+ {
2679
+ "epoch": 0.6852017937219731,
2680
+ "grad_norm": 0.010275410488247871,
2681
+ "learning_rate": 2e-05,
2682
+ "loss": 1.5123,
2683
+ "step": 382
2684
+ },
2685
+ {
2686
+ "epoch": 0.6869955156950672,
2687
+ "grad_norm": 0.010203160345554352,
2688
+ "learning_rate": 2e-05,
2689
+ "loss": 1.5151,
2690
+ "step": 383
2691
+ },
2692
+ {
2693
+ "epoch": 0.6887892376681615,
2694
+ "grad_norm": 0.010127630084753036,
2695
+ "learning_rate": 2e-05,
2696
+ "loss": 1.5211,
2697
+ "step": 384
2698
+ },
2699
+ {
2700
+ "epoch": 0.6905829596412556,
2701
+ "grad_norm": 0.009799284860491753,
2702
+ "learning_rate": 2e-05,
2703
+ "loss": 1.5191,
2704
+ "step": 385
2705
+ },
2706
+ {
2707
+ "epoch": 0.6923766816143497,
2708
+ "grad_norm": 0.01014394499361515,
2709
+ "learning_rate": 2e-05,
2710
+ "loss": 1.5261,
2711
+ "step": 386
2712
+ },
2713
+ {
2714
+ "epoch": 0.694170403587444,
2715
+ "grad_norm": 0.010567774064838886,
2716
+ "learning_rate": 2e-05,
2717
+ "loss": 1.5232,
2718
+ "step": 387
2719
+ },
2720
+ {
2721
+ "epoch": 0.6959641255605381,
2722
+ "grad_norm": 0.010051852092146873,
2723
+ "learning_rate": 2e-05,
2724
+ "loss": 1.5212,
2725
+ "step": 388
2726
+ },
2727
+ {
2728
+ "epoch": 0.6977578475336322,
2729
+ "grad_norm": 0.010241293348371983,
2730
+ "learning_rate": 2e-05,
2731
+ "loss": 1.5094,
2732
+ "step": 389
2733
+ },
2734
+ {
2735
+ "epoch": 0.6995515695067265,
2736
+ "grad_norm": 0.0095717404037714,
2737
+ "learning_rate": 2e-05,
2738
+ "loss": 1.5115,
2739
+ "step": 390
2740
+ },
2741
+ {
2742
+ "epoch": 0.7013452914798206,
2743
+ "grad_norm": 0.00974031537771225,
2744
+ "learning_rate": 2e-05,
2745
+ "loss": 1.5195,
2746
+ "step": 391
2747
+ },
2748
+ {
2749
+ "epoch": 0.7031390134529149,
2750
+ "grad_norm": 0.010140657424926758,
2751
+ "learning_rate": 2e-05,
2752
+ "loss": 1.5048,
2753
+ "step": 392
2754
+ },
2755
+ {
2756
+ "epoch": 0.704932735426009,
2757
+ "grad_norm": 0.010055477730929852,
2758
+ "learning_rate": 2e-05,
2759
+ "loss": 1.5162,
2760
+ "step": 393
2761
+ },
2762
+ {
2763
+ "epoch": 0.7067264573991031,
2764
+ "grad_norm": 0.01005468424409628,
2765
+ "learning_rate": 2e-05,
2766
+ "loss": 1.5258,
2767
+ "step": 394
2768
+ },
2769
+ {
2770
+ "epoch": 0.7085201793721974,
2771
+ "grad_norm": 0.010284669697284698,
2772
+ "learning_rate": 2e-05,
2773
+ "loss": 1.5094,
2774
+ "step": 395
2775
+ },
2776
+ {
2777
+ "epoch": 0.7103139013452915,
2778
+ "grad_norm": 0.010200968012213707,
2779
+ "learning_rate": 2e-05,
2780
+ "loss": 1.5172,
2781
+ "step": 396
2782
+ },
2783
+ {
2784
+ "epoch": 0.7121076233183856,
2785
+ "grad_norm": 0.01015354972332716,
2786
+ "learning_rate": 2e-05,
2787
+ "loss": 1.5117,
2788
+ "step": 397
2789
+ },
2790
+ {
2791
+ "epoch": 0.7139013452914799,
2792
+ "grad_norm": 0.009913373738527298,
2793
+ "learning_rate": 2e-05,
2794
+ "loss": 1.5268,
2795
+ "step": 398
2796
+ },
2797
+ {
2798
+ "epoch": 0.715695067264574,
2799
+ "grad_norm": 0.010287330485880375,
2800
+ "learning_rate": 2e-05,
2801
+ "loss": 1.5211,
2802
+ "step": 399
2803
+ },
2804
+ {
2805
+ "epoch": 0.7174887892376681,
2806
+ "grad_norm": 0.01057345885783434,
2807
+ "learning_rate": 2e-05,
2808
+ "loss": 1.5199,
2809
+ "step": 400
2810
+ },
2811
+ {
2812
+ "epoch": 0.7192825112107624,
2813
+ "grad_norm": 0.010113878175616264,
2814
+ "learning_rate": 2e-05,
2815
+ "loss": 1.5168,
2816
+ "step": 401
2817
+ },
2818
+ {
2819
+ "epoch": 0.7210762331838565,
2820
+ "grad_norm": 0.009940318763256073,
2821
+ "learning_rate": 2e-05,
2822
+ "loss": 1.5175,
2823
+ "step": 402
2824
+ },
2825
+ {
2826
+ "epoch": 0.7228699551569506,
2827
+ "grad_norm": 0.010180394165217876,
2828
+ "learning_rate": 2e-05,
2829
+ "loss": 1.5211,
2830
+ "step": 403
2831
+ },
2832
+ {
2833
+ "epoch": 0.7246636771300449,
2834
+ "grad_norm": 0.00961736124008894,
2835
+ "learning_rate": 2e-05,
2836
+ "loss": 1.5228,
2837
+ "step": 404
2838
+ },
2839
+ {
2840
+ "epoch": 0.726457399103139,
2841
+ "grad_norm": 0.010378845036029816,
2842
+ "learning_rate": 2e-05,
2843
+ "loss": 1.522,
2844
+ "step": 405
2845
+ },
2846
+ {
2847
+ "epoch": 0.7282511210762331,
2848
+ "grad_norm": 0.010189516469836235,
2849
+ "learning_rate": 2e-05,
2850
+ "loss": 1.525,
2851
+ "step": 406
2852
+ },
2853
+ {
2854
+ "epoch": 0.7300448430493274,
2855
+ "grad_norm": 0.010004358366131783,
2856
+ "learning_rate": 2e-05,
2857
+ "loss": 1.5172,
2858
+ "step": 407
2859
+ },
2860
+ {
2861
+ "epoch": 0.7318385650224215,
2862
+ "grad_norm": 0.010387993417680264,
2863
+ "learning_rate": 2e-05,
2864
+ "loss": 1.5246,
2865
+ "step": 408
2866
+ },
2867
+ {
2868
+ "epoch": 0.7336322869955157,
2869
+ "grad_norm": 0.010004810988903046,
2870
+ "learning_rate": 2e-05,
2871
+ "loss": 1.5132,
2872
+ "step": 409
2873
+ },
2874
+ {
2875
+ "epoch": 0.7354260089686099,
2876
+ "grad_norm": 0.009845850057899952,
2877
+ "learning_rate": 2e-05,
2878
+ "loss": 1.5248,
2879
+ "step": 410
2880
+ },
2881
+ {
2882
+ "epoch": 0.737219730941704,
2883
+ "grad_norm": 0.010015097446739674,
2884
+ "learning_rate": 2e-05,
2885
+ "loss": 1.5196,
2886
+ "step": 411
2887
+ },
2888
+ {
2889
+ "epoch": 0.7390134529147983,
2890
+ "grad_norm": 0.009975203312933445,
2891
+ "learning_rate": 2e-05,
2892
+ "loss": 1.5096,
2893
+ "step": 412
2894
+ },
2895
+ {
2896
+ "epoch": 0.7408071748878924,
2897
+ "grad_norm": 0.010078891180455685,
2898
+ "learning_rate": 2e-05,
2899
+ "loss": 1.5162,
2900
+ "step": 413
2901
+ },
2902
+ {
2903
+ "epoch": 0.7426008968609865,
2904
+ "grad_norm": 0.011885426007211208,
2905
+ "learning_rate": 2e-05,
2906
+ "loss": 1.5189,
2907
+ "step": 414
2908
+ },
2909
+ {
2910
+ "epoch": 0.7443946188340808,
2911
+ "grad_norm": 0.009693853557109833,
2912
+ "learning_rate": 2e-05,
2913
+ "loss": 1.5194,
2914
+ "step": 415
2915
+ },
2916
+ {
2917
+ "epoch": 0.7461883408071749,
2918
+ "grad_norm": 0.010337116196751595,
2919
+ "learning_rate": 2e-05,
2920
+ "loss": 1.5191,
2921
+ "step": 416
2922
+ },
2923
+ {
2924
+ "epoch": 0.747982062780269,
2925
+ "grad_norm": 0.00993486400693655,
2926
+ "learning_rate": 2e-05,
2927
+ "loss": 1.5177,
2928
+ "step": 417
2929
+ },
2930
+ {
2931
+ "epoch": 0.7497757847533633,
2932
+ "grad_norm": 0.010143253020942211,
2933
+ "learning_rate": 2e-05,
2934
+ "loss": 1.514,
2935
+ "step": 418
2936
+ },
2937
+ {
2938
+ "epoch": 0.7515695067264574,
2939
+ "grad_norm": 0.010233073495328426,
2940
+ "learning_rate": 2e-05,
2941
+ "loss": 1.5154,
2942
+ "step": 419
2943
+ },
2944
+ {
2945
+ "epoch": 0.7533632286995515,
2946
+ "grad_norm": 0.009982983581721783,
2947
+ "learning_rate": 2e-05,
2948
+ "loss": 1.5223,
2949
+ "step": 420
2950
+ },
2951
+ {
2952
+ "epoch": 0.7551569506726458,
2953
+ "grad_norm": 0.010409766808152199,
2954
+ "learning_rate": 2e-05,
2955
+ "loss": 1.5152,
2956
+ "step": 421
2957
+ },
2958
+ {
2959
+ "epoch": 0.7569506726457399,
2960
+ "grad_norm": 0.0099264495074749,
2961
+ "learning_rate": 2e-05,
2962
+ "loss": 1.5185,
2963
+ "step": 422
2964
+ },
2965
+ {
2966
+ "epoch": 0.758744394618834,
2967
+ "grad_norm": 0.009928545914590359,
2968
+ "learning_rate": 2e-05,
2969
+ "loss": 1.4986,
2970
+ "step": 423
2971
+ },
2972
+ {
2973
+ "epoch": 0.7605381165919283,
2974
+ "grad_norm": 0.009940563701093197,
2975
+ "learning_rate": 2e-05,
2976
+ "loss": 1.5071,
2977
+ "step": 424
2978
+ },
2979
+ {
2980
+ "epoch": 0.7623318385650224,
2981
+ "grad_norm": 0.010767797008156776,
2982
+ "learning_rate": 2e-05,
2983
+ "loss": 1.5006,
2984
+ "step": 425
2985
+ },
2986
+ {
2987
+ "epoch": 0.7641255605381166,
2988
+ "grad_norm": 0.010551121085882187,
2989
+ "learning_rate": 2e-05,
2990
+ "loss": 1.5201,
2991
+ "step": 426
2992
+ },
2993
+ {
2994
+ "epoch": 0.7659192825112108,
2995
+ "grad_norm": 0.010118665173649788,
2996
+ "learning_rate": 2e-05,
2997
+ "loss": 1.5213,
2998
+ "step": 427
2999
+ },
3000
+ {
3001
+ "epoch": 0.7677130044843049,
3002
+ "grad_norm": 0.010247626341879368,
3003
+ "learning_rate": 2e-05,
3004
+ "loss": 1.5178,
3005
+ "step": 428
3006
+ },
3007
+ {
3008
+ "epoch": 0.7695067264573991,
3009
+ "grad_norm": 0.010188435204327106,
3010
+ "learning_rate": 2e-05,
3011
+ "loss": 1.5085,
3012
+ "step": 429
3013
+ },
3014
+ {
3015
+ "epoch": 0.7713004484304933,
3016
+ "grad_norm": 0.010428003035485744,
3017
+ "learning_rate": 2e-05,
3018
+ "loss": 1.5124,
3019
+ "step": 430
3020
+ },
3021
+ {
3022
+ "epoch": 0.7730941704035874,
3023
+ "grad_norm": 0.01012035645544529,
3024
+ "learning_rate": 2e-05,
3025
+ "loss": 1.5299,
3026
+ "step": 431
3027
+ },
3028
+ {
3029
+ "epoch": 0.7748878923766817,
3030
+ "grad_norm": 0.010584665462374687,
3031
+ "learning_rate": 2e-05,
3032
+ "loss": 1.5095,
3033
+ "step": 432
3034
+ },
3035
+ {
3036
+ "epoch": 0.7766816143497758,
3037
+ "grad_norm": 0.009979243390262127,
3038
+ "learning_rate": 2e-05,
3039
+ "loss": 1.5193,
3040
+ "step": 433
3041
+ },
3042
+ {
3043
+ "epoch": 0.7784753363228699,
3044
+ "grad_norm": 0.00958004966378212,
3045
+ "learning_rate": 2e-05,
3046
+ "loss": 1.5214,
3047
+ "step": 434
3048
+ },
3049
+ {
3050
+ "epoch": 0.7802690582959642,
3051
+ "grad_norm": 0.00973733700811863,
3052
+ "learning_rate": 2e-05,
3053
+ "loss": 1.5208,
3054
+ "step": 435
3055
+ },
3056
+ {
3057
+ "epoch": 0.7820627802690583,
3058
+ "grad_norm": 0.010465665720403194,
3059
+ "learning_rate": 2e-05,
3060
+ "loss": 1.5227,
3061
+ "step": 436
3062
+ },
3063
+ {
3064
+ "epoch": 0.7838565022421524,
3065
+ "grad_norm": 0.010098133236169815,
3066
+ "learning_rate": 2e-05,
3067
+ "loss": 1.5248,
3068
+ "step": 437
3069
+ },
3070
+ {
3071
+ "epoch": 0.7856502242152467,
3072
+ "grad_norm": 0.10259313136339188,
3073
+ "learning_rate": 2e-05,
3074
+ "loss": 1.5222,
3075
+ "step": 438
3076
+ },
3077
+ {
3078
+ "epoch": 0.7874439461883408,
3079
+ "grad_norm": 0.01040815282613039,
3080
+ "learning_rate": 2e-05,
3081
+ "loss": 1.5205,
3082
+ "step": 439
3083
+ },
3084
+ {
3085
+ "epoch": 0.7892376681614349,
3086
+ "grad_norm": 0.010325520299375057,
3087
+ "learning_rate": 2e-05,
3088
+ "loss": 1.5189,
3089
+ "step": 440
3090
+ },
3091
+ {
3092
+ "epoch": 0.7910313901345292,
3093
+ "grad_norm": 0.010079775005578995,
3094
+ "learning_rate": 2e-05,
3095
+ "loss": 1.5156,
3096
+ "step": 441
3097
+ },
3098
+ {
3099
+ "epoch": 0.7928251121076233,
3100
+ "grad_norm": 0.010167201980948448,
3101
+ "learning_rate": 2e-05,
3102
+ "loss": 1.5116,
3103
+ "step": 442
3104
+ },
3105
+ {
3106
+ "epoch": 0.7946188340807175,
3107
+ "grad_norm": 0.010806124657392502,
3108
+ "learning_rate": 2e-05,
3109
+ "loss": 1.5153,
3110
+ "step": 443
3111
+ },
3112
+ {
3113
+ "epoch": 0.7964125560538117,
3114
+ "grad_norm": 0.010324080474674702,
3115
+ "learning_rate": 2e-05,
3116
+ "loss": 1.5246,
3117
+ "step": 444
3118
+ },
3119
+ {
3120
+ "epoch": 0.7982062780269058,
3121
+ "grad_norm": 0.010092305950820446,
3122
+ "learning_rate": 2e-05,
3123
+ "loss": 1.5282,
3124
+ "step": 445
3125
+ },
3126
+ {
3127
+ "epoch": 0.8,
3128
+ "grad_norm": 0.01007048413157463,
3129
+ "learning_rate": 2e-05,
3130
+ "loss": 1.5108,
3131
+ "step": 446
3132
+ },
3133
+ {
3134
+ "epoch": 0.8017937219730942,
3135
+ "grad_norm": 0.010184276849031448,
3136
+ "learning_rate": 2e-05,
3137
+ "loss": 1.51,
3138
+ "step": 447
3139
+ },
3140
+ {
3141
+ "epoch": 0.8035874439461883,
3142
+ "grad_norm": 0.010521662421524525,
3143
+ "learning_rate": 2e-05,
3144
+ "loss": 1.5139,
3145
+ "step": 448
3146
+ },
3147
+ {
3148
+ "epoch": 0.8053811659192825,
3149
+ "grad_norm": 0.010600044392049313,
3150
+ "learning_rate": 2e-05,
3151
+ "loss": 1.5091,
3152
+ "step": 449
3153
+ },
3154
+ {
3155
+ "epoch": 0.8071748878923767,
3156
+ "grad_norm": 0.009714100509881973,
3157
+ "learning_rate": 2e-05,
3158
+ "loss": 1.5122,
3159
+ "step": 450
3160
+ },
3161
+ {
3162
+ "epoch": 0.8089686098654708,
3163
+ "grad_norm": 0.010295005515217781,
3164
+ "learning_rate": 2e-05,
3165
+ "loss": 1.52,
3166
+ "step": 451
3167
+ },
3168
+ {
3169
+ "epoch": 0.810762331838565,
3170
+ "grad_norm": 0.010034569539129734,
3171
+ "learning_rate": 2e-05,
3172
+ "loss": 1.5197,
3173
+ "step": 452
3174
+ },
3175
+ {
3176
+ "epoch": 0.8125560538116592,
3177
+ "grad_norm": 0.010086962021887302,
3178
+ "learning_rate": 2e-05,
3179
+ "loss": 1.5117,
3180
+ "step": 453
3181
+ },
3182
+ {
3183
+ "epoch": 0.8143497757847533,
3184
+ "grad_norm": 0.010277335532009602,
3185
+ "learning_rate": 2e-05,
3186
+ "loss": 1.5033,
3187
+ "step": 454
3188
+ },
3189
+ {
3190
+ "epoch": 0.8161434977578476,
3191
+ "grad_norm": 0.010540721006691456,
3192
+ "learning_rate": 2e-05,
3193
+ "loss": 1.5166,
3194
+ "step": 455
3195
+ },
3196
+ {
3197
+ "epoch": 0.8179372197309417,
3198
+ "grad_norm": 0.009755424223840237,
3199
+ "learning_rate": 2e-05,
3200
+ "loss": 1.5149,
3201
+ "step": 456
3202
+ },
3203
+ {
3204
+ "epoch": 0.8197309417040358,
3205
+ "grad_norm": 0.00984253827482462,
3206
+ "learning_rate": 2e-05,
3207
+ "loss": 1.5093,
3208
+ "step": 457
3209
+ },
3210
+ {
3211
+ "epoch": 0.8215246636771301,
3212
+ "grad_norm": 0.009836334735155106,
3213
+ "learning_rate": 2e-05,
3214
+ "loss": 1.5141,
3215
+ "step": 458
3216
+ },
3217
+ {
3218
+ "epoch": 0.8233183856502242,
3219
+ "grad_norm": 0.01032332144677639,
3220
+ "learning_rate": 2e-05,
3221
+ "loss": 1.5241,
3222
+ "step": 459
3223
+ },
3224
+ {
3225
+ "epoch": 0.8251121076233184,
3226
+ "grad_norm": 0.010635129176080227,
3227
+ "learning_rate": 2e-05,
3228
+ "loss": 1.5068,
3229
+ "step": 460
3230
+ },
3231
+ {
3232
+ "epoch": 0.8269058295964126,
3233
+ "grad_norm": 0.009664127603173256,
3234
+ "learning_rate": 2e-05,
3235
+ "loss": 1.5052,
3236
+ "step": 461
3237
+ },
3238
+ {
3239
+ "epoch": 0.8286995515695067,
3240
+ "grad_norm": 0.010554889217019081,
3241
+ "learning_rate": 2e-05,
3242
+ "loss": 1.5071,
3243
+ "step": 462
3244
+ },
3245
+ {
3246
+ "epoch": 0.8304932735426009,
3247
+ "grad_norm": 0.009871057234704494,
3248
+ "learning_rate": 2e-05,
3249
+ "loss": 1.5189,
3250
+ "step": 463
3251
+ },
3252
+ {
3253
+ "epoch": 0.8322869955156951,
3254
+ "grad_norm": 0.010431516915559769,
3255
+ "learning_rate": 2e-05,
3256
+ "loss": 1.5183,
3257
+ "step": 464
3258
+ },
3259
+ {
3260
+ "epoch": 0.8340807174887892,
3261
+ "grad_norm": 0.009860005229711533,
3262
+ "learning_rate": 2e-05,
3263
+ "loss": 1.5213,
3264
+ "step": 465
3265
+ },
3266
+ {
3267
+ "epoch": 0.8358744394618834,
3268
+ "grad_norm": 0.010233579203486443,
3269
+ "learning_rate": 2e-05,
3270
+ "loss": 1.5182,
3271
+ "step": 466
3272
+ },
3273
+ {
3274
+ "epoch": 0.8376681614349776,
3275
+ "grad_norm": 0.010311591438949108,
3276
+ "learning_rate": 2e-05,
3277
+ "loss": 1.5092,
3278
+ "step": 467
3279
+ },
3280
+ {
3281
+ "epoch": 0.8394618834080717,
3282
+ "grad_norm": 0.010733729228377342,
3283
+ "learning_rate": 2e-05,
3284
+ "loss": 1.5186,
3285
+ "step": 468
3286
+ },
3287
+ {
3288
+ "epoch": 0.841255605381166,
3289
+ "grad_norm": 0.009951340965926647,
3290
+ "learning_rate": 2e-05,
3291
+ "loss": 1.5097,
3292
+ "step": 469
3293
+ },
3294
+ {
3295
+ "epoch": 0.8430493273542601,
3296
+ "grad_norm": 0.01003777701407671,
3297
+ "learning_rate": 2e-05,
3298
+ "loss": 1.5173,
3299
+ "step": 470
3300
+ },
3301
+ {
3302
+ "epoch": 0.8448430493273542,
3303
+ "grad_norm": 0.009939250536262989,
3304
+ "learning_rate": 2e-05,
3305
+ "loss": 1.5108,
3306
+ "step": 471
3307
+ },
3308
+ {
3309
+ "epoch": 0.8466367713004485,
3310
+ "grad_norm": 0.009835812263190746,
3311
+ "learning_rate": 2e-05,
3312
+ "loss": 1.5272,
3313
+ "step": 472
3314
+ },
3315
+ {
3316
+ "epoch": 0.8484304932735426,
3317
+ "grad_norm": 0.010321546345949173,
3318
+ "learning_rate": 2e-05,
3319
+ "loss": 1.5193,
3320
+ "step": 473
3321
+ },
3322
+ {
3323
+ "epoch": 0.8502242152466367,
3324
+ "grad_norm": 0.01006554439663887,
3325
+ "learning_rate": 2e-05,
3326
+ "loss": 1.5165,
3327
+ "step": 474
3328
+ },
3329
+ {
3330
+ "epoch": 0.852017937219731,
3331
+ "grad_norm": 0.009972809813916683,
3332
+ "learning_rate": 2e-05,
3333
+ "loss": 1.5228,
3334
+ "step": 475
3335
+ },
3336
+ {
3337
+ "epoch": 0.8538116591928251,
3338
+ "grad_norm": 0.010388972237706184,
3339
+ "learning_rate": 2e-05,
3340
+ "loss": 1.5188,
3341
+ "step": 476
3342
+ },
3343
+ {
3344
+ "epoch": 0.8556053811659193,
3345
+ "grad_norm": 0.010111154057085514,
3346
+ "learning_rate": 2e-05,
3347
+ "loss": 1.5199,
3348
+ "step": 477
3349
+ },
3350
+ {
3351
+ "epoch": 0.8573991031390135,
3352
+ "grad_norm": 0.01029327604919672,
3353
+ "learning_rate": 2e-05,
3354
+ "loss": 1.516,
3355
+ "step": 478
3356
+ },
3357
+ {
3358
+ "epoch": 0.8591928251121076,
3359
+ "grad_norm": 0.010400544852018356,
3360
+ "learning_rate": 2e-05,
3361
+ "loss": 1.5218,
3362
+ "step": 479
3363
+ },
3364
+ {
3365
+ "epoch": 0.8609865470852018,
3366
+ "grad_norm": 0.0099885743111372,
3367
+ "learning_rate": 2e-05,
3368
+ "loss": 1.5155,
3369
+ "step": 480
3370
+ },
3371
+ {
3372
+ "epoch": 0.862780269058296,
3373
+ "grad_norm": 0.010007279925048351,
3374
+ "learning_rate": 2e-05,
3375
+ "loss": 1.5205,
3376
+ "step": 481
3377
+ },
3378
+ {
3379
+ "epoch": 0.8645739910313901,
3380
+ "grad_norm": 0.01053563691675663,
3381
+ "learning_rate": 2e-05,
3382
+ "loss": 1.5019,
3383
+ "step": 482
3384
+ },
3385
+ {
3386
+ "epoch": 0.8663677130044843,
3387
+ "grad_norm": 0.01031608134508133,
3388
+ "learning_rate": 2e-05,
3389
+ "loss": 1.5217,
3390
+ "step": 483
3391
+ },
3392
+ {
3393
+ "epoch": 0.8681614349775785,
3394
+ "grad_norm": 0.010082092136144638,
3395
+ "learning_rate": 2e-05,
3396
+ "loss": 1.5073,
3397
+ "step": 484
3398
+ },
3399
+ {
3400
+ "epoch": 0.8699551569506726,
3401
+ "grad_norm": 0.01012254785746336,
3402
+ "learning_rate": 2e-05,
3403
+ "loss": 1.5101,
3404
+ "step": 485
3405
+ },
3406
+ {
3407
+ "epoch": 0.8717488789237668,
3408
+ "grad_norm": 0.010539901442825794,
3409
+ "learning_rate": 2e-05,
3410
+ "loss": 1.5209,
3411
+ "step": 486
3412
+ },
3413
+ {
3414
+ "epoch": 0.873542600896861,
3415
+ "grad_norm": 0.009883386082947254,
3416
+ "learning_rate": 2e-05,
3417
+ "loss": 1.5275,
3418
+ "step": 487
3419
+ },
3420
+ {
3421
+ "epoch": 0.8753363228699551,
3422
+ "grad_norm": 0.010055874474346638,
3423
+ "learning_rate": 2e-05,
3424
+ "loss": 1.521,
3425
+ "step": 488
3426
+ },
3427
+ {
3428
+ "epoch": 0.8771300448430494,
3429
+ "grad_norm": 0.010441599413752556,
3430
+ "learning_rate": 2e-05,
3431
+ "loss": 1.5253,
3432
+ "step": 489
3433
+ },
3434
+ {
3435
+ "epoch": 0.8789237668161435,
3436
+ "grad_norm": 0.010321282781660557,
3437
+ "learning_rate": 2e-05,
3438
+ "loss": 1.5128,
3439
+ "step": 490
3440
+ },
3441
+ {
3442
+ "epoch": 0.8807174887892377,
3443
+ "grad_norm": 0.010404079221189022,
3444
+ "learning_rate": 2e-05,
3445
+ "loss": 1.5216,
3446
+ "step": 491
3447
+ },
3448
+ {
3449
+ "epoch": 0.8825112107623319,
3450
+ "grad_norm": 0.010680857114493847,
3451
+ "learning_rate": 2e-05,
3452
+ "loss": 1.5102,
3453
+ "step": 492
3454
+ },
3455
+ {
3456
+ "epoch": 0.884304932735426,
3457
+ "grad_norm": 0.009785238653421402,
3458
+ "learning_rate": 2e-05,
3459
+ "loss": 1.5152,
3460
+ "step": 493
3461
+ },
3462
+ {
3463
+ "epoch": 0.8860986547085202,
3464
+ "grad_norm": 0.010622934438288212,
3465
+ "learning_rate": 2e-05,
3466
+ "loss": 1.5134,
3467
+ "step": 494
3468
+ },
3469
+ {
3470
+ "epoch": 0.8878923766816144,
3471
+ "grad_norm": 0.009563595987856388,
3472
+ "learning_rate": 2e-05,
3473
+ "loss": 1.5213,
3474
+ "step": 495
3475
+ },
3476
+ {
3477
+ "epoch": 0.8896860986547085,
3478
+ "grad_norm": 0.009900403209030628,
3479
+ "learning_rate": 2e-05,
3480
+ "loss": 1.5254,
3481
+ "step": 496
3482
+ },
3483
+ {
3484
+ "epoch": 0.8914798206278027,
3485
+ "grad_norm": 0.010441206395626068,
3486
+ "learning_rate": 2e-05,
3487
+ "loss": 1.5042,
3488
+ "step": 497
3489
+ },
3490
+ {
3491
+ "epoch": 0.8932735426008969,
3492
+ "grad_norm": 0.010110273025929928,
3493
+ "learning_rate": 2e-05,
3494
+ "loss": 1.5141,
3495
+ "step": 498
3496
+ },
3497
+ {
3498
+ "epoch": 0.895067264573991,
3499
+ "grad_norm": 0.00976527575403452,
3500
+ "learning_rate": 2e-05,
3501
+ "loss": 1.5189,
3502
+ "step": 499
3503
+ },
3504
+ {
3505
+ "epoch": 0.8968609865470852,
3506
+ "grad_norm": 0.010270185768604279,
3507
+ "learning_rate": 2e-05,
3508
+ "loss": 1.5128,
3509
+ "step": 500
3510
+ },
3511
+ {
3512
+ "epoch": 0.8986547085201794,
3513
+ "grad_norm": 0.010477078147232533,
3514
+ "learning_rate": 2e-05,
3515
+ "loss": 1.5331,
3516
+ "step": 501
3517
+ },
3518
+ {
3519
+ "epoch": 0.9004484304932735,
3520
+ "grad_norm": 0.009786723181605339,
3521
+ "learning_rate": 2e-05,
3522
+ "loss": 1.5143,
3523
+ "step": 502
3524
+ },
3525
+ {
3526
+ "epoch": 0.9022421524663677,
3527
+ "grad_norm": 0.009838691912591457,
3528
+ "learning_rate": 2e-05,
3529
+ "loss": 1.5237,
3530
+ "step": 503
3531
+ },
3532
+ {
3533
+ "epoch": 0.9040358744394619,
3534
+ "grad_norm": 0.010305250994861126,
3535
+ "learning_rate": 2e-05,
3536
+ "loss": 1.5236,
3537
+ "step": 504
3538
+ },
3539
+ {
3540
+ "epoch": 0.905829596412556,
3541
+ "grad_norm": 0.010098317638039589,
3542
+ "learning_rate": 2e-05,
3543
+ "loss": 1.5189,
3544
+ "step": 505
3545
+ },
3546
+ {
3547
+ "epoch": 0.9076233183856502,
3548
+ "grad_norm": 0.010335841216146946,
3549
+ "learning_rate": 2e-05,
3550
+ "loss": 1.519,
3551
+ "step": 506
3552
+ },
3553
+ {
3554
+ "epoch": 0.9094170403587444,
3555
+ "grad_norm": 0.009809168055653572,
3556
+ "learning_rate": 2e-05,
3557
+ "loss": 1.5176,
3558
+ "step": 507
3559
+ },
3560
+ {
3561
+ "epoch": 0.9112107623318386,
3562
+ "grad_norm": 0.01069081760942936,
3563
+ "learning_rate": 2e-05,
3564
+ "loss": 1.5055,
3565
+ "step": 508
3566
+ },
3567
+ {
3568
+ "epoch": 0.9130044843049328,
3569
+ "grad_norm": 0.009927291423082352,
3570
+ "learning_rate": 2e-05,
3571
+ "loss": 1.5224,
3572
+ "step": 509
3573
+ },
3574
+ {
3575
+ "epoch": 0.9147982062780269,
3576
+ "grad_norm": 0.010560589842498302,
3577
+ "learning_rate": 2e-05,
3578
+ "loss": 1.5129,
3579
+ "step": 510
3580
+ },
3581
+ {
3582
+ "epoch": 0.9165919282511211,
3583
+ "grad_norm": 0.010154438205063343,
3584
+ "learning_rate": 2e-05,
3585
+ "loss": 1.52,
3586
+ "step": 511
3587
+ },
3588
+ {
3589
+ "epoch": 0.9183856502242153,
3590
+ "grad_norm": 0.010346156544983387,
3591
+ "learning_rate": 2e-05,
3592
+ "loss": 1.5194,
3593
+ "step": 512
3594
+ },
3595
+ {
3596
+ "epoch": 0.9201793721973094,
3597
+ "grad_norm": 0.010523281060159206,
3598
+ "learning_rate": 2e-05,
3599
+ "loss": 1.5187,
3600
+ "step": 513
3601
+ },
3602
+ {
3603
+ "epoch": 0.9219730941704036,
3604
+ "grad_norm": 0.010443002916872501,
3605
+ "learning_rate": 2e-05,
3606
+ "loss": 1.5059,
3607
+ "step": 514
3608
+ },
3609
+ {
3610
+ "epoch": 0.9237668161434978,
3611
+ "grad_norm": 0.010005362331867218,
3612
+ "learning_rate": 2e-05,
3613
+ "loss": 1.5102,
3614
+ "step": 515
3615
+ },
3616
+ {
3617
+ "epoch": 0.9255605381165919,
3618
+ "grad_norm": 0.010285025462508202,
3619
+ "learning_rate": 2e-05,
3620
+ "loss": 1.5217,
3621
+ "step": 516
3622
+ },
3623
+ {
3624
+ "epoch": 0.9273542600896861,
3625
+ "grad_norm": 0.010401098988950253,
3626
+ "learning_rate": 2e-05,
3627
+ "loss": 1.5243,
3628
+ "step": 517
3629
+ },
3630
+ {
3631
+ "epoch": 0.9291479820627803,
3632
+ "grad_norm": 0.010455128736793995,
3633
+ "learning_rate": 2e-05,
3634
+ "loss": 1.5054,
3635
+ "step": 518
3636
+ },
3637
+ {
3638
+ "epoch": 0.9309417040358744,
3639
+ "grad_norm": 0.00987928081303835,
3640
+ "learning_rate": 2e-05,
3641
+ "loss": 1.5053,
3642
+ "step": 519
3643
+ },
3644
+ {
3645
+ "epoch": 0.9327354260089686,
3646
+ "grad_norm": 0.010212692432105541,
3647
+ "learning_rate": 2e-05,
3648
+ "loss": 1.5151,
3649
+ "step": 520
3650
+ },
3651
+ {
3652
+ "epoch": 0.9345291479820628,
3653
+ "grad_norm": 0.010937588289380074,
3654
+ "learning_rate": 2e-05,
3655
+ "loss": 1.5099,
3656
+ "step": 521
3657
+ },
3658
+ {
3659
+ "epoch": 0.9363228699551569,
3660
+ "grad_norm": 0.010248001664876938,
3661
+ "learning_rate": 2e-05,
3662
+ "loss": 1.5256,
3663
+ "step": 522
3664
+ },
3665
+ {
3666
+ "epoch": 0.9381165919282511,
3667
+ "grad_norm": 0.010430903173983097,
3668
+ "learning_rate": 2e-05,
3669
+ "loss": 1.5056,
3670
+ "step": 523
3671
+ },
3672
+ {
3673
+ "epoch": 0.9399103139013453,
3674
+ "grad_norm": 0.0102499695494771,
3675
+ "learning_rate": 2e-05,
3676
+ "loss": 1.5285,
3677
+ "step": 524
3678
+ },
3679
+ {
3680
+ "epoch": 0.9417040358744395,
3681
+ "grad_norm": 0.010674213990569115,
3682
+ "learning_rate": 2e-05,
3683
+ "loss": 1.5136,
3684
+ "step": 525
3685
+ },
3686
+ {
3687
+ "epoch": 0.9434977578475336,
3688
+ "grad_norm": 0.010732615366578102,
3689
+ "learning_rate": 2e-05,
3690
+ "loss": 1.5119,
3691
+ "step": 526
3692
+ },
3693
+ {
3694
+ "epoch": 0.9452914798206278,
3695
+ "grad_norm": 0.009994648396968842,
3696
+ "learning_rate": 2e-05,
3697
+ "loss": 1.5228,
3698
+ "step": 527
3699
+ },
3700
+ {
3701
+ "epoch": 0.947085201793722,
3702
+ "grad_norm": 0.010234368033707142,
3703
+ "learning_rate": 2e-05,
3704
+ "loss": 1.5258,
3705
+ "step": 528
3706
+ },
3707
+ {
3708
+ "epoch": 0.9488789237668162,
3709
+ "grad_norm": 0.010327205993235111,
3710
+ "learning_rate": 2e-05,
3711
+ "loss": 1.5156,
3712
+ "step": 529
3713
+ },
3714
+ {
3715
+ "epoch": 0.9506726457399103,
3716
+ "grad_norm": 0.009836922399699688,
3717
+ "learning_rate": 2e-05,
3718
+ "loss": 1.5171,
3719
+ "step": 530
3720
+ },
3721
+ {
3722
+ "epoch": 0.9524663677130045,
3723
+ "grad_norm": 0.009962068870663643,
3724
+ "learning_rate": 2e-05,
3725
+ "loss": 1.5125,
3726
+ "step": 531
3727
+ },
3728
+ {
3729
+ "epoch": 0.9542600896860987,
3730
+ "grad_norm": 0.010127882473170757,
3731
+ "learning_rate": 2e-05,
3732
+ "loss": 1.5182,
3733
+ "step": 532
3734
+ },
3735
+ {
3736
+ "epoch": 0.9560538116591928,
3737
+ "grad_norm": 0.010251611471176147,
3738
+ "learning_rate": 2e-05,
3739
+ "loss": 1.5139,
3740
+ "step": 533
3741
+ },
3742
+ {
3743
+ "epoch": 0.957847533632287,
3744
+ "grad_norm": 0.010081682354211807,
3745
+ "learning_rate": 2e-05,
3746
+ "loss": 1.5239,
3747
+ "step": 534
3748
+ },
3749
+ {
3750
+ "epoch": 0.9596412556053812,
3751
+ "grad_norm": 0.010235367342829704,
3752
+ "learning_rate": 2e-05,
3753
+ "loss": 1.5159,
3754
+ "step": 535
3755
+ },
3756
+ {
3757
+ "epoch": 0.9614349775784753,
3758
+ "grad_norm": 0.009694702923297882,
3759
+ "learning_rate": 2e-05,
3760
+ "loss": 1.5174,
3761
+ "step": 536
3762
+ },
3763
+ {
3764
+ "epoch": 0.9632286995515695,
3765
+ "grad_norm": 0.010224996134638786,
3766
+ "learning_rate": 2e-05,
3767
+ "loss": 1.5171,
3768
+ "step": 537
3769
+ },
3770
+ {
3771
+ "epoch": 0.9650224215246637,
3772
+ "grad_norm": 0.010206632316112518,
3773
+ "learning_rate": 2e-05,
3774
+ "loss": 1.5223,
3775
+ "step": 538
3776
+ },
3777
+ {
3778
+ "epoch": 0.9668161434977578,
3779
+ "grad_norm": 0.010011864826083183,
3780
+ "learning_rate": 2e-05,
3781
+ "loss": 1.5282,
3782
+ "step": 539
3783
+ },
3784
+ {
3785
+ "epoch": 0.968609865470852,
3786
+ "grad_norm": 0.010364921763539314,
3787
+ "learning_rate": 2e-05,
3788
+ "loss": 1.5092,
3789
+ "step": 540
3790
+ },
3791
+ {
3792
+ "epoch": 0.9704035874439462,
3793
+ "grad_norm": 0.010109508410096169,
3794
+ "learning_rate": 2e-05,
3795
+ "loss": 1.5068,
3796
+ "step": 541
3797
+ },
3798
+ {
3799
+ "epoch": 0.9721973094170404,
3800
+ "grad_norm": 0.00964987464249134,
3801
+ "learning_rate": 2e-05,
3802
+ "loss": 1.5089,
3803
+ "step": 542
3804
+ },
3805
+ {
3806
+ "epoch": 0.9739910313901345,
3807
+ "grad_norm": 0.010244207456707954,
3808
+ "learning_rate": 2e-05,
3809
+ "loss": 1.5217,
3810
+ "step": 543
3811
+ },
3812
+ {
3813
+ "epoch": 0.9757847533632287,
3814
+ "grad_norm": 0.009797874838113785,
3815
+ "learning_rate": 2e-05,
3816
+ "loss": 1.5143,
3817
+ "step": 544
3818
+ },
3819
+ {
3820
+ "epoch": 0.9775784753363229,
3821
+ "grad_norm": 0.010056640952825546,
3822
+ "learning_rate": 2e-05,
3823
+ "loss": 1.5276,
3824
+ "step": 545
3825
+ },
3826
+ {
3827
+ "epoch": 0.979372197309417,
3828
+ "grad_norm": 0.009898710064589977,
3829
+ "learning_rate": 2e-05,
3830
+ "loss": 1.5222,
3831
+ "step": 546
3832
+ },
3833
+ {
3834
+ "epoch": 0.9811659192825112,
3835
+ "grad_norm": 0.0099082225933671,
3836
+ "learning_rate": 2e-05,
3837
+ "loss": 1.5276,
3838
+ "step": 547
3839
+ },
3840
+ {
3841
+ "epoch": 0.9829596412556054,
3842
+ "grad_norm": 0.01018478162586689,
3843
+ "learning_rate": 2e-05,
3844
+ "loss": 1.5217,
3845
+ "step": 548
3846
+ },
3847
+ {
3848
+ "epoch": 0.9847533632286996,
3849
+ "grad_norm": 0.009828625246882439,
3850
+ "learning_rate": 2e-05,
3851
+ "loss": 1.5194,
3852
+ "step": 549
3853
+ },
3854
+ {
3855
+ "epoch": 0.9865470852017937,
3856
+ "grad_norm": 0.010311014950275421,
3857
+ "learning_rate": 2e-05,
3858
+ "loss": 1.5138,
3859
+ "step": 550
3860
+ },
3861
+ {
3862
+ "epoch": 0.9883408071748879,
3863
+ "grad_norm": 0.010840130038559437,
3864
+ "learning_rate": 2e-05,
3865
+ "loss": 1.5044,
3866
+ "step": 551
3867
+ },
3868
+ {
3869
+ "epoch": 0.9901345291479821,
3870
+ "grad_norm": 0.009595104493200779,
3871
+ "learning_rate": 2e-05,
3872
+ "loss": 1.5165,
3873
+ "step": 552
3874
+ },
3875
+ {
3876
+ "epoch": 0.9919282511210762,
3877
+ "grad_norm": 0.01027593482285738,
3878
+ "learning_rate": 2e-05,
3879
+ "loss": 1.5291,
3880
+ "step": 553
3881
+ },
3882
+ {
3883
+ "epoch": 0.9937219730941704,
3884
+ "grad_norm": 0.010394555516541004,
3885
+ "learning_rate": 2e-05,
3886
+ "loss": 1.5109,
3887
+ "step": 554
3888
+ },
3889
+ {
3890
+ "epoch": 0.9955156950672646,
3891
+ "grad_norm": 0.00996735692024231,
3892
+ "learning_rate": 2e-05,
3893
+ "loss": 1.5212,
3894
+ "step": 555
3895
+ },
3896
+ {
3897
+ "epoch": 0.9973094170403587,
3898
+ "grad_norm": 0.010095257312059402,
3899
+ "learning_rate": 2e-05,
3900
+ "loss": 1.5106,
3901
+ "step": 556
3902
+ },
3903
+ {
3904
+ "epoch": 0.9991031390134529,
3905
+ "grad_norm": 0.01082176435738802,
3906
+ "learning_rate": 2e-05,
3907
+ "loss": 1.5099,
3908
+ "step": 557
3909
+ },
3910
+ {
3911
+ "epoch": 0.9991031390134529,
3912
+ "step": 557,
3913
+ "total_flos": 7841700554735616.0,
3914
+ "train_loss": 0.4275342236729456,
3915
+ "train_runtime": 63728.3403,
3916
+ "train_samples_per_second": 2.239,
3917
+ "train_steps_per_second": 0.009
3918
  }
3919
  ],
3920
  "logging_steps": 1,
3921
+ "max_steps": 557,
3922
  "num_input_tokens_seen": 0,
3923
  "num_train_epochs": 1,
3924
  "save_steps": 100,
 
3934
  "attributes": {}
3935
  }
3936
  },
3937
+ "total_flos": 7841700554735616.0,
3938
  "train_batch_size": 2,
3939
  "trial_name": null,
3940
  "trial_params": null
training_loss.png CHANGED