ncbateman commited on
Commit
04a37b0
·
verified ·
1 Parent(s): 3b05342

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1cab7e9a8a987404549f49afdd2936be09b05aa346de6bd994dcd584e6c7c99
3
  size 35237104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6595bf55f02395c5cbd4666c9cb95f98302cd9ef023ebeb628cdc4cf4bf4caee
3
  size 35237104
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a3af5b950a75d8130ae2831e845bdb2d9c0f2568b854448eb1a41e6ca5caf699
3
  size 18810356
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4e6e0de1bcf5bda93ea3307cefce39a83cb8a6899981f858868c87622dba1fd
3
  size 18810356
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:31fd0ef2088134d693702f76cf93ec3d3456380164b3e8cc27330c341fd530f6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7b6fd1c9514da6d7da36a4b4159526160fde1963c15f9d5ba39d98a761f42c0
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0428512ada8c2471b2f37ecbdd4efa5f13e3ba0e777fddbfec0396eebc36c01a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4077036d99500a708f700f75da24d51b5300e184ad35fda49dc5a4df5596cca2
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.32617559119325906,
5
  "eval_steps": 250,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6339,6 +6339,714 @@
6339
  "learning_rate": 2.7091379149682685e-06,
6340
  "loss": 2.5194,
6341
  "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6342
  }
6343
  ],
6344
  "logging_steps": 1,
@@ -6353,12 +7061,12 @@
6353
  "should_evaluate": false,
6354
  "should_log": false,
6355
  "should_save": true,
6356
- "should_training_stop": false
6357
  },
6358
  "attributes": {}
6359
  }
6360
  },
6361
- "total_flos": 6.4885887074304e+16,
6362
  "train_batch_size": 2,
6363
  "trial_name": null,
6364
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.3624173235480656,
5
  "eval_steps": 250,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6339
  "learning_rate": 2.7091379149682685e-06,
6340
  "loss": 2.5194,
6341
  "step": 900
6342
+ },
6343
+ {
6344
+ "epoch": 0.3265380085168071,
6345
+ "grad_norm": 6.8190178871154785,
6346
+ "learning_rate": 2.6557085182532582e-06,
6347
+ "loss": 2.4827,
6348
+ "step": 901
6349
+ },
6350
+ {
6351
+ "epoch": 0.32690042584035517,
6352
+ "grad_norm": 8.026456832885742,
6353
+ "learning_rate": 2.602796871124663e-06,
6354
+ "loss": 3.6636,
6355
+ "step": 902
6356
+ },
6357
+ {
6358
+ "epoch": 0.32726284316390325,
6359
+ "grad_norm": 6.956340789794922,
6360
+ "learning_rate": 2.5504035522157854e-06,
6361
+ "loss": 2.8476,
6362
+ "step": 903
6363
+ },
6364
+ {
6365
+ "epoch": 0.3276252604874513,
6366
+ "grad_norm": 5.77375602722168,
6367
+ "learning_rate": 2.4985291344915674e-06,
6368
+ "loss": 2.6426,
6369
+ "step": 904
6370
+ },
6371
+ {
6372
+ "epoch": 0.32798767781099936,
6373
+ "grad_norm": 6.200889587402344,
6374
+ "learning_rate": 2.4471741852423237e-06,
6375
+ "loss": 3.4329,
6376
+ "step": 905
6377
+ },
6378
+ {
6379
+ "epoch": 0.32835009513454744,
6380
+ "grad_norm": 6.780580043792725,
6381
+ "learning_rate": 2.3963392660775575e-06,
6382
+ "loss": 3.2439,
6383
+ "step": 906
6384
+ },
6385
+ {
6386
+ "epoch": 0.3287125124580955,
6387
+ "grad_norm": 6.5239644050598145,
6388
+ "learning_rate": 2.3460249329197824e-06,
6389
+ "loss": 2.5931,
6390
+ "step": 907
6391
+ },
6392
+ {
6393
+ "epoch": 0.32907492978164354,
6394
+ "grad_norm": 7.130517482757568,
6395
+ "learning_rate": 2.296231735998511e-06,
6396
+ "loss": 2.6862,
6397
+ "step": 908
6398
+ },
6399
+ {
6400
+ "epoch": 0.3294373471051916,
6401
+ "grad_norm": 6.246675968170166,
6402
+ "learning_rate": 2.2469602198441573e-06,
6403
+ "loss": 2.8948,
6404
+ "step": 909
6405
+ },
6406
+ {
6407
+ "epoch": 0.3297997644287397,
6408
+ "grad_norm": 7.630551338195801,
6409
+ "learning_rate": 2.1982109232821178e-06,
6410
+ "loss": 2.615,
6411
+ "step": 910
6412
+ },
6413
+ {
6414
+ "epoch": 0.3301621817522878,
6415
+ "grad_norm": 4.334846019744873,
6416
+ "learning_rate": 2.149984379426906e-06,
6417
+ "loss": 2.2025,
6418
+ "step": 911
6419
+ },
6420
+ {
6421
+ "epoch": 0.3305245990758358,
6422
+ "grad_norm": 6.460603713989258,
6423
+ "learning_rate": 2.102281115676258e-06,
6424
+ "loss": 2.6511,
6425
+ "step": 912
6426
+ },
6427
+ {
6428
+ "epoch": 0.3308870163993839,
6429
+ "grad_norm": 7.5470991134643555,
6430
+ "learning_rate": 2.0551016537054493e-06,
6431
+ "loss": 2.6114,
6432
+ "step": 913
6433
+ },
6434
+ {
6435
+ "epoch": 0.331249433722932,
6436
+ "grad_norm": 6.667960166931152,
6437
+ "learning_rate": 2.008446509461498e-06,
6438
+ "loss": 2.8898,
6439
+ "step": 914
6440
+ },
6441
+ {
6442
+ "epoch": 0.33161185104648,
6443
+ "grad_norm": 7.1564836502075195,
6444
+ "learning_rate": 1.962316193157593e-06,
6445
+ "loss": 2.7416,
6446
+ "step": 915
6447
+ },
6448
+ {
6449
+ "epoch": 0.3319742683700281,
6450
+ "grad_norm": 6.149023056030273,
6451
+ "learning_rate": 1.91671120926748e-06,
6452
+ "loss": 2.5522,
6453
+ "step": 916
6454
+ },
6455
+ {
6456
+ "epoch": 0.33233668569357616,
6457
+ "grad_norm": 8.589316368103027,
6458
+ "learning_rate": 1.8716320565199618e-06,
6459
+ "loss": 1.8536,
6460
+ "step": 917
6461
+ },
6462
+ {
6463
+ "epoch": 0.33269910301712424,
6464
+ "grad_norm": 5.793923377990723,
6465
+ "learning_rate": 1.8270792278934302e-06,
6466
+ "loss": 2.9145,
6467
+ "step": 918
6468
+ },
6469
+ {
6470
+ "epoch": 0.33306152034067227,
6471
+ "grad_norm": 6.867883682250977,
6472
+ "learning_rate": 1.7830532106104747e-06,
6473
+ "loss": 3.1564,
6474
+ "step": 919
6475
+ },
6476
+ {
6477
+ "epoch": 0.33342393766422035,
6478
+ "grad_norm": 5.381682872772217,
6479
+ "learning_rate": 1.7395544861325718e-06,
6480
+ "loss": 2.5914,
6481
+ "step": 920
6482
+ },
6483
+ {
6484
+ "epoch": 0.33378635498776843,
6485
+ "grad_norm": 6.665421009063721,
6486
+ "learning_rate": 1.696583530154794e-06,
6487
+ "loss": 2.2271,
6488
+ "step": 921
6489
+ },
6490
+ {
6491
+ "epoch": 0.33414877231131646,
6492
+ "grad_norm": 9.370551109313965,
6493
+ "learning_rate": 1.6541408126006463e-06,
6494
+ "loss": 2.8091,
6495
+ "step": 922
6496
+ },
6497
+ {
6498
+ "epoch": 0.33451118963486454,
6499
+ "grad_norm": 7.959995746612549,
6500
+ "learning_rate": 1.6122267976168781e-06,
6501
+ "loss": 3.6784,
6502
+ "step": 923
6503
+ },
6504
+ {
6505
+ "epoch": 0.3348736069584126,
6506
+ "grad_norm": 7.0754475593566895,
6507
+ "learning_rate": 1.5708419435684462e-06,
6508
+ "loss": 2.9994,
6509
+ "step": 924
6510
+ },
6511
+ {
6512
+ "epoch": 0.3352360242819607,
6513
+ "grad_norm": 6.9350361824035645,
6514
+ "learning_rate": 1.5299867030334814e-06,
6515
+ "loss": 1.8455,
6516
+ "step": 925
6517
+ },
6518
+ {
6519
+ "epoch": 0.3355984416055087,
6520
+ "grad_norm": 6.33629846572876,
6521
+ "learning_rate": 1.4896615227983468e-06,
6522
+ "loss": 2.2597,
6523
+ "step": 926
6524
+ },
6525
+ {
6526
+ "epoch": 0.3359608589290568,
6527
+ "grad_norm": 6.027073383331299,
6528
+ "learning_rate": 1.4498668438527597e-06,
6529
+ "loss": 2.548,
6530
+ "step": 927
6531
+ },
6532
+ {
6533
+ "epoch": 0.3363232762526049,
6534
+ "grad_norm": 6.1353044509887695,
6535
+ "learning_rate": 1.4106031013849496e-06,
6536
+ "loss": 2.6825,
6537
+ "step": 928
6538
+ },
6539
+ {
6540
+ "epoch": 0.33668569357615297,
6541
+ "grad_norm": 4.776915550231934,
6542
+ "learning_rate": 1.3718707247769135e-06,
6543
+ "loss": 3.3943,
6544
+ "step": 929
6545
+ },
6546
+ {
6547
+ "epoch": 0.337048110899701,
6548
+ "grad_norm": 7.03324031829834,
6549
+ "learning_rate": 1.333670137599713e-06,
6550
+ "loss": 2.7431,
6551
+ "step": 930
6552
+ },
6553
+ {
6554
+ "epoch": 0.3374105282232491,
6555
+ "grad_norm": 7.59218692779541,
6556
+ "learning_rate": 1.2960017576088446e-06,
6557
+ "loss": 2.9679,
6558
+ "step": 931
6559
+ },
6560
+ {
6561
+ "epoch": 0.33777294554679715,
6562
+ "grad_norm": 6.613968372344971,
6563
+ "learning_rate": 1.2588659967397e-06,
6564
+ "loss": 2.2011,
6565
+ "step": 932
6566
+ },
6567
+ {
6568
+ "epoch": 0.3381353628703452,
6569
+ "grad_norm": 5.920099258422852,
6570
+ "learning_rate": 1.222263261102985e-06,
6571
+ "loss": 2.3399,
6572
+ "step": 933
6573
+ },
6574
+ {
6575
+ "epoch": 0.33849778019389326,
6576
+ "grad_norm": 9.415234565734863,
6577
+ "learning_rate": 1.1861939509803687e-06,
6578
+ "loss": 2.8695,
6579
+ "step": 934
6580
+ },
6581
+ {
6582
+ "epoch": 0.33886019751744134,
6583
+ "grad_norm": 5.54183292388916,
6584
+ "learning_rate": 1.1506584608200367e-06,
6585
+ "loss": 2.8506,
6586
+ "step": 935
6587
+ },
6588
+ {
6589
+ "epoch": 0.3392226148409894,
6590
+ "grad_norm": 6.957115650177002,
6591
+ "learning_rate": 1.1156571792324211e-06,
6592
+ "loss": 2.3371,
6593
+ "step": 936
6594
+ },
6595
+ {
6596
+ "epoch": 0.33958503216453745,
6597
+ "grad_norm": 4.749189376831055,
6598
+ "learning_rate": 1.0811904889859336e-06,
6599
+ "loss": 2.3668,
6600
+ "step": 937
6601
+ },
6602
+ {
6603
+ "epoch": 0.33994744948808553,
6604
+ "grad_norm": 7.631181716918945,
6605
+ "learning_rate": 1.0472587670027678e-06,
6606
+ "loss": 3.3614,
6607
+ "step": 938
6608
+ },
6609
+ {
6610
+ "epoch": 0.3403098668116336,
6611
+ "grad_norm": 5.728475570678711,
6612
+ "learning_rate": 1.0138623843548078e-06,
6613
+ "loss": 2.0496,
6614
+ "step": 939
6615
+ },
6616
+ {
6617
+ "epoch": 0.34067228413518164,
6618
+ "grad_norm": 6.60684871673584,
6619
+ "learning_rate": 9.810017062595322e-07,
6620
+ "loss": 2.7532,
6621
+ "step": 940
6622
+ },
6623
+ {
6624
+ "epoch": 0.3410347014587297,
6625
+ "grad_norm": 6.326074123382568,
6626
+ "learning_rate": 9.486770920760668e-07,
6627
+ "loss": 2.5077,
6628
+ "step": 941
6629
+ },
6630
+ {
6631
+ "epoch": 0.3413971187822778,
6632
+ "grad_norm": 7.019238471984863,
6633
+ "learning_rate": 9.168888953011989e-07,
6634
+ "loss": 2.0309,
6635
+ "step": 942
6636
+ },
6637
+ {
6638
+ "epoch": 0.3417595361058259,
6639
+ "grad_norm": 8.484062194824219,
6640
+ "learning_rate": 8.856374635655695e-07,
6641
+ "loss": 3.3332,
6642
+ "step": 943
6643
+ },
6644
+ {
6645
+ "epoch": 0.3421219534293739,
6646
+ "grad_norm": 7.0083842277526855,
6647
+ "learning_rate": 8.549231386298151e-07,
6648
+ "loss": 2.9812,
6649
+ "step": 944
6650
+ },
6651
+ {
6652
+ "epoch": 0.342484370752922,
6653
+ "grad_norm": 6.267253875732422,
6654
+ "learning_rate": 8.247462563808817e-07,
6655
+ "loss": 3.0255,
6656
+ "step": 945
6657
+ },
6658
+ {
6659
+ "epoch": 0.34284678807647007,
6660
+ "grad_norm": 8.533939361572266,
6661
+ "learning_rate": 7.951071468283167e-07,
6662
+ "loss": 3.2914,
6663
+ "step": 946
6664
+ },
6665
+ {
6666
+ "epoch": 0.34320920540001815,
6667
+ "grad_norm": 6.0240373611450195,
6668
+ "learning_rate": 7.66006134100672e-07,
6669
+ "loss": 3.0139,
6670
+ "step": 947
6671
+ },
6672
+ {
6673
+ "epoch": 0.3435716227235662,
6674
+ "grad_norm": 6.486016750335693,
6675
+ "learning_rate": 7.374435364419674e-07,
6676
+ "loss": 2.9023,
6677
+ "step": 948
6678
+ },
6679
+ {
6680
+ "epoch": 0.34393404004711425,
6681
+ "grad_norm": 8.553296089172363,
6682
+ "learning_rate": 7.094196662081831e-07,
6683
+ "loss": 2.9581,
6684
+ "step": 949
6685
+ },
6686
+ {
6687
+ "epoch": 0.34429645737066233,
6688
+ "grad_norm": 6.792461395263672,
6689
+ "learning_rate": 6.819348298638839e-07,
6690
+ "loss": 3.0948,
6691
+ "step": 950
6692
+ },
6693
+ {
6694
+ "epoch": 0.34465887469421036,
6695
+ "grad_norm": 6.591213703155518,
6696
+ "learning_rate": 6.549893279788277e-07,
6697
+ "loss": 2.7468,
6698
+ "step": 951
6699
+ },
6700
+ {
6701
+ "epoch": 0.34502129201775844,
6702
+ "grad_norm": 7.372961044311523,
6703
+ "learning_rate": 6.285834552247128e-07,
6704
+ "loss": 2.6526,
6705
+ "step": 952
6706
+ },
6707
+ {
6708
+ "epoch": 0.3453837093413065,
6709
+ "grad_norm": 5.669467926025391,
6710
+ "learning_rate": 6.027175003719354e-07,
6711
+ "loss": 2.7276,
6712
+ "step": 953
6713
+ },
6714
+ {
6715
+ "epoch": 0.3457461266648546,
6716
+ "grad_norm": 6.340987682342529,
6717
+ "learning_rate": 5.773917462864264e-07,
6718
+ "loss": 2.5295,
6719
+ "step": 954
6720
+ },
6721
+ {
6722
+ "epoch": 0.34610854398840263,
6723
+ "grad_norm": 6.276302814483643,
6724
+ "learning_rate": 5.526064699265753e-07,
6725
+ "loss": 3.6131,
6726
+ "step": 955
6727
+ },
6728
+ {
6729
+ "epoch": 0.3464709613119507,
6730
+ "grad_norm": 5.535942077636719,
6731
+ "learning_rate": 5.283619423401998e-07,
6732
+ "loss": 2.4283,
6733
+ "step": 956
6734
+ },
6735
+ {
6736
+ "epoch": 0.3468333786354988,
6737
+ "grad_norm": 7.801699638366699,
6738
+ "learning_rate": 5.046584286615697e-07,
6739
+ "loss": 2.8357,
6740
+ "step": 957
6741
+ },
6742
+ {
6743
+ "epoch": 0.3471957959590468,
6744
+ "grad_norm": 8.31689167022705,
6745
+ "learning_rate": 4.814961881085045e-07,
6746
+ "loss": 3.3685,
6747
+ "step": 958
6748
+ },
6749
+ {
6750
+ "epoch": 0.3475582132825949,
6751
+ "grad_norm": 6.9537577629089355,
6752
+ "learning_rate": 4.5887547397955864e-07,
6753
+ "loss": 3.3186,
6754
+ "step": 959
6755
+ },
6756
+ {
6757
+ "epoch": 0.347920630606143,
6758
+ "grad_norm": 6.808418273925781,
6759
+ "learning_rate": 4.367965336512403e-07,
6760
+ "loss": 2.6928,
6761
+ "step": 960
6762
+ },
6763
+ {
6764
+ "epoch": 0.34828304792969106,
6765
+ "grad_norm": 5.8220086097717285,
6766
+ "learning_rate": 4.1525960857530243e-07,
6767
+ "loss": 2.7677,
6768
+ "step": 961
6769
+ },
6770
+ {
6771
+ "epoch": 0.3486454652532391,
6772
+ "grad_norm": 5.832417011260986,
6773
+ "learning_rate": 3.9426493427611177e-07,
6774
+ "loss": 2.819,
6775
+ "step": 962
6776
+ },
6777
+ {
6778
+ "epoch": 0.34900788257678717,
6779
+ "grad_norm": 7.183162689208984,
6780
+ "learning_rate": 3.738127403480507e-07,
6781
+ "loss": 2.7156,
6782
+ "step": 963
6783
+ },
6784
+ {
6785
+ "epoch": 0.34937029990033525,
6786
+ "grad_norm": 7.9874677658081055,
6787
+ "learning_rate": 3.5390325045304706e-07,
6788
+ "loss": 3.0892,
6789
+ "step": 964
6790
+ },
6791
+ {
6792
+ "epoch": 0.3497327172238833,
6793
+ "grad_norm": 7.457160949707031,
6794
+ "learning_rate": 3.3453668231809286e-07,
6795
+ "loss": 3.4502,
6796
+ "step": 965
6797
+ },
6798
+ {
6799
+ "epoch": 0.35009513454743135,
6800
+ "grad_norm": 7.874043941497803,
6801
+ "learning_rate": 3.157132477328628e-07,
6802
+ "loss": 2.7757,
6803
+ "step": 966
6804
+ },
6805
+ {
6806
+ "epoch": 0.35045755187097943,
6807
+ "grad_norm": 7.999273777008057,
6808
+ "learning_rate": 2.9743315254743833e-07,
6809
+ "loss": 3.0866,
6810
+ "step": 967
6811
+ },
6812
+ {
6813
+ "epoch": 0.3508199691945275,
6814
+ "grad_norm": 8.767258644104004,
6815
+ "learning_rate": 2.796965966699927e-07,
6816
+ "loss": 2.5532,
6817
+ "step": 968
6818
+ },
6819
+ {
6820
+ "epoch": 0.35118238651807554,
6821
+ "grad_norm": 6.00533390045166,
6822
+ "learning_rate": 2.625037740646763e-07,
6823
+ "loss": 2.9059,
6824
+ "step": 969
6825
+ },
6826
+ {
6827
+ "epoch": 0.3515448038416236,
6828
+ "grad_norm": 5.833471298217773,
6829
+ "learning_rate": 2.458548727494292e-07,
6830
+ "loss": 3.3388,
6831
+ "step": 970
6832
+ },
6833
+ {
6834
+ "epoch": 0.3519072211651717,
6835
+ "grad_norm": 6.837587833404541,
6836
+ "learning_rate": 2.2975007479397738e-07,
6837
+ "loss": 2.9694,
6838
+ "step": 971
6839
+ },
6840
+ {
6841
+ "epoch": 0.3522696384887198,
6842
+ "grad_norm": 7.076719760894775,
6843
+ "learning_rate": 2.1418955631781202e-07,
6844
+ "loss": 2.2279,
6845
+ "step": 972
6846
+ },
6847
+ {
6848
+ "epoch": 0.3526320558122678,
6849
+ "grad_norm": 5.842435359954834,
6850
+ "learning_rate": 1.9917348748826335e-07,
6851
+ "loss": 2.7922,
6852
+ "step": 973
6853
+ },
6854
+ {
6855
+ "epoch": 0.3529944731358159,
6856
+ "grad_norm": 8.36257266998291,
6857
+ "learning_rate": 1.847020325186577e-07,
6858
+ "loss": 3.7132,
6859
+ "step": 974
6860
+ },
6861
+ {
6862
+ "epoch": 0.35335689045936397,
6863
+ "grad_norm": 7.264481067657471,
6864
+ "learning_rate": 1.7077534966650766e-07,
6865
+ "loss": 2.8945,
6866
+ "step": 975
6867
+ },
6868
+ {
6869
+ "epoch": 0.353719307782912,
6870
+ "grad_norm": 5.050163269042969,
6871
+ "learning_rate": 1.5739359123178587e-07,
6872
+ "loss": 2.8021,
6873
+ "step": 976
6874
+ },
6875
+ {
6876
+ "epoch": 0.3540817251064601,
6877
+ "grad_norm": 6.886010646820068,
6878
+ "learning_rate": 1.4455690355525964e-07,
6879
+ "loss": 3.1517,
6880
+ "step": 977
6881
+ },
6882
+ {
6883
+ "epoch": 0.35444414243000816,
6884
+ "grad_norm": 5.721440315246582,
6885
+ "learning_rate": 1.3226542701689215e-07,
6886
+ "loss": 2.6086,
6887
+ "step": 978
6888
+ },
6889
+ {
6890
+ "epoch": 0.35480655975355624,
6891
+ "grad_norm": 5.630771160125732,
6892
+ "learning_rate": 1.2051929603428825e-07,
6893
+ "loss": 2.9555,
6894
+ "step": 979
6895
+ },
6896
+ {
6897
+ "epoch": 0.35516897707710426,
6898
+ "grad_norm": 6.419443130493164,
6899
+ "learning_rate": 1.0931863906127327e-07,
6900
+ "loss": 2.4218,
6901
+ "step": 980
6902
+ },
6903
+ {
6904
+ "epoch": 0.35553139440065235,
6905
+ "grad_norm": 8.198994636535645,
6906
+ "learning_rate": 9.866357858642205e-08,
6907
+ "loss": 2.5117,
6908
+ "step": 981
6909
+ },
6910
+ {
6911
+ "epoch": 0.3558938117242004,
6912
+ "grad_norm": 6.984527587890625,
6913
+ "learning_rate": 8.855423113177664e-08,
6914
+ "loss": 2.5872,
6915
+ "step": 982
6916
+ },
6917
+ {
6918
+ "epoch": 0.3562562290477485,
6919
+ "grad_norm": 6.198144435882568,
6920
+ "learning_rate": 7.899070725153613e-08,
6921
+ "loss": 2.5865,
6922
+ "step": 983
6923
+ },
6924
+ {
6925
+ "epoch": 0.35661864637129653,
6926
+ "grad_norm": 5.657758712768555,
6927
+ "learning_rate": 6.997311153086883e-08,
6928
+ "loss": 2.7767,
6929
+ "step": 984
6930
+ },
6931
+ {
6932
+ "epoch": 0.3569810636948446,
6933
+ "grad_norm": 8.617463111877441,
6934
+ "learning_rate": 6.150154258476315e-08,
6935
+ "loss": 2.5342,
6936
+ "step": 985
6937
+ },
6938
+ {
6939
+ "epoch": 0.3573434810183927,
6940
+ "grad_norm": 6.265313625335693,
6941
+ "learning_rate": 5.3576093056922906e-08,
6942
+ "loss": 1.9735,
6943
+ "step": 986
6944
+ },
6945
+ {
6946
+ "epoch": 0.3577058983419407,
6947
+ "grad_norm": 4.9574875831604,
6948
+ "learning_rate": 4.619684961881254e-08,
6949
+ "loss": 1.8526,
6950
+ "step": 987
6951
+ },
6952
+ {
6953
+ "epoch": 0.3580683156654888,
6954
+ "grad_norm": 6.38330602645874,
6955
+ "learning_rate": 3.936389296864129e-08,
6956
+ "loss": 3.3058,
6957
+ "step": 988
6958
+ },
6959
+ {
6960
+ "epoch": 0.3584307329890369,
6961
+ "grad_norm": 6.539586067199707,
6962
+ "learning_rate": 3.3077297830541584e-08,
6963
+ "loss": 3.1111,
6964
+ "step": 989
6965
+ },
6966
+ {
6967
+ "epoch": 0.35879315031258496,
6968
+ "grad_norm": 6.351942539215088,
6969
+ "learning_rate": 2.7337132953697554e-08,
6970
+ "loss": 2.7353,
6971
+ "step": 990
6972
+ },
6973
+ {
6974
+ "epoch": 0.359155567636133,
6975
+ "grad_norm": 8.30229377746582,
6976
+ "learning_rate": 2.214346111164556e-08,
6977
+ "loss": 2.9746,
6978
+ "step": 991
6979
+ },
6980
+ {
6981
+ "epoch": 0.35951798495968107,
6982
+ "grad_norm": 6.088867664337158,
6983
+ "learning_rate": 1.749633910153592e-08,
6984
+ "loss": 2.5657,
6985
+ "step": 992
6986
+ },
6987
+ {
6988
+ "epoch": 0.35988040228322915,
6989
+ "grad_norm": 5.72982931137085,
6990
+ "learning_rate": 1.3395817743561134e-08,
6991
+ "loss": 2.762,
6992
+ "step": 993
6993
+ },
6994
+ {
6995
+ "epoch": 0.36024281960677723,
6996
+ "grad_norm": 7.2047576904296875,
6997
+ "learning_rate": 9.841941880361916e-09,
6998
+ "loss": 2.6464,
6999
+ "step": 994
7000
+ },
7001
+ {
7002
+ "epoch": 0.36060523693032526,
7003
+ "grad_norm": 4.944203853607178,
7004
+ "learning_rate": 6.834750376549792e-09,
7005
+ "loss": 2.7518,
7006
+ "step": 995
7007
+ },
7008
+ {
7009
+ "epoch": 0.36096765425387334,
7010
+ "grad_norm": 6.469346523284912,
7011
+ "learning_rate": 4.3742761183018784e-09,
7012
+ "loss": 3.3189,
7013
+ "step": 996
7014
+ },
7015
+ {
7016
+ "epoch": 0.3613300715774214,
7017
+ "grad_norm": 5.523458480834961,
7018
+ "learning_rate": 2.4605460129556445e-09,
7019
+ "loss": 2.5312,
7020
+ "step": 997
7021
+ },
7022
+ {
7023
+ "epoch": 0.36169248890096944,
7024
+ "grad_norm": 6.366145133972168,
7025
+ "learning_rate": 1.0935809887702154e-09,
7026
+ "loss": 2.7652,
7027
+ "step": 998
7028
+ },
7029
+ {
7030
+ "epoch": 0.3620549062245175,
7031
+ "grad_norm": 6.140265941619873,
7032
+ "learning_rate": 2.7339599464326627e-10,
7033
+ "loss": 2.759,
7034
+ "step": 999
7035
+ },
7036
+ {
7037
+ "epoch": 0.3624173235480656,
7038
+ "grad_norm": 6.8456130027771,
7039
+ "learning_rate": 0.0,
7040
+ "loss": 2.8016,
7041
+ "step": 1000
7042
+ },
7043
+ {
7044
+ "epoch": 0.3624173235480656,
7045
+ "eval_loss": 2.871779203414917,
7046
+ "eval_runtime": 179.6734,
7047
+ "eval_samples_per_second": 6.467,
7048
+ "eval_steps_per_second": 3.234,
7049
+ "step": 1000
7050
  }
7051
  ],
7052
  "logging_steps": 1,
 
7061
  "should_evaluate": false,
7062
  "should_log": false,
7063
  "should_save": true,
7064
+ "should_training_stop": true
7065
  },
7066
  "attributes": {}
7067
  }
7068
  },
7069
+ "total_flos": 7.209543008256e+16,
7070
  "train_batch_size": 2,
7071
  "trial_name": null,
7072
  "trial_params": null