romainnn committed on
Commit
086945d
·
verified ·
1 Parent(s): a6df4c7

Training in progress, step 966, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:022acefac6286471ee8d2c71c3690fd8a565a014ef058c4e3af4f78951dbd0c6
3
  size 159967880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48b44ffedd3bcc2378fb4b6ff819562642a49efef4ae3d053c11cf98092a991c
3
  size 159967880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6815cc9faa2ed01fbe0f7494c50e2dad1e8d76d810e7a58f8875d331c5a18757
3
  size 81730644
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c65324517999537d1a713978c190754cee80df1b8d07519564e3bdbbe6f6f00
3
  size 81730644
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a51e93265e2c6a6f43a552e36ccd1c9901c5296dbb2ff8ac30a43fe5205e60a5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:901b58ab73b39325bfea8aed8a9f472d920fed35a9e5a296018097c13d84b1ca
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f7ccd40217515b1adb168de499560d4c6c803e2bb562e36a2dfe417244db986
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:823c6cdea481d02c5473910d781463e3c449c8632b36bf92b3536a92203bd40d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.723136305809021,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-900",
4
- "epoch": 0.06268500783562599,
5
  "eval_steps": 100,
6
- "global_step": 900,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -6387,6 +6387,468 @@
6387
  "eval_samples_per_second": 7.139,
6388
  "eval_steps_per_second": 1.785,
6389
  "step": 900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6390
  }
6391
  ],
6392
  "logging_steps": 1,
@@ -6410,12 +6872,12 @@
6410
  "should_evaluate": false,
6411
  "should_log": false,
6412
  "should_save": true,
6413
- "should_training_stop": false
6414
  },
6415
  "attributes": {}
6416
  }
6417
  },
6418
- "total_flos": 2.448712204661293e+18,
6419
  "train_batch_size": 4,
6420
  "trial_name": null,
6421
  "trial_params": null
 
1
  {
2
  "best_metric": 0.723136305809021,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-900",
4
+ "epoch": 0.06728190841023855,
5
  "eval_steps": 100,
6
+ "global_step": 966,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
6387
  "eval_samples_per_second": 7.139,
6388
  "eval_steps_per_second": 1.785,
6389
  "step": 900
6390
+ },
6391
+ {
6392
+ "epoch": 0.06275465784433223,
6393
+ "grad_norm": 0.7109830379486084,
6394
+ "learning_rate": 2.272630321321023e-06,
6395
+ "loss": 0.704,
6396
+ "step": 901
6397
+ },
6398
+ {
6399
+ "epoch": 0.06282430785303848,
6400
+ "grad_norm": 0.4886980950832367,
6401
+ "learning_rate": 2.20349711463943e-06,
6402
+ "loss": 0.4915,
6403
+ "step": 902
6404
+ },
6405
+ {
6406
+ "epoch": 0.06289395786174473,
6407
+ "grad_norm": 0.6534592509269714,
6408
+ "learning_rate": 2.135420012462619e-06,
6409
+ "loss": 0.6073,
6410
+ "step": 903
6411
+ },
6412
+ {
6413
+ "epoch": 0.06296360787045098,
6414
+ "grad_norm": 0.5471417903900146,
6415
+ "learning_rate": 2.0683997499552632e-06,
6416
+ "loss": 0.6319,
6417
+ "step": 904
6418
+ },
6419
+ {
6420
+ "epoch": 0.06303325787915723,
6421
+ "grad_norm": 0.765691876411438,
6422
+ "learning_rate": 2.0024370508692104e-06,
6423
+ "loss": 0.9544,
6424
+ "step": 905
6425
+ },
6426
+ {
6427
+ "epoch": 0.06310290788786349,
6428
+ "grad_norm": 0.6834742426872253,
6429
+ "learning_rate": 1.9375326275357208e-06,
6430
+ "loss": 0.8162,
6431
+ "step": 906
6432
+ },
6433
+ {
6434
+ "epoch": 0.06317255789656974,
6435
+ "grad_norm": 0.7233893871307373,
6436
+ "learning_rate": 1.8736871808576861e-06,
6437
+ "loss": 1.0311,
6438
+ "step": 907
6439
+ },
6440
+ {
6441
+ "epoch": 0.06324220790527599,
6442
+ "grad_norm": 0.6150738000869751,
6443
+ "learning_rate": 1.8109014003021452e-06,
6444
+ "loss": 0.9241,
6445
+ "step": 908
6446
+ },
6447
+ {
6448
+ "epoch": 0.06331185791398224,
6449
+ "grad_norm": 0.7470687031745911,
6450
+ "learning_rate": 1.7491759638927686e-06,
6451
+ "loss": 1.1686,
6452
+ "step": 909
6453
+ },
6454
+ {
6455
+ "epoch": 0.06338150792268848,
6456
+ "grad_norm": 0.7098023295402527,
6457
+ "learning_rate": 1.6885115382026085e-06,
6458
+ "loss": 1.1531,
6459
+ "step": 910
6460
+ },
6461
+ {
6462
+ "epoch": 0.06345115793139475,
6463
+ "grad_norm": 0.6397354006767273,
6464
+ "learning_rate": 1.628908778346827e-06,
6465
+ "loss": 0.9153,
6466
+ "step": 911
6467
+ },
6468
+ {
6469
+ "epoch": 0.063520807940101,
6470
+ "grad_norm": 0.6609793305397034,
6471
+ "learning_rate": 1.5703683279756797e-06,
6472
+ "loss": 0.641,
6473
+ "step": 912
6474
+ },
6475
+ {
6476
+ "epoch": 0.06359045794880724,
6477
+ "grad_norm": 0.7062059640884399,
6478
+ "learning_rate": 1.5128908192675318e-06,
6479
+ "loss": 0.7182,
6480
+ "step": 913
6481
+ },
6482
+ {
6483
+ "epoch": 0.06366010795751349,
6484
+ "grad_norm": 0.6093196272850037,
6485
+ "learning_rate": 1.4564768729220412e-06,
6486
+ "loss": 0.6793,
6487
+ "step": 914
6488
+ },
6489
+ {
6490
+ "epoch": 0.06372975796621974,
6491
+ "grad_norm": 0.6978054642677307,
6492
+ "learning_rate": 1.401127098153443e-06,
6493
+ "loss": 0.7592,
6494
+ "step": 915
6495
+ },
6496
+ {
6497
+ "epoch": 0.063799407974926,
6498
+ "grad_norm": 0.5635403394699097,
6499
+ "learning_rate": 1.3468420926840197e-06,
6500
+ "loss": 0.869,
6501
+ "step": 916
6502
+ },
6503
+ {
6504
+ "epoch": 0.06386905798363225,
6505
+ "grad_norm": 0.6903446912765503,
6506
+ "learning_rate": 1.2936224427375521e-06,
6507
+ "loss": 0.7401,
6508
+ "step": 917
6509
+ },
6510
+ {
6511
+ "epoch": 0.0639387079923385,
6512
+ "grad_norm": 0.6210869550704956,
6513
+ "learning_rate": 1.2414687230331123e-06,
6514
+ "loss": 0.5908,
6515
+ "step": 918
6516
+ },
6517
+ {
6518
+ "epoch": 0.06400835800104475,
6519
+ "grad_norm": 0.6113409399986267,
6520
+ "learning_rate": 1.1903814967787253e-06,
6521
+ "loss": 0.5493,
6522
+ "step": 919
6523
+ },
6524
+ {
6525
+ "epoch": 0.064078008009751,
6526
+ "grad_norm": 0.9400643706321716,
6527
+ "learning_rate": 1.1403613156654059e-06,
6528
+ "loss": 1.0418,
6529
+ "step": 920
6530
+ },
6531
+ {
6532
+ "epoch": 0.06414765801845725,
6533
+ "grad_norm": 0.683574378490448,
6534
+ "learning_rate": 1.091408719861109e-06,
6535
+ "loss": 0.9345,
6536
+ "step": 921
6537
+ },
6538
+ {
6539
+ "epoch": 0.06421730802716351,
6540
+ "grad_norm": 0.7595987915992737,
6541
+ "learning_rate": 1.0435242380049559e-06,
6542
+ "loss": 0.8716,
6543
+ "step": 922
6544
+ },
6545
+ {
6546
+ "epoch": 0.06428695803586976,
6547
+ "grad_norm": 0.6851724982261658,
6548
+ "learning_rate": 9.967083872015282e-07,
6549
+ "loss": 0.5158,
6550
+ "step": 923
6551
+ },
6552
+ {
6553
+ "epoch": 0.064356608044576,
6554
+ "grad_norm": 0.6724770069122314,
6555
+ "learning_rate": 9.509616730151827e-07,
6556
+ "loss": 0.5133,
6557
+ "step": 924
6558
+ },
6559
+ {
6560
+ "epoch": 0.06442625805328225,
6561
+ "grad_norm": 0.6596947312355042,
6562
+ "learning_rate": 9.062845894647676e-07,
6563
+ "loss": 0.6722,
6564
+ "step": 925
6565
+ },
6566
+ {
6567
+ "epoch": 0.0644959080619885,
6568
+ "grad_norm": 0.5619158148765564,
6569
+ "learning_rate": 8.626776190181041e-07,
6570
+ "loss": 0.9499,
6571
+ "step": 926
6572
+ },
6573
+ {
6574
+ "epoch": 0.06456555807069476,
6575
+ "grad_norm": 0.7573150992393494,
6576
+ "learning_rate": 8.20141232586924e-07,
6577
+ "loss": 0.7521,
6578
+ "step": 927
6579
+ },
6580
+ {
6581
+ "epoch": 0.06463520807940101,
6582
+ "grad_norm": 0.6126770377159119,
6583
+ "learning_rate": 7.786758895216629e-07,
6584
+ "loss": 0.6616,
6585
+ "step": 928
6586
+ },
6587
+ {
6588
+ "epoch": 0.06470485808810726,
6589
+ "grad_norm": 0.7481774687767029,
6590
+ "learning_rate": 7.382820376066302e-07,
6591
+ "loss": 0.8779,
6592
+ "step": 929
6593
+ },
6594
+ {
6595
+ "epoch": 0.06477450809681351,
6596
+ "grad_norm": 0.7029200792312622,
6597
+ "learning_rate": 6.98960113055025e-07,
6598
+ "loss": 0.7685,
6599
+ "step": 930
6600
+ },
6601
+ {
6602
+ "epoch": 0.06484415810551976,
6603
+ "grad_norm": 0.6455416679382324,
6604
+ "learning_rate": 6.607105405043612e-07,
6605
+ "loss": 1.0069,
6606
+ "step": 931
6607
+ },
6608
+ {
6609
+ "epoch": 0.06491380811422602,
6610
+ "grad_norm": 0.7011751532554626,
6611
+ "learning_rate": 6.23533733011783e-07,
6612
+ "loss": 0.6548,
6613
+ "step": 932
6614
+ },
6615
+ {
6616
+ "epoch": 0.06498345812293227,
6617
+ "grad_norm": 0.7533524036407471,
6618
+ "learning_rate": 5.8743009204969e-07,
6619
+ "loss": 0.7463,
6620
+ "step": 933
6621
+ },
6622
+ {
6623
+ "epoch": 0.06505310813163852,
6624
+ "grad_norm": 0.5586950182914734,
6625
+ "learning_rate": 5.52400007501297e-07,
6626
+ "loss": 0.6125,
6627
+ "step": 934
6628
+ },
6629
+ {
6630
+ "epoch": 0.06512275814034477,
6631
+ "grad_norm": 0.6539096832275391,
6632
+ "learning_rate": 5.184438576565253e-07,
6633
+ "loss": 0.8559,
6634
+ "step": 935
6635
+ },
6636
+ {
6637
+ "epoch": 0.06519240814905101,
6638
+ "grad_norm": 0.7584323883056641,
6639
+ "learning_rate": 4.855620092078627e-07,
6640
+ "loss": 1.1142,
6641
+ "step": 936
6642
+ },
6643
+ {
6644
+ "epoch": 0.06526205815775726,
6645
+ "grad_norm": 0.6609397530555725,
6646
+ "learning_rate": 4.537548172464101e-07,
6647
+ "loss": 0.8978,
6648
+ "step": 937
6649
+ },
6650
+ {
6651
+ "epoch": 0.06533170816646353,
6652
+ "grad_norm": 0.6159988641738892,
6653
+ "learning_rate": 4.230226252580516e-07,
6654
+ "loss": 0.6993,
6655
+ "step": 938
6656
+ },
6657
+ {
6658
+ "epoch": 0.06540135817516977,
6659
+ "grad_norm": 0.6153664588928223,
6660
+ "learning_rate": 3.9336576511976863e-07,
6661
+ "loss": 0.4574,
6662
+ "step": 939
6663
+ },
6664
+ {
6665
+ "epoch": 0.06547100818387602,
6666
+ "grad_norm": 0.6489300727844238,
6667
+ "learning_rate": 3.6478455709598734e-07,
6668
+ "loss": 0.7568,
6669
+ "step": 940
6670
+ },
6671
+ {
6672
+ "epoch": 0.06554065819258227,
6673
+ "grad_norm": 0.6248874664306641,
6674
+ "learning_rate": 3.372793098352256e-07,
6675
+ "loss": 0.6879,
6676
+ "step": 941
6677
+ },
6678
+ {
6679
+ "epoch": 0.06561030820128852,
6680
+ "grad_norm": 0.5801978707313538,
6681
+ "learning_rate": 3.108503203666402e-07,
6682
+ "loss": 0.7331,
6683
+ "step": 942
6684
+ },
6685
+ {
6686
+ "epoch": 0.06567995820999478,
6687
+ "grad_norm": 0.605501115322113,
6688
+ "learning_rate": 2.8549787409691833e-07,
6689
+ "loss": 0.6179,
6690
+ "step": 943
6691
+ },
6692
+ {
6693
+ "epoch": 0.06574960821870103,
6694
+ "grad_norm": 0.5972608327865601,
6695
+ "learning_rate": 2.6122224480715775e-07,
6696
+ "loss": 0.6514,
6697
+ "step": 944
6698
+ },
6699
+ {
6700
+ "epoch": 0.06581925822740728,
6701
+ "grad_norm": 0.7556172609329224,
6702
+ "learning_rate": 2.380236946498693e-07,
6703
+ "loss": 0.8719,
6704
+ "step": 945
6705
+ },
6706
+ {
6707
+ "epoch": 0.06588890823611353,
6708
+ "grad_norm": 0.6486802101135254,
6709
+ "learning_rate": 2.1590247414624566e-07,
6710
+ "loss": 0.5719,
6711
+ "step": 946
6712
+ },
6713
+ {
6714
+ "epoch": 0.06595855824481978,
6715
+ "grad_norm": 0.638469398021698,
6716
+ "learning_rate": 1.948588221833303e-07,
6717
+ "loss": 0.6393,
6718
+ "step": 947
6719
+ },
6720
+ {
6721
+ "epoch": 0.06602820825352604,
6722
+ "grad_norm": 0.7082604765892029,
6723
+ "learning_rate": 1.7489296601156392e-07,
6724
+ "loss": 1.0018,
6725
+ "step": 948
6726
+ },
6727
+ {
6728
+ "epoch": 0.06609785826223229,
6729
+ "grad_norm": 0.6530460119247437,
6730
+ "learning_rate": 1.5600512124221978e-07,
6731
+ "loss": 0.7418,
6732
+ "step": 949
6733
+ },
6734
+ {
6735
+ "epoch": 0.06616750827093854,
6736
+ "grad_norm": 0.653685986995697,
6737
+ "learning_rate": 1.3819549184516112e-07,
6738
+ "loss": 0.9309,
6739
+ "step": 950
6740
+ },
6741
+ {
6742
+ "epoch": 0.06623715827964478,
6743
+ "grad_norm": 0.5263675451278687,
6744
+ "learning_rate": 1.2146427014657625e-07,
6745
+ "loss": 0.7189,
6746
+ "step": 951
6747
+ },
6748
+ {
6749
+ "epoch": 0.06630680828835103,
6750
+ "grad_norm": 0.6783672571182251,
6751
+ "learning_rate": 1.0581163682695793e-07,
6752
+ "loss": 0.5871,
6753
+ "step": 952
6754
+ },
6755
+ {
6756
+ "epoch": 0.06637645829705728,
6757
+ "grad_norm": 0.4727168083190918,
6758
+ "learning_rate": 9.123776091908287e-08,
6759
+ "loss": 0.3484,
6760
+ "step": 953
6761
+ },
6762
+ {
6763
+ "epoch": 0.06644610830576354,
6764
+ "grad_norm": 0.5385925769805908,
6765
+ "learning_rate": 7.774279980626853e-08,
6766
+ "loss": 0.5899,
6767
+ "step": 954
6768
+ },
6769
+ {
6770
+ "epoch": 0.06651575831446979,
6771
+ "grad_norm": 0.6668855547904968,
6772
+ "learning_rate": 6.532689922059687e-08,
6773
+ "loss": 1.0131,
6774
+ "step": 955
6775
+ },
6776
+ {
6777
+ "epoch": 0.06658540832317604,
6778
+ "grad_norm": 0.6244344115257263,
6779
+ "learning_rate": 5.3990193241393313e-08,
6780
+ "loss": 0.7458,
6781
+ "step": 956
6782
+ },
6783
+ {
6784
+ "epoch": 0.06665505833188229,
6785
+ "grad_norm": 0.6702743768692017,
6786
+ "learning_rate": 4.373280429375015e-08,
6787
+ "loss": 0.8924,
6788
+ "step": 957
6789
+ },
6790
+ {
6791
+ "epoch": 0.06672470834058854,
6792
+ "grad_norm": 0.6103947758674622,
6793
+ "learning_rate": 3.4554843147216464e-08,
6794
+ "loss": 1.0036,
6795
+ "step": 958
6796
+ },
6797
+ {
6798
+ "epoch": 0.0667943583492948,
6799
+ "grad_norm": 0.622797966003418,
6800
+ "learning_rate": 2.6456408914599108e-08,
6801
+ "loss": 0.8497,
6802
+ "step": 959
6803
+ },
6804
+ {
6805
+ "epoch": 0.06686400835800105,
6806
+ "grad_norm": 0.7076674699783325,
6807
+ "learning_rate": 1.9437589050907977e-08,
6808
+ "loss": 0.5629,
6809
+ "step": 960
6810
+ },
6811
+ {
6812
+ "epoch": 0.0669336583667073,
6813
+ "grad_norm": 0.7682867050170898,
6814
+ "learning_rate": 1.3498459352367931e-08,
6815
+ "loss": 0.7463,
6816
+ "step": 961
6817
+ },
6818
+ {
6819
+ "epoch": 0.06700330837541355,
6820
+ "grad_norm": 0.7987236380577087,
6821
+ "learning_rate": 8.639083955663818e-09,
6822
+ "loss": 1.1664,
6823
+ "step": 962
6824
+ },
6825
+ {
6826
+ "epoch": 0.0670729583841198,
6827
+ "grad_norm": 0.7837391495704651,
6828
+ "learning_rate": 4.859515337174436e-09,
6829
+ "loss": 0.6505,
6830
+ "step": 963
6831
+ },
6832
+ {
6833
+ "epoch": 0.06714260839282606,
6834
+ "grad_norm": 0.6566223502159119,
6835
+ "learning_rate": 2.1597943124729292e-09,
6836
+ "loss": 0.8524,
6837
+ "step": 964
6838
+ },
6839
+ {
6840
+ "epoch": 0.0672122584015323,
6841
+ "grad_norm": 0.6998875737190247,
6842
+ "learning_rate": 5.399500358493903e-10,
6843
+ "loss": 0.8817,
6844
+ "step": 965
6845
+ },
6846
+ {
6847
+ "epoch": 0.06728190841023855,
6848
+ "grad_norm": 0.6083624362945557,
6849
+ "learning_rate": 0.0,
6850
+ "loss": 0.8767,
6851
+ "step": 966
6852
  }
6853
  ],
6854
  "logging_steps": 1,
 
6872
  "should_evaluate": false,
6873
  "should_log": false,
6874
  "should_save": true,
6875
+ "should_training_stop": true
6876
  },
6877
  "attributes": {}
6878
  }
6879
  },
6880
+ "total_flos": 2.628352553502376e+18,
6881
  "train_batch_size": 4,
6882
  "trial_name": null,
6883
  "trial_params": null