Alawy21 committed on
Commit
2e05876
·
verified ·
1 Parent(s): dccabc0

Training in progress, step 1000, checkpoint

Browse files
last-checkpoint/adapter_config.json CHANGED
@@ -25,12 +25,12 @@
25
  "revision": null,
26
  "target_modules": [
27
  "up_proj",
28
- "down_proj",
29
- "gate_proj",
30
  "k_proj",
31
- "v_proj",
 
32
  "q_proj",
33
- "o_proj"
 
34
  ],
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
 
25
  "revision": null,
26
  "target_modules": [
27
  "up_proj",
 
 
28
  "k_proj",
29
+ "down_proj",
30
+ "o_proj",
31
  "q_proj",
32
+ "v_proj",
33
+ "gate_proj"
34
  ],
35
  "task_type": "CAUSAL_LM",
36
  "trainable_token_indices": null,
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f06a94a8fd2110630b29f1a0fd61ece5d491e081468de01dce6bcb49a29196b3
3
  size 295488936
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85928d986f2cd15fb887ee4e40bc18ea50570b9cc2b7c36dae87cce33f1f15c9
3
  size 295488936
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:583aa4e7918fb15bddf477cd5bcb1d9cae4199ecc8a16903e5bbd6fb6f6c04a5
3
  size 591203178
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55fc203eded4895da87380bd227f3349d9e16d83a889d7bd948c3d71d682fccf
3
  size 591203178
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d245e05e72192c132e0f2edb6fdcae0c578c890f0fe912f17ec7b0bba2d38cc3
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7682299c566684ea51cf26f0c86b6ffaa3c0bc63cbdf84674b29a2c62ac72143
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:822082b213c0642410a7baeaf5135c14aa4c940d0ddb23409fe69b75772aee28
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c46d5bba25f025149448ec23ffcd8a74fcd8ccfdf129227c5a3d660bb3b1333
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.6491499227202473,
6
  "eval_steps": 100,
7
- "global_step": 800,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -632,6 +632,162 @@
632
  "eval_samples_per_second": 0.409,
633
  "eval_steps_per_second": 0.409,
634
  "step": 800
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
635
  }
636
  ],
637
  "logging_steps": 10,
@@ -651,7 +807,7 @@
651
  "attributes": {}
652
  }
653
  },
654
- "total_flos": 4.309328094007296e+16,
655
  "train_batch_size": 1,
656
  "trial_name": null,
657
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.061823802163833,
6
  "eval_steps": 100,
7
+ "global_step": 1000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
632
  "eval_samples_per_second": 0.409,
633
  "eval_steps_per_second": 0.409,
634
  "step": 800
635
+ },
636
+ {
637
+ "epoch": 1.6697578567748583,
638
+ "grad_norm": 0.4012995660305023,
639
+ "learning_rate": 4.898007313042975e-05,
640
+ "loss": 0.1359,
641
+ "step": 810
642
+ },
643
+ {
644
+ "epoch": 1.6903657908294694,
645
+ "grad_norm": 0.4577192962169647,
646
+ "learning_rate": 4.7780734517504985e-05,
647
+ "loss": 0.1306,
648
+ "step": 820
649
+ },
650
+ {
651
+ "epoch": 1.7109737248840804,
652
+ "grad_norm": 0.5553697943687439,
653
+ "learning_rate": 4.658267413416326e-05,
654
+ "loss": 0.1414,
655
+ "step": 830
656
+ },
657
+ {
658
+ "epoch": 1.7315816589386914,
659
+ "grad_norm": 0.5764682292938232,
660
+ "learning_rate": 4.5386582026834906e-05,
661
+ "loss": 0.1407,
662
+ "step": 840
663
+ },
664
+ {
665
+ "epoch": 1.7521895929933025,
666
+ "grad_norm": 0.4649393558502197,
667
+ "learning_rate": 4.4193147108283016e-05,
668
+ "loss": 0.1291,
669
+ "step": 850
670
+ },
671
+ {
672
+ "epoch": 1.7727975270479135,
673
+ "grad_norm": 0.6201866865158081,
674
+ "learning_rate": 4.300305676081057e-05,
675
+ "loss": 0.1434,
676
+ "step": 860
677
+ },
678
+ {
679
+ "epoch": 1.7934054611025245,
680
+ "grad_norm": 0.6112945079803467,
681
+ "learning_rate": 4.1816996440349104e-05,
682
+ "loss": 0.1295,
683
+ "step": 870
684
+ },
685
+ {
686
+ "epoch": 1.8140133951571356,
687
+ "grad_norm": 0.4921644330024719,
688
+ "learning_rate": 4.063564928165682e-05,
689
+ "loss": 0.1325,
690
+ "step": 880
691
+ },
692
+ {
693
+ "epoch": 1.8346213292117466,
694
+ "grad_norm": 0.665326714515686,
695
+ "learning_rate": 3.9459695704853836e-05,
696
+ "loss": 0.164,
697
+ "step": 890
698
+ },
699
+ {
700
+ "epoch": 1.8552292632663576,
701
+ "grad_norm": 0.5689858198165894,
702
+ "learning_rate": 3.828981302352065e-05,
703
+ "loss": 0.147,
704
+ "step": 900
705
+ },
706
+ {
707
+ "epoch": 1.8552292632663576,
708
+ "eval_loss": 0.1883440464735031,
709
+ "eval_runtime": 149.5929,
710
+ "eval_samples_per_second": 0.401,
711
+ "eval_steps_per_second": 0.401,
712
+ "step": 900
713
+ },
714
+ {
715
+ "epoch": 1.8758371973209687,
716
+ "grad_norm": 0.5504728555679321,
717
+ "learning_rate": 3.712667505458622e-05,
718
+ "loss": 0.1228,
719
+ "step": 910
720
+ },
721
+ {
722
+ "epoch": 1.8964451313755797,
723
+ "grad_norm": 0.8000712990760803,
724
+ "learning_rate": 3.5970951730229785e-05,
725
+ "loss": 0.135,
726
+ "step": 920
727
+ },
728
+ {
729
+ "epoch": 1.9170530654301907,
730
+ "grad_norm": 0.4071889817714691,
731
+ "learning_rate": 3.482330871202029e-05,
732
+ "loss": 0.1197,
733
+ "step": 930
734
+ },
735
+ {
736
+ "epoch": 1.9376609994848017,
737
+ "grad_norm": 0.46653851866722107,
738
+ "learning_rate": 3.3684407007515484e-05,
739
+ "loss": 0.1395,
740
+ "step": 940
741
+ },
742
+ {
743
+ "epoch": 1.9582689335394128,
744
+ "grad_norm": 0.6100273132324219,
745
+ "learning_rate": 3.255490258954167e-05,
746
+ "loss": 0.1347,
747
+ "step": 950
748
+ },
749
+ {
750
+ "epoch": 1.9788768675940238,
751
+ "grad_norm": 0.5089631080627441,
752
+ "learning_rate": 3.14354460183732e-05,
753
+ "loss": 0.12,
754
+ "step": 960
755
+ },
756
+ {
757
+ "epoch": 1.9994848016486348,
758
+ "grad_norm": 0.4348441958427429,
759
+ "learning_rate": 3.032668206702959e-05,
760
+ "loss": 0.1289,
761
+ "step": 970
762
+ },
763
+ {
764
+ "epoch": 2.020607934054611,
765
+ "grad_norm": 0.3667771816253662,
766
+ "learning_rate": 2.9229249349905684e-05,
767
+ "loss": 0.0908,
768
+ "step": 980
769
+ },
770
+ {
771
+ "epoch": 2.041215868109222,
772
+ "grad_norm": 0.420012891292572,
773
+ "learning_rate": 2.8143779954949267e-05,
774
+ "loss": 0.0718,
775
+ "step": 990
776
+ },
777
+ {
778
+ "epoch": 2.061823802163833,
779
+ "grad_norm": 0.3521002233028412,
780
+ "learning_rate": 2.70708990795975e-05,
781
+ "loss": 0.0668,
782
+ "step": 1000
783
+ },
784
+ {
785
+ "epoch": 2.061823802163833,
786
+ "eval_loss": 0.1961071491241455,
787
+ "eval_runtime": 149.4232,
788
+ "eval_samples_per_second": 0.402,
789
+ "eval_steps_per_second": 0.402,
790
+ "step": 1000
791
  }
792
  ],
793
  "logging_steps": 10,
 
807
  "attributes": {}
808
  }
809
  },
810
+ "total_flos": 5.394040116523008e+16,
811
  "train_batch_size": 1,
812
  "trial_name": null,
813
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3a137e88e0e55a17fda65e854b20c699b62b6ccacbdb5fb18d98d0daf1f24b9f
3
  size 5816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:929586f453c837dd2c45da477cee81ff19a4b897cb7beb925285c835f78d9f3f
3
  size 5816