statking committed on
Commit
57d62f7
·
verified ·
1 Parent(s): 6b44bdd

Model save

Browse files
README.md ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ tags:
4
+ - trl
5
+ - dpo
6
+ - generated_from_trainer
7
+ base_model: data/zephyr-7b-sft-qlora-merged
8
+ model-index:
9
+ - name: zephyr-7b-dpo-qlora
10
+ results: []
11
+ ---
12
+
13
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
+ should probably proofread and complete it, then remove this comment. -->
15
+
16
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/statking/huggingface/runs/qxp2vmm7)
17
+ # zephyr-7b-dpo-qlora
18
+
19
+ This model is a fine-tuned version of data/zephyr-7b-sft-qlora-merged on an unknown dataset.
20
+ It achieves the following results on the evaluation set:
21
+ - Loss: 0.4933
22
+ - Rewards/chosen: -2.1713
23
+ - Rewards/rejected: -3.1801
24
+ - Rewards/accuracies: 0.7738
25
+ - Rewards/margins: 1.0088
26
+ - Logps/rejected: -564.8470
27
+ - Logps/chosen: -483.4024
28
+ - Logits/rejected: -1.4105
29
+ - Logits/chosen: -1.4778
30
+
31
+ ## Model description
32
+
33
+ More information needed
34
+
35
+ ## Intended uses & limitations
36
+
37
+ More information needed
38
+
39
+ ## Training and evaluation data
40
+
41
+ More information needed
42
+
43
+ ## Training procedure
44
+
45
+ ### Training hyperparameters
46
+
47
+ The following hyperparameters were used during training:
48
+ - learning_rate: 5e-06
49
+ - train_batch_size: 4
50
+ - eval_batch_size: 8
51
+ - seed: 42
52
+ - distributed_type: multi-GPU
53
+ - num_devices: 4
54
+ - gradient_accumulation_steps: 4
55
+ - total_train_batch_size: 64
56
+ - total_eval_batch_size: 32
57
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
58
+ - lr_scheduler_type: cosine
59
+ - lr_scheduler_warmup_ratio: 0.1
60
+ - num_epochs: 1
61
+
62
+ ### Training results
63
+
64
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
+ | 0.6185 | 0.1047 | 100 | 0.6240 | -0.3010 | -0.5396 | 0.6964 | 0.2387 | -300.7997 | -296.3736 | -2.2954 | -2.3537 |
67
+ | 0.5724 | 0.2094 | 200 | 0.5692 | -0.8434 | -1.3284 | 0.7302 | 0.4850 | -379.6750 | -350.6113 | -2.2448 | -2.2930 |
68
+ | 0.5366 | 0.3141 | 300 | 0.5249 | -1.6887 | -2.4863 | 0.7639 | 0.7976 | -495.4648 | -435.1429 | -1.6220 | -1.6850 |
69
+ | 0.5397 | 0.4187 | 400 | 0.5253 | -1.2998 | -1.9923 | 0.7698 | 0.6925 | -446.0619 | -396.2537 | -1.7586 | -1.8144 |
70
+ | 0.5003 | 0.5234 | 500 | 0.5013 | -1.9982 | -2.9207 | 0.7659 | 0.9226 | -538.9065 | -466.0909 | -1.6049 | -1.6682 |
71
+ | 0.4835 | 0.6281 | 600 | 0.5027 | -2.5699 | -3.5168 | 0.7560 | 0.9470 | -598.5182 | -523.2593 | -1.3417 | -1.4125 |
72
+ | 0.4715 | 0.7328 | 700 | 0.4956 | -2.1902 | -3.1936 | 0.7679 | 1.0035 | -566.1955 | -485.2894 | -1.3782 | -1.4480 |
73
+ | 0.4898 | 0.8375 | 800 | 0.4948 | -2.0401 | -3.0116 | 0.7698 | 0.9715 | -547.9974 | -470.2821 | -1.4275 | -1.4946 |
74
+ | 0.4785 | 0.9422 | 900 | 0.4933 | -2.1713 | -3.1801 | 0.7738 | 1.0088 | -564.8470 | -483.4024 | -1.4105 | -1.4778 |
75
+
76
+
77
+ ### Framework versions
78
+
79
+ - PEFT 0.10.0
80
+ - Transformers 4.41.0.dev0
81
+ - Pytorch 2.3.0+cu121
82
+ - Datasets 2.19.1
83
+ - Tokenizers 0.19.1
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57347a3a18d8c084e1990f0336fbd2239c42dd81fe7ea30d2a22d5cf10cba66f
3
  size 671150064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0959395381e8b38e8e47659e117c11d160900e65148219841bc9a27c24ed8c6a
3
  size 671150064
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9997382884061764,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.5333428270529702,
5
+ "train_runtime": 30179.1373,
6
+ "train_samples": 61134,
7
+ "train_samples_per_second": 2.026,
8
+ "train_steps_per_second": 0.032
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9997382884061764,
3
+ "total_flos": 0.0,
4
+ "train_loss": 0.5333428270529702,
5
+ "train_runtime": 30179.1373,
6
+ "train_samples": 61134,
7
+ "train_samples_per_second": 2.026,
8
+ "train_steps_per_second": 0.032
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,1626 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9997382884061764,
5
+ "eval_steps": 100,
6
+ "global_step": 955,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0010468463752944255,
13
+ "grad_norm": 1.1290596695646078,
14
+ "learning_rate": 5.208333333333333e-08,
15
+ "logits/chosen": -2.659804344177246,
16
+ "logits/rejected": -2.5501840114593506,
17
+ "logps/chosen": -300.040771484375,
18
+ "logps/rejected": -255.0087127685547,
19
+ "loss": 0.6931,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.010468463752944255,
28
+ "grad_norm": 1.014414920556849,
29
+ "learning_rate": 5.208333333333334e-07,
30
+ "logits/chosen": -2.438154935836792,
31
+ "logits/rejected": -2.421139717102051,
32
+ "logps/chosen": -277.2698669433594,
33
+ "logps/rejected": -256.98931884765625,
34
+ "loss": 0.6932,
35
+ "rewards/accuracies": 0.4930555522441864,
36
+ "rewards/chosen": 8.686767250765115e-05,
37
+ "rewards/margins": 0.0004906932590529323,
38
+ "rewards/rejected": -0.0004038256302010268,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.02093692750588851,
43
+ "grad_norm": 1.1593543431279631,
44
+ "learning_rate": 1.0416666666666667e-06,
45
+ "logits/chosen": -2.4331612586975098,
46
+ "logits/rejected": -2.344513177871704,
47
+ "logps/chosen": -281.2090759277344,
48
+ "logps/rejected": -260.51007080078125,
49
+ "loss": 0.6926,
50
+ "rewards/accuracies": 0.5562499761581421,
51
+ "rewards/chosen": 0.0008262035553343594,
52
+ "rewards/margins": 0.0013498691841959953,
53
+ "rewards/rejected": -0.0005236656288616359,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.031405391258832765,
58
+ "grad_norm": 1.1658712617217375,
59
+ "learning_rate": 1.5625e-06,
60
+ "logits/chosen": -2.492827892303467,
61
+ "logits/rejected": -2.421010971069336,
62
+ "logps/chosen": -286.5789489746094,
63
+ "logps/rejected": -267.0865783691406,
64
+ "loss": 0.6909,
65
+ "rewards/accuracies": 0.6187499761581421,
66
+ "rewards/chosen": 0.0038475811015814543,
67
+ "rewards/margins": 0.004234342835843563,
68
+ "rewards/rejected": -0.0003867618797812611,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.04187385501177702,
73
+ "grad_norm": 1.1449870340989134,
74
+ "learning_rate": 2.0833333333333334e-06,
75
+ "logits/chosen": -2.486035108566284,
76
+ "logits/rejected": -2.4186811447143555,
77
+ "logps/chosen": -274.2925720214844,
78
+ "logps/rejected": -260.7481384277344,
79
+ "loss": 0.688,
80
+ "rewards/accuracies": 0.6499999761581421,
81
+ "rewards/chosen": 0.011111991479992867,
82
+ "rewards/margins": 0.01205919124186039,
83
+ "rewards/rejected": -0.0009472008096054196,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.05234231876472128,
88
+ "grad_norm": 1.1126402014071601,
89
+ "learning_rate": 2.604166666666667e-06,
90
+ "logits/chosen": -2.4758384227752686,
91
+ "logits/rejected": -2.404066562652588,
92
+ "logps/chosen": -233.27294921875,
93
+ "logps/rejected": -210.6988525390625,
94
+ "loss": 0.6835,
95
+ "rewards/accuracies": 0.6937500238418579,
96
+ "rewards/chosen": 0.023658782243728638,
97
+ "rewards/margins": 0.025034388527274132,
98
+ "rewards/rejected": -0.0013756046537309885,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.06281078251766553,
103
+ "grad_norm": 1.2225282330679852,
104
+ "learning_rate": 3.125e-06,
105
+ "logits/chosen": -2.4660396575927734,
106
+ "logits/rejected": -2.3986904621124268,
107
+ "logps/chosen": -268.0579528808594,
108
+ "logps/rejected": -230.85205078125,
109
+ "loss": 0.6753,
110
+ "rewards/accuracies": 0.6625000238418579,
111
+ "rewards/chosen": 0.029377218335866928,
112
+ "rewards/margins": 0.03872579708695412,
113
+ "rewards/rejected": -0.009348581545054913,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.07327924627060979,
118
+ "grad_norm": 1.3167193147118952,
119
+ "learning_rate": 3.6458333333333333e-06,
120
+ "logits/chosen": -2.3562026023864746,
121
+ "logits/rejected": -2.327336072921753,
122
+ "logps/chosen": -256.792724609375,
123
+ "logps/rejected": -262.48309326171875,
124
+ "loss": 0.6596,
125
+ "rewards/accuracies": 0.78125,
126
+ "rewards/chosen": 0.008793818764388561,
127
+ "rewards/margins": 0.08649717271327972,
128
+ "rewards/rejected": -0.07770337164402008,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.08374771002355404,
133
+ "grad_norm": 1.8338431766961225,
134
+ "learning_rate": 4.166666666666667e-06,
135
+ "logits/chosen": -2.4636688232421875,
136
+ "logits/rejected": -2.3472182750701904,
137
+ "logps/chosen": -264.05975341796875,
138
+ "logps/rejected": -255.5188446044922,
139
+ "loss": 0.6425,
140
+ "rewards/accuracies": 0.71875,
141
+ "rewards/chosen": -0.10224815458059311,
142
+ "rewards/margins": 0.11396624147891998,
143
+ "rewards/rejected": -0.2162143886089325,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.0942161737764983,
148
+ "grad_norm": 1.8214883674875806,
149
+ "learning_rate": 4.6875000000000004e-06,
150
+ "logits/chosen": -2.3876922130584717,
151
+ "logits/rejected": -2.3397979736328125,
152
+ "logps/chosen": -267.89422607421875,
153
+ "logps/rejected": -286.52838134765625,
154
+ "loss": 0.6327,
155
+ "rewards/accuracies": 0.6875,
156
+ "rewards/chosen": -0.1534612625837326,
157
+ "rewards/margins": 0.16493618488311768,
158
+ "rewards/rejected": -0.3183974325656891,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.10468463752944256,
163
+ "grad_norm": 2.20348278766923,
164
+ "learning_rate": 4.9997324926814375e-06,
165
+ "logits/chosen": -2.408754587173462,
166
+ "logits/rejected": -2.332271099090576,
167
+ "logps/chosen": -306.8832702636719,
168
+ "logps/rejected": -322.60369873046875,
169
+ "loss": 0.6185,
170
+ "rewards/accuracies": 0.731249988079071,
171
+ "rewards/chosen": -0.3260679841041565,
172
+ "rewards/margins": 0.21181587874889374,
173
+ "rewards/rejected": -0.5378838777542114,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.10468463752944256,
178
+ "eval_logits/chosen": -2.3536956310272217,
179
+ "eval_logits/rejected": -2.295363426208496,
180
+ "eval_logps/chosen": -296.3735656738281,
181
+ "eval_logps/rejected": -300.7996826171875,
182
+ "eval_loss": 0.6239820718765259,
183
+ "eval_rewards/accuracies": 0.6964285969734192,
184
+ "eval_rewards/chosen": -0.30099427700042725,
185
+ "eval_rewards/margins": 0.23865534365177155,
186
+ "eval_rewards/rejected": -0.5396496653556824,
187
+ "eval_runtime": 276.8422,
188
+ "eval_samples_per_second": 7.224,
189
+ "eval_steps_per_second": 0.228,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 0.11515310128238682,
194
+ "grad_norm": 2.4992668200294634,
195
+ "learning_rate": 4.996723692767927e-06,
196
+ "logits/chosen": -2.4233577251434326,
197
+ "logits/rejected": -2.3276515007019043,
198
+ "logps/chosen": -282.03070068359375,
199
+ "logps/rejected": -270.5752258300781,
200
+ "loss": 0.6214,
201
+ "rewards/accuracies": 0.7124999761581421,
202
+ "rewards/chosen": -0.32758888602256775,
203
+ "rewards/margins": 0.2726225256919861,
204
+ "rewards/rejected": -0.6002114415168762,
205
+ "step": 110
206
+ },
207
+ {
208
+ "epoch": 0.12562156503533106,
209
+ "grad_norm": 2.277765435939027,
210
+ "learning_rate": 4.9903757462135984e-06,
211
+ "logits/chosen": -2.402243137359619,
212
+ "logits/rejected": -2.3531296253204346,
213
+ "logps/chosen": -279.3037414550781,
214
+ "logps/rejected": -320.41241455078125,
215
+ "loss": 0.5899,
216
+ "rewards/accuracies": 0.762499988079071,
217
+ "rewards/chosen": -0.3164636492729187,
218
+ "rewards/margins": 0.3019184470176697,
219
+ "rewards/rejected": -0.6183820962905884,
220
+ "step": 120
221
+ },
222
+ {
223
+ "epoch": 0.1360900287882753,
224
+ "grad_norm": 3.5836859325532537,
225
+ "learning_rate": 4.980697142834315e-06,
226
+ "logits/chosen": -2.4281363487243652,
227
+ "logits/rejected": -2.3622257709503174,
228
+ "logps/chosen": -365.813232421875,
229
+ "logps/rejected": -333.8047790527344,
230
+ "loss": 0.5984,
231
+ "rewards/accuracies": 0.6875,
232
+ "rewards/chosen": -0.6453860402107239,
233
+ "rewards/margins": 0.291470468044281,
234
+ "rewards/rejected": -0.9368564486503601,
235
+ "step": 130
236
+ },
237
+ {
238
+ "epoch": 0.14655849254121958,
239
+ "grad_norm": 4.177122005020008,
240
+ "learning_rate": 4.967700826904229e-06,
241
+ "logits/chosen": -2.3888461589813232,
242
+ "logits/rejected": -2.3509252071380615,
243
+ "logps/chosen": -329.9725341796875,
244
+ "logps/rejected": -369.2422790527344,
245
+ "loss": 0.5786,
246
+ "rewards/accuracies": 0.7562500238418579,
247
+ "rewards/chosen": -0.5841600298881531,
248
+ "rewards/margins": 0.4313054084777832,
249
+ "rewards/rejected": -1.015465497970581,
250
+ "step": 140
251
+ },
252
+ {
253
+ "epoch": 0.15702695629416383,
254
+ "grad_norm": 3.8223984872865686,
255
+ "learning_rate": 4.951404179843963e-06,
256
+ "logits/chosen": -2.4817895889282227,
257
+ "logits/rejected": -2.4037411212921143,
258
+ "logps/chosen": -363.9546203613281,
259
+ "logps/rejected": -355.483154296875,
260
+ "loss": 0.5982,
261
+ "rewards/accuracies": 0.699999988079071,
262
+ "rewards/chosen": -0.738756000995636,
263
+ "rewards/margins": 0.38375917077064514,
264
+ "rewards/rejected": -1.122515320777893,
265
+ "step": 150
266
+ },
267
+ {
268
+ "epoch": 0.16749542004710807,
269
+ "grad_norm": 3.3391629933554023,
270
+ "learning_rate": 4.931828996974498e-06,
271
+ "logits/chosen": -2.4580092430114746,
272
+ "logits/rejected": -2.364123821258545,
273
+ "logps/chosen": -374.60455322265625,
274
+ "logps/rejected": -352.8321533203125,
275
+ "loss": 0.5851,
276
+ "rewards/accuracies": 0.7124999761581421,
277
+ "rewards/chosen": -0.8589010238647461,
278
+ "rewards/margins": 0.3599582612514496,
279
+ "rewards/rejected": -1.2188594341278076,
280
+ "step": 160
281
+ },
282
+ {
283
+ "epoch": 0.17796388380005235,
284
+ "grad_norm": 2.5964425242978337,
285
+ "learning_rate": 4.909001458367867e-06,
286
+ "logits/chosen": -2.4445998668670654,
287
+ "logits/rejected": -2.355095148086548,
288
+ "logps/chosen": -318.6025085449219,
289
+ "logps/rejected": -344.8854675292969,
290
+ "loss": 0.575,
291
+ "rewards/accuracies": 0.6875,
292
+ "rewards/chosen": -0.580207347869873,
293
+ "rewards/margins": 0.40731239318847656,
294
+ "rewards/rejected": -0.9875197410583496,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 0.1884323475529966,
299
+ "grad_norm": 3.6928906953423732,
300
+ "learning_rate": 4.882952093833628e-06,
301
+ "logits/chosen": -2.388700008392334,
302
+ "logits/rejected": -2.3635334968566895,
303
+ "logps/chosen": -325.55902099609375,
304
+ "logps/rejected": -366.1028747558594,
305
+ "loss": 0.55,
306
+ "rewards/accuracies": 0.6875,
307
+ "rewards/chosen": -0.6206040382385254,
308
+ "rewards/margins": 0.4654437005519867,
309
+ "rewards/rejected": -1.086047649383545,
310
+ "step": 180
311
+ },
312
+ {
313
+ "epoch": 0.19890081130594087,
314
+ "grad_norm": 3.022202327145697,
315
+ "learning_rate": 4.853715742087947e-06,
316
+ "logits/chosen": -2.243128776550293,
317
+ "logits/rejected": -2.2242884635925293,
318
+ "logps/chosen": -345.83331298828125,
319
+ "logps/rejected": -424.47003173828125,
320
+ "loss": 0.564,
321
+ "rewards/accuracies": 0.793749988079071,
322
+ "rewards/chosen": -0.9772090911865234,
323
+ "rewards/margins": 0.5893430113792419,
324
+ "rewards/rejected": -1.5665521621704102,
325
+ "step": 190
326
+ },
327
+ {
328
+ "epoch": 0.2093692750588851,
329
+ "grad_norm": 4.352581590749502,
330
+ "learning_rate": 4.821331504159906e-06,
331
+ "logits/chosen": -2.3912417888641357,
332
+ "logits/rejected": -2.310708999633789,
333
+ "logps/chosen": -394.71142578125,
334
+ "logps/rejected": -382.6004943847656,
335
+ "loss": 0.5724,
336
+ "rewards/accuracies": 0.7124999761581421,
337
+ "rewards/chosen": -0.8045751452445984,
338
+ "rewards/margins": 0.4971863627433777,
339
+ "rewards/rejected": -1.3017616271972656,
340
+ "step": 200
341
+ },
342
+ {
343
+ "epoch": 0.2093692750588851,
344
+ "eval_logits/chosen": -2.2930498123168945,
345
+ "eval_logits/rejected": -2.2448384761810303,
346
+ "eval_logps/chosen": -350.611328125,
347
+ "eval_logps/rejected": -379.675048828125,
348
+ "eval_loss": 0.5691524744033813,
349
+ "eval_rewards/accuracies": 0.7301587462425232,
350
+ "eval_rewards/chosen": -0.8433722257614136,
351
+ "eval_rewards/margins": 0.48503097891807556,
352
+ "eval_rewards/rejected": -1.3284029960632324,
353
+ "eval_runtime": 277.4907,
354
+ "eval_samples_per_second": 7.207,
355
+ "eval_steps_per_second": 0.227,
356
+ "step": 200
357
+ },
358
+ {
359
+ "epoch": 0.21983773881182936,
360
+ "grad_norm": 4.349505936455232,
361
+ "learning_rate": 4.7858426910973435e-06,
362
+ "logits/chosen": -2.3390839099884033,
363
+ "logits/rejected": -2.296893835067749,
364
+ "logps/chosen": -365.0033264160156,
365
+ "logps/rejected": -390.9748229980469,
366
+ "loss": 0.5875,
367
+ "rewards/accuracies": 0.6625000238418579,
368
+ "rewards/chosen": -0.8617814779281616,
369
+ "rewards/margins": 0.34669047594070435,
370
+ "rewards/rejected": -1.2084718942642212,
371
+ "step": 210
372
+ },
373
+ {
374
+ "epoch": 0.23030620256477363,
375
+ "grad_norm": 3.456252004286951,
376
+ "learning_rate": 4.747296766042161e-06,
377
+ "logits/chosen": -2.345686435699463,
378
+ "logits/rejected": -2.2922120094299316,
379
+ "logps/chosen": -383.15887451171875,
380
+ "logps/rejected": -390.80059814453125,
381
+ "loss": 0.5635,
382
+ "rewards/accuracies": 0.7250000238418579,
383
+ "rewards/chosen": -0.8896678686141968,
384
+ "rewards/margins": 0.5066917538642883,
385
+ "rewards/rejected": -1.3963596820831299,
386
+ "step": 220
387
+ },
388
+ {
389
+ "epoch": 0.24077466631771788,
390
+ "grad_norm": 4.05851608400099,
391
+ "learning_rate": 4.705745280752586e-06,
392
+ "logits/chosen": -2.1309893131256104,
393
+ "logits/rejected": -2.105212450027466,
394
+ "logps/chosen": -367.39385986328125,
395
+ "logps/rejected": -395.93389892578125,
396
+ "loss": 0.5581,
397
+ "rewards/accuracies": 0.6937500238418579,
398
+ "rewards/chosen": -1.184787631034851,
399
+ "rewards/margins": 0.43892526626586914,
400
+ "rewards/rejected": -1.6237128973007202,
401
+ "step": 230
402
+ },
403
+ {
404
+ "epoch": 0.2512431300706621,
405
+ "grad_norm": 3.8267051093451507,
406
+ "learning_rate": 4.661243806657256e-06,
407
+ "logits/chosen": -2.1996395587921143,
408
+ "logits/rejected": -2.182788133621216,
409
+ "logps/chosen": -372.97308349609375,
410
+ "logps/rejected": -395.46978759765625,
411
+ "loss": 0.5428,
412
+ "rewards/accuracies": 0.7562500238418579,
413
+ "rewards/chosen": -1.135464072227478,
414
+ "rewards/margins": 0.5388585925102234,
415
+ "rewards/rejected": -1.6743228435516357,
416
+ "step": 240
417
+ },
418
+ {
419
+ "epoch": 0.26171159382360637,
420
+ "grad_norm": 4.611796368437759,
421
+ "learning_rate": 4.613851860533367e-06,
422
+ "logits/chosen": -2.163637399673462,
423
+ "logits/rejected": -2.143691062927246,
424
+ "logps/chosen": -375.72900390625,
425
+ "logps/rejected": -394.5628967285156,
426
+ "loss": 0.5294,
427
+ "rewards/accuracies": 0.75,
428
+ "rewards/chosen": -1.087786316871643,
429
+ "rewards/margins": 0.5885697603225708,
430
+ "rewards/rejected": -1.6763559579849243,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 0.2721800575765506,
435
+ "grad_norm": 4.723757696740007,
436
+ "learning_rate": 4.563632824908252e-06,
437
+ "logits/chosen": -2.07501482963562,
438
+ "logits/rejected": -2.003157138824463,
439
+ "logps/chosen": -431.285888671875,
440
+ "logps/rejected": -481.96435546875,
441
+ "loss": 0.536,
442
+ "rewards/accuracies": 0.7124999761581421,
443
+ "rewards/chosen": -1.5983136892318726,
444
+ "rewards/margins": 0.6496692299842834,
445
+ "rewards/rejected": -2.247982978820801,
446
+ "step": 260
447
+ },
448
+ {
449
+ "epoch": 0.2826485213294949,
450
+ "grad_norm": 6.406650480112877,
451
+ "learning_rate": 4.510653863290871e-06,
452
+ "logits/chosen": -1.9036792516708374,
453
+ "logits/rejected": -1.8629169464111328,
454
+ "logps/chosen": -441.50677490234375,
455
+ "logps/rejected": -505.6388244628906,
456
+ "loss": 0.5138,
457
+ "rewards/accuracies": 0.7250000238418579,
458
+ "rewards/chosen": -1.6636412143707275,
459
+ "rewards/margins": 0.8482675552368164,
460
+ "rewards/rejected": -2.511908769607544,
461
+ "step": 270
462
+ },
463
+ {
464
+ "epoch": 0.29311698508243916,
465
+ "grad_norm": 3.669198365138619,
466
+ "learning_rate": 4.454985830346574e-06,
467
+ "logits/chosen": -1.8643271923065186,
468
+ "logits/rejected": -1.7986135482788086,
469
+ "logps/chosen": -439.7417907714844,
470
+ "logps/rejected": -483.3232421875,
471
+ "loss": 0.5515,
472
+ "rewards/accuracies": 0.800000011920929,
473
+ "rewards/chosen": -1.501157522201538,
474
+ "rewards/margins": 0.768980085849762,
475
+ "rewards/rejected": -2.270137310028076,
476
+ "step": 280
477
+ },
478
+ {
479
+ "epoch": 0.3035854488353834,
480
+ "grad_norm": 4.724976933585983,
481
+ "learning_rate": 4.396703177135262e-06,
482
+ "logits/chosen": -1.8855762481689453,
483
+ "logits/rejected": -1.7720863819122314,
484
+ "logps/chosen": -438.58154296875,
485
+ "logps/rejected": -466.4124450683594,
486
+ "loss": 0.5336,
487
+ "rewards/accuracies": 0.731249988079071,
488
+ "rewards/chosen": -1.535830020904541,
489
+ "rewards/margins": 0.6200370788574219,
490
+ "rewards/rejected": -2.155867099761963,
491
+ "step": 290
492
+ },
493
+ {
494
+ "epoch": 0.31405391258832765,
495
+ "grad_norm": 4.575595350067731,
496
+ "learning_rate": 4.335883851539693e-06,
497
+ "logits/chosen": -1.7169132232666016,
498
+ "logits/rejected": -1.7286903858184814,
499
+ "logps/chosen": -402.37982177734375,
500
+ "logps/rejected": -462.84478759765625,
501
+ "loss": 0.5366,
502
+ "rewards/accuracies": 0.7250000238418579,
503
+ "rewards/chosen": -1.5827662944793701,
504
+ "rewards/margins": 0.6053536534309387,
505
+ "rewards/rejected": -2.188120126724243,
506
+ "step": 300
507
+ },
508
+ {
509
+ "epoch": 0.31405391258832765,
510
+ "eval_logits/chosen": -1.6850438117980957,
511
+ "eval_logits/rejected": -1.6220176219940186,
512
+ "eval_logps/chosen": -435.1428527832031,
513
+ "eval_logps/rejected": -495.46484375,
514
+ "eval_loss": 0.524876594543457,
515
+ "eval_rewards/accuracies": 0.7638888955116272,
516
+ "eval_rewards/chosen": -1.6886873245239258,
517
+ "eval_rewards/margins": 0.7976137399673462,
518
+ "eval_rewards/rejected": -2.4863009452819824,
519
+ "eval_runtime": 278.1285,
520
+ "eval_samples_per_second": 7.191,
521
+ "eval_steps_per_second": 0.227,
522
+ "step": 300
523
+ },
524
+ {
525
+ "epoch": 0.3245223763412719,
526
+ "grad_norm": 4.624400922286726,
527
+ "learning_rate": 4.2726091940171055e-06,
528
+ "logits/chosen": -1.75604248046875,
529
+ "logits/rejected": -1.6548255681991577,
530
+ "logps/chosen": -441.04571533203125,
531
+ "logps/rejected": -495.0343322753906,
532
+ "loss": 0.5239,
533
+ "rewards/accuracies": 0.7749999761581421,
534
+ "rewards/chosen": -1.6734451055526733,
535
+ "rewards/margins": 0.911232590675354,
536
+ "rewards/rejected": -2.5846774578094482,
537
+ "step": 310
538
+ },
539
+ {
540
+ "epoch": 0.33499084009421615,
541
+ "grad_norm": 5.321599244496586,
542
+ "learning_rate": 4.206963828813555e-06,
543
+ "logits/chosen": -1.7151927947998047,
544
+ "logits/rejected": -1.659000039100647,
545
+ "logps/chosen": -407.6772766113281,
546
+ "logps/rejected": -487.42816162109375,
547
+ "loss": 0.5533,
548
+ "rewards/accuracies": 0.6812499761581421,
549
+ "rewards/chosen": -1.6879345178604126,
550
+ "rewards/margins": 0.7429603338241577,
551
+ "rewards/rejected": -2.4308950901031494,
552
+ "step": 320
553
+ },
554
+ {
555
+ "epoch": 0.34545930384716045,
556
+ "grad_norm": 4.6340466714580755,
557
+ "learning_rate": 4.139035550786495e-06,
558
+ "logits/chosen": -1.693902611732483,
559
+ "logits/rejected": -1.6580374240875244,
560
+ "logps/chosen": -437.0306091308594,
561
+ "logps/rejected": -501.24298095703125,
562
+ "loss": 0.5172,
563
+ "rewards/accuracies": 0.7124999761581421,
564
+ "rewards/chosen": -1.860239028930664,
565
+ "rewards/margins": 0.6861482858657837,
566
+ "rewards/rejected": -2.5463874340057373,
567
+ "step": 330
568
+ },
569
+ {
570
+ "epoch": 0.3559277676001047,
571
+ "grad_norm": 4.529361319396784,
572
+ "learning_rate": 4.068915207986931e-06,
573
+ "logits/chosen": -1.5517462491989136,
574
+ "logits/rejected": -1.497995376586914,
575
+ "logps/chosen": -427.9058532714844,
576
+ "logps/rejected": -476.1865234375,
577
+ "loss": 0.5503,
578
+ "rewards/accuracies": 0.699999988079071,
579
+ "rewards/chosen": -1.7529428005218506,
580
+ "rewards/margins": 0.7496469020843506,
581
+ "rewards/rejected": -2.502589702606201,
582
+ "step": 340
583
+ },
584
+ {
585
+ "epoch": 0.36639623135304894,
586
+ "grad_norm": 5.461159855322785,
587
+ "learning_rate": 3.996696580158211e-06,
588
+ "logits/chosen": -1.7160961627960205,
589
+ "logits/rejected": -1.64591383934021,
590
+ "logps/chosen": -440.3072204589844,
591
+ "logps/rejected": -496.3219299316406,
592
+ "loss": 0.5145,
593
+ "rewards/accuracies": 0.668749988079071,
594
+ "rewards/chosen": -1.8336540460586548,
595
+ "rewards/margins": 0.7150315046310425,
596
+ "rewards/rejected": -2.5486855506896973,
597
+ "step": 350
598
+ },
599
+ {
600
+ "epoch": 0.3768646951059932,
601
+ "grad_norm": 5.022405804138076,
602
+ "learning_rate": 3.922476253313921e-06,
603
+ "logits/chosen": -1.6719996929168701,
604
+ "logits/rejected": -1.6425079107284546,
605
+ "logps/chosen": -405.40142822265625,
606
+ "logps/rejected": -469.779052734375,
607
+ "loss": 0.4736,
608
+ "rewards/accuracies": 0.800000011920929,
609
+ "rewards/chosen": -1.4964035749435425,
610
+ "rewards/margins": 0.8178457021713257,
611
+ "rewards/rejected": -2.314249277114868,
612
+ "step": 360
613
+ },
614
+ {
615
+ "epoch": 0.38733315885893743,
616
+ "grad_norm": 4.765071999966862,
617
+ "learning_rate": 3.846353490562664e-06,
618
+ "logits/chosen": -1.754809021949768,
619
+ "logits/rejected": -1.7150640487670898,
620
+ "logps/chosen": -381.4289855957031,
621
+ "logps/rejected": -456.10675048828125,
622
+ "loss": 0.5203,
623
+ "rewards/accuracies": 0.7562500238418579,
624
+ "rewards/chosen": -1.3520570993423462,
625
+ "rewards/margins": 0.8618815541267395,
626
+ "rewards/rejected": -2.2139384746551514,
627
+ "step": 370
628
+ },
629
+ {
630
+ "epoch": 0.39780162261188173,
631
+ "grad_norm": 5.077040951702186,
632
+ "learning_rate": 3.768430099352445e-06,
633
+ "logits/chosen": -1.7217756509780884,
634
+ "logits/rejected": -1.67500901222229,
635
+ "logps/chosen": -441.7967834472656,
636
+ "logps/rejected": -520.3387451171875,
637
+ "loss": 0.5364,
638
+ "rewards/accuracies": 0.6937500238418579,
639
+ "rewards/chosen": -1.86903977394104,
640
+ "rewards/margins": 0.7624320983886719,
641
+ "rewards/rejected": -2.631471872329712,
642
+ "step": 380
643
+ },
644
+ {
645
+ "epoch": 0.408270086364826,
646
+ "grad_norm": 3.6370311779518514,
647
+ "learning_rate": 3.6888102953122307e-06,
648
+ "logits/chosen": -1.7671334743499756,
649
+ "logits/rejected": -1.7222200632095337,
650
+ "logps/chosen": -417.48699951171875,
651
+ "logps/rejected": -465.5294494628906,
652
+ "loss": 0.5336,
653
+ "rewards/accuracies": 0.6875,
654
+ "rewards/chosen": -1.583051323890686,
655
+ "rewards/margins": 0.6584590673446655,
656
+ "rewards/rejected": -2.2415101528167725,
657
+ "step": 390
658
+ },
659
+ {
660
+ "epoch": 0.4187385501177702,
661
+ "grad_norm": 3.8429456015253396,
662
+ "learning_rate": 3.607600562872785e-06,
663
+ "logits/chosen": -1.843812346458435,
664
+ "logits/rejected": -1.8115432262420654,
665
+ "logps/chosen": -375.5889587402344,
666
+ "logps/rejected": -427.83990478515625,
667
+ "loss": 0.5397,
668
+ "rewards/accuracies": 0.6937500238418579,
669
+ "rewards/chosen": -1.298298954963684,
670
+ "rewards/margins": 0.5158218145370483,
671
+ "rewards/rejected": -1.8141206502914429,
672
+ "step": 400
673
+ },
674
+ {
675
+ "epoch": 0.4187385501177702,
676
+ "eval_logits/chosen": -1.8144292831420898,
677
+ "eval_logits/rejected": -1.7585728168487549,
678
+ "eval_logps/chosen": -396.2536926269531,
679
+ "eval_logps/rejected": -446.0619201660156,
680
+ "eval_loss": 0.5252955555915833,
681
+ "eval_rewards/accuracies": 0.7698412537574768,
682
+ "eval_rewards/chosen": -1.2997959852218628,
683
+ "eval_rewards/margins": 0.6924758553504944,
684
+ "eval_rewards/rejected": -1.9922715425491333,
685
+ "eval_runtime": 277.0739,
686
+ "eval_samples_per_second": 7.218,
687
+ "eval_steps_per_second": 0.227,
688
+ "step": 400
689
+ },
690
+ {
691
+ "epoch": 0.42920701387071447,
692
+ "grad_norm": 4.321854918430961,
693
+ "learning_rate": 3.5249095128531863e-06,
694
+ "logits/chosen": -1.708707571029663,
695
+ "logits/rejected": -1.6425899267196655,
696
+ "logps/chosen": -434.7601623535156,
697
+ "logps/rejected": -490.0912170410156,
698
+ "loss": 0.5406,
699
+ "rewards/accuracies": 0.7875000238418579,
700
+ "rewards/chosen": -1.66852605342865,
701
+ "rewards/margins": 0.8226760625839233,
702
+ "rewards/rejected": -2.4912023544311523,
703
+ "step": 410
704
+ },
705
+ {
706
+ "epoch": 0.4396754776236587,
707
+ "grad_norm": 4.6429659320397185,
708
+ "learning_rate": 3.4408477372034743e-06,
709
+ "logits/chosen": -1.7646077871322632,
710
+ "logits/rejected": -1.6311779022216797,
711
+ "logps/chosen": -465.91851806640625,
712
+ "logps/rejected": -482.90313720703125,
713
+ "loss": 0.5232,
714
+ "rewards/accuracies": 0.706250011920929,
715
+ "rewards/chosen": -1.8112109899520874,
716
+ "rewards/margins": 0.7522093653678894,
717
+ "rewards/rejected": -2.5634207725524902,
718
+ "step": 420
719
+ },
720
+ {
721
+ "epoch": 0.45014394137660296,
722
+ "grad_norm": 4.073383292817544,
723
+ "learning_rate": 3.355527661097728e-06,
724
+ "logits/chosen": -1.7000205516815186,
725
+ "logits/rejected": -1.6602411270141602,
726
+ "logps/chosen": -406.20111083984375,
727
+ "logps/rejected": -491.0992126464844,
728
+ "loss": 0.5113,
729
+ "rewards/accuracies": 0.75,
730
+ "rewards/chosen": -1.771209478378296,
731
+ "rewards/margins": 0.8097410202026367,
732
+ "rewards/rejected": -2.5809504985809326,
733
+ "step": 430
734
+ },
735
+ {
736
+ "epoch": 0.46061240512954726,
737
+ "grad_norm": 6.363547319828774,
738
+ "learning_rate": 3.269063392575352e-06,
739
+ "logits/chosen": -1.6135189533233643,
740
+ "logits/rejected": -1.6108068227767944,
741
+ "logps/chosen": -434.0970764160156,
742
+ "logps/rejected": -488.9344787597656,
743
+ "loss": 0.5193,
744
+ "rewards/accuracies": 0.706250011920929,
745
+ "rewards/chosen": -1.8390247821807861,
746
+ "rewards/margins": 0.7370558977127075,
747
+ "rewards/rejected": -2.576080799102783,
748
+ "step": 440
749
+ },
750
+ {
751
+ "epoch": 0.4710808688824915,
752
+ "grad_norm": 5.120415362893221,
753
+ "learning_rate": 3.181570569931697e-06,
754
+ "logits/chosen": -1.699637770652771,
755
+ "logits/rejected": -1.6541831493377686,
756
+ "logps/chosen": -434.42010498046875,
757
+ "logps/rejected": -562.7218017578125,
758
+ "loss": 0.508,
759
+ "rewards/accuracies": 0.768750011920929,
760
+ "rewards/chosen": -1.8876171112060547,
761
+ "rewards/margins": 1.1184637546539307,
762
+ "rewards/rejected": -3.0060808658599854,
763
+ "step": 450
764
+ },
765
+ {
766
+ "epoch": 0.48154933263543576,
767
+ "grad_norm": 4.704501376897649,
768
+ "learning_rate": 3.09316620706208e-06,
769
+ "logits/chosen": -1.7134917974472046,
770
+ "logits/rejected": -1.6925232410430908,
771
+ "logps/chosen": -452.0691833496094,
772
+ "logps/rejected": -523.4850463867188,
773
+ "loss": 0.5335,
774
+ "rewards/accuracies": 0.731249988079071,
775
+ "rewards/chosen": -1.904476523399353,
776
+ "rewards/margins": 0.8279927372932434,
777
+ "rewards/rejected": -2.732469320297241,
778
+ "step": 460
779
+ },
780
+ {
781
+ "epoch": 0.49201779638838,
782
+ "grad_norm": 4.879700738283134,
783
+ "learning_rate": 3.0039685369660785e-06,
784
+ "logits/chosen": -1.7574955224990845,
785
+ "logits/rejected": -1.666815161705017,
786
+ "logps/chosen": -433.5028381347656,
787
+ "logps/rejected": -469.12158203125,
788
+ "loss": 0.4923,
789
+ "rewards/accuracies": 0.699999988079071,
790
+ "rewards/chosen": -1.7322635650634766,
791
+ "rewards/margins": 0.6690031290054321,
792
+ "rewards/rejected": -2.401266574859619,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 0.5024862601413242,
797
+ "grad_norm": 5.236704166005388,
798
+ "learning_rate": 2.91409685362137e-06,
799
+ "logits/chosen": -1.6420507431030273,
800
+ "logits/rejected": -1.551762342453003,
801
+ "logps/chosen": -483.396240234375,
802
+ "logps/rejected": -551.7774658203125,
803
+ "loss": 0.5032,
804
+ "rewards/accuracies": 0.800000011920929,
805
+ "rewards/chosen": -2.1131300926208496,
806
+ "rewards/margins": 0.9647353887557983,
807
+ "rewards/rejected": -3.0778656005859375,
808
+ "step": 480
809
+ },
810
+ {
811
+ "epoch": 0.5129547238942685,
812
+ "grad_norm": 4.9932627461086625,
813
+ "learning_rate": 2.8236713524386085e-06,
814
+ "logits/chosen": -1.7080411911010742,
815
+ "logits/rejected": -1.606384515762329,
816
+ "logps/chosen": -494.674560546875,
817
+ "logps/rejected": -555.2131958007812,
818
+ "loss": 0.497,
819
+ "rewards/accuracies": 0.7749999761581421,
820
+ "rewards/chosen": -2.097015619277954,
821
+ "rewards/margins": 0.9440878629684448,
822
+ "rewards/rejected": -3.0411033630371094,
823
+ "step": 490
824
+ },
825
+ {
826
+ "epoch": 0.5234231876472127,
827
+ "grad_norm": 4.897787536727827,
828
+ "learning_rate": 2.7328129695107205e-06,
829
+ "logits/chosen": -1.6698167324066162,
830
+ "logits/rejected": -1.6173324584960938,
831
+ "logps/chosen": -458.3126525878906,
832
+ "logps/rejected": -532.6166381835938,
833
+ "loss": 0.5003,
834
+ "rewards/accuracies": 0.7875000238418579,
835
+ "rewards/chosen": -1.9204323291778564,
836
+ "rewards/margins": 0.9951194524765015,
837
+ "rewards/rejected": -2.9155516624450684,
838
+ "step": 500
839
+ },
840
+ {
841
+ "epoch": 0.5234231876472127,
842
+ "eval_logits/chosen": -1.6681629419326782,
843
+ "eval_logits/rejected": -1.6049078702926636,
844
+ "eval_logps/chosen": -466.0908508300781,
845
+ "eval_logps/rejected": -538.906494140625,
846
+ "eval_loss": 0.5013344287872314,
847
+ "eval_rewards/accuracies": 0.7658730149269104,
848
+ "eval_rewards/chosen": -1.9981670379638672,
849
+ "eval_rewards/margins": 0.922550618648529,
850
+ "eval_rewards/rejected": -2.92071795463562,
851
+ "eval_runtime": 276.7176,
852
+ "eval_samples_per_second": 7.228,
853
+ "eval_steps_per_second": 0.228,
854
+ "step": 500
855
+ },
856
+ {
857
+ "epoch": 0.533891651400157,
858
+ "grad_norm": 4.97316270122908,
859
+ "learning_rate": 2.641643219871597e-06,
860
+ "logits/chosen": -1.7244354486465454,
861
+ "logits/rejected": -1.6957614421844482,
862
+ "logps/chosen": -451.49981689453125,
863
+ "logps/rejected": -516.1077270507812,
864
+ "loss": 0.4933,
865
+ "rewards/accuracies": 0.706250011920929,
866
+ "rewards/chosen": -1.911877989768982,
867
+ "rewards/margins": 0.8705556988716125,
868
+ "rewards/rejected": -2.7824337482452393,
869
+ "step": 510
870
+ },
871
+ {
872
+ "epoch": 0.5443601151531012,
873
+ "grad_norm": 5.8930442835078685,
874
+ "learning_rate": 2.5502840349805074e-06,
875
+ "logits/chosen": -1.6949068307876587,
876
+ "logits/rejected": -1.6189374923706055,
877
+ "logps/chosen": -499.230224609375,
878
+ "logps/rejected": -536.48583984375,
879
+ "loss": 0.5261,
880
+ "rewards/accuracies": 0.7437499761581421,
881
+ "rewards/chosen": -2.2237086296081543,
882
+ "rewards/margins": 0.827392578125,
883
+ "rewards/rejected": -3.0511016845703125,
884
+ "step": 520
885
+ },
886
+ {
887
+ "epoch": 0.5548285789060455,
888
+ "grad_norm": 5.076166821402116,
889
+ "learning_rate": 2.4588575996495797e-06,
890
+ "logits/chosen": -1.6722028255462646,
891
+ "logits/rejected": -1.606205940246582,
892
+ "logps/chosen": -441.3253479003906,
893
+ "logps/rejected": -515.5338134765625,
894
+ "loss": 0.4997,
895
+ "rewards/accuracies": 0.824999988079071,
896
+ "rewards/chosen": -1.6732591390609741,
897
+ "rewards/margins": 0.9954363703727722,
898
+ "rewards/rejected": -2.6686952114105225,
899
+ "step": 530
900
+ },
901
+ {
902
+ "epoch": 0.5652970426589898,
903
+ "grad_norm": 5.298568838364531,
904
+ "learning_rate": 2.367486188632446e-06,
905
+ "logits/chosen": -1.6246812343597412,
906
+ "logits/rejected": -1.564178705215454,
907
+ "logps/chosen": -443.343017578125,
908
+ "logps/rejected": -481.7394104003906,
909
+ "loss": 0.5271,
910
+ "rewards/accuracies": 0.7124999761581421,
911
+ "rewards/chosen": -1.873234510421753,
912
+ "rewards/margins": 0.8055590391159058,
913
+ "rewards/rejected": -2.6787936687469482,
914
+ "step": 540
915
+ },
916
+ {
917
+ "epoch": 0.575765506411934,
918
+ "grad_norm": 4.903563195070263,
919
+ "learning_rate": 2.276292003092593e-06,
920
+ "logits/chosen": -1.591386079788208,
921
+ "logits/rejected": -1.5199774503707886,
922
+ "logps/chosen": -461.0108337402344,
923
+ "logps/rejected": -490.60498046875,
924
+ "loss": 0.517,
925
+ "rewards/accuracies": 0.7124999761581421,
926
+ "rewards/chosen": -1.9846340417861938,
927
+ "rewards/margins": 0.6464044451713562,
928
+ "rewards/rejected": -2.6310389041900635,
929
+ "step": 550
930
+ },
931
+ {
932
+ "epoch": 0.5862339701648783,
933
+ "grad_norm": 6.151878065059773,
934
+ "learning_rate": 2.1853970071701415e-06,
935
+ "logits/chosen": -1.524752140045166,
936
+ "logits/rejected": -1.4780924320220947,
937
+ "logps/chosen": -458.1922912597656,
938
+ "logps/rejected": -509.72918701171875,
939
+ "loss": 0.5217,
940
+ "rewards/accuracies": 0.7562500238418579,
941
+ "rewards/chosen": -1.9151818752288818,
942
+ "rewards/margins": 0.8629971742630005,
943
+ "rewards/rejected": -2.7781789302825928,
944
+ "step": 560
945
+ },
946
+ {
947
+ "epoch": 0.5967024339178225,
948
+ "grad_norm": 5.220341094167877,
949
+ "learning_rate": 2.0949227648656194e-06,
950
+ "logits/chosen": -1.5827438831329346,
951
+ "logits/rejected": -1.5035909414291382,
952
+ "logps/chosen": -455.52197265625,
953
+ "logps/rejected": -528.5602416992188,
954
+ "loss": 0.5069,
955
+ "rewards/accuracies": 0.762499988079071,
956
+ "rewards/chosen": -1.9370582103729248,
957
+ "rewards/margins": 0.8514412641525269,
958
+ "rewards/rejected": -2.788499355316162,
959
+ "step": 570
960
+ },
961
+ {
962
+ "epoch": 0.6071708976707668,
963
+ "grad_norm": 5.5150025763626145,
964
+ "learning_rate": 2.00499027745888e-06,
965
+ "logits/chosen": -1.5897815227508545,
966
+ "logits/rejected": -1.5100150108337402,
967
+ "logps/chosen": -404.41741943359375,
968
+ "logps/rejected": -460.427001953125,
969
+ "loss": 0.5166,
970
+ "rewards/accuracies": 0.737500011920929,
971
+ "rewards/chosen": -1.6832536458969116,
972
+ "rewards/margins": 0.7988817095756531,
973
+ "rewards/rejected": -2.48213529586792,
974
+ "step": 580
975
+ },
976
+ {
977
+ "epoch": 0.6176393614237111,
978
+ "grad_norm": 4.9008082837332765,
979
+ "learning_rate": 1.915719821680624e-06,
980
+ "logits/chosen": -1.6151273250579834,
981
+ "logits/rejected": -1.5963796377182007,
982
+ "logps/chosen": -419.74725341796875,
983
+ "logps/rejected": -513.4180908203125,
984
+ "loss": 0.494,
985
+ "rewards/accuracies": 0.7562500238418579,
986
+ "rewards/chosen": -1.7207034826278687,
987
+ "rewards/margins": 0.8950328826904297,
988
+ "rewards/rejected": -2.615736246109009,
989
+ "step": 590
990
+ },
991
+ {
992
+ "epoch": 0.6281078251766553,
993
+ "grad_norm": 6.4962338338594385,
994
+ "learning_rate": 1.8272307888529276e-06,
995
+ "logits/chosen": -1.4385050535202026,
996
+ "logits/rejected": -1.3539499044418335,
997
+ "logps/chosen": -534.6634521484375,
998
+ "logps/rejected": -584.0506591796875,
999
+ "loss": 0.4835,
1000
+ "rewards/accuracies": 0.78125,
1001
+ "rewards/chosen": -2.359952449798584,
1002
+ "rewards/margins": 0.8336412310600281,
1003
+ "rewards/rejected": -3.193593740463257,
1004
+ "step": 600
1005
+ },
1006
+ {
1007
+ "epoch": 0.6281078251766553,
1008
+ "eval_logits/chosen": -1.4124971628189087,
1009
+ "eval_logits/rejected": -1.3416856527328491,
1010
+ "eval_logps/chosen": -523.2593383789062,
1011
+ "eval_logps/rejected": -598.5182495117188,
1012
+ "eval_loss": 0.5026703476905823,
1013
+ "eval_rewards/accuracies": 0.7559523582458496,
1014
+ "eval_rewards/chosen": -2.569852352142334,
1015
+ "eval_rewards/margins": 0.9469824433326721,
1016
+ "eval_rewards/rejected": -3.516834259033203,
1017
+ "eval_runtime": 276.7094,
1018
+ "eval_samples_per_second": 7.228,
1019
+ "eval_steps_per_second": 0.228,
1020
+ "step": 600
1021
+ },
1022
+ {
1023
+ "epoch": 0.6385762889295996,
1024
+ "grad_norm": 5.595781300544392,
1025
+ "learning_rate": 1.739641525213929e-06,
1026
+ "logits/chosen": -1.5086050033569336,
1027
+ "logits/rejected": -1.478257179260254,
1028
+ "logps/chosen": -520.480712890625,
1029
+ "logps/rejected": -586.702880859375,
1030
+ "loss": 0.5161,
1031
+ "rewards/accuracies": 0.7875000238418579,
1032
+ "rewards/chosen": -2.468291997909546,
1033
+ "rewards/margins": 1.007550835609436,
1034
+ "rewards/rejected": -3.4758429527282715,
1035
+ "step": 610
1036
+ },
1037
+ {
1038
+ "epoch": 0.6490447526825438,
1039
+ "grad_norm": 6.142511293341612,
1040
+ "learning_rate": 1.6530691736402317e-06,
1041
+ "logits/chosen": -1.6536248922348022,
1042
+ "logits/rejected": -1.5585447549819946,
1043
+ "logps/chosen": -506.4708557128906,
1044
+ "logps/rejected": -550.4331665039062,
1045
+ "loss": 0.4796,
1046
+ "rewards/accuracies": 0.8062499761581421,
1047
+ "rewards/chosen": -2.0505268573760986,
1048
+ "rewards/margins": 0.9777840375900269,
1049
+ "rewards/rejected": -3.028310775756836,
1050
+ "step": 620
1051
+ },
1052
+ {
1053
+ "epoch": 0.6595132164354881,
1054
+ "grad_norm": 7.522645087833883,
1055
+ "learning_rate": 1.5676295169786864e-06,
1056
+ "logits/chosen": -1.6819908618927002,
1057
+ "logits/rejected": -1.6001083850860596,
1058
+ "logps/chosen": -470.88494873046875,
1059
+ "logps/rejected": -511.28680419921875,
1060
+ "loss": 0.4863,
1061
+ "rewards/accuracies": 0.6937500238418579,
1062
+ "rewards/chosen": -1.9194360971450806,
1063
+ "rewards/margins": 0.8219398260116577,
1064
+ "rewards/rejected": -2.741375684738159,
1065
+ "step": 630
1066
+ },
1067
+ {
1068
+ "epoch": 0.6699816801884323,
1069
+ "grad_norm": 7.853260652377011,
1070
+ "learning_rate": 1.4834368231970922e-06,
1071
+ "logits/chosen": -1.564366340637207,
1072
+ "logits/rejected": -1.494086742401123,
1073
+ "logps/chosen": -437.3993225097656,
1074
+ "logps/rejected": -517.3038330078125,
1075
+ "loss": 0.4786,
1076
+ "rewards/accuracies": 0.8187500238418579,
1077
+ "rewards/chosen": -1.9992519617080688,
1078
+ "rewards/margins": 0.9533640742301941,
1079
+ "rewards/rejected": -2.952615976333618,
1080
+ "step": 640
1081
+ },
1082
+ {
1083
+ "epoch": 0.6804501439413766,
1084
+ "grad_norm": 5.803651393914848,
1085
+ "learning_rate": 1.4006036925609245e-06,
1086
+ "logits/chosen": -1.6179273128509521,
1087
+ "logits/rejected": -1.5984830856323242,
1088
+ "logps/chosen": -457.8565979003906,
1089
+ "logps/rejected": -540.9107666015625,
1090
+ "loss": 0.5039,
1091
+ "rewards/accuracies": 0.7250000238418579,
1092
+ "rewards/chosen": -1.9681861400604248,
1093
+ "rewards/margins": 0.8296257257461548,
1094
+ "rewards/rejected": -2.79781174659729,
1095
+ "step": 650
1096
+ },
1097
+ {
1098
+ "epoch": 0.6909186076943209,
1099
+ "grad_norm": 5.141240530191721,
1100
+ "learning_rate": 1.3192409070404582e-06,
1101
+ "logits/chosen": -1.680153489112854,
1102
+ "logits/rejected": -1.6061805486679077,
1103
+ "logps/chosen": -439.73272705078125,
1104
+ "logps/rejected": -508.1709899902344,
1105
+ "loss": 0.4981,
1106
+ "rewards/accuracies": 0.7437499761581421,
1107
+ "rewards/chosen": -1.8612697124481201,
1108
+ "rewards/margins": 0.8554447889328003,
1109
+ "rewards/rejected": -2.716714382171631,
1110
+ "step": 660
1111
+ },
1112
+ {
1113
+ "epoch": 0.7013870714472651,
1114
+ "grad_norm": 6.25377182984675,
1115
+ "learning_rate": 1.2394572821496953e-06,
1116
+ "logits/chosen": -1.5166045427322388,
1117
+ "logits/rejected": -1.4167017936706543,
1118
+ "logps/chosen": -456.9510192871094,
1119
+ "logps/rejected": -535.3768310546875,
1120
+ "loss": 0.4811,
1121
+ "rewards/accuracies": 0.7749999761581421,
1122
+ "rewards/chosen": -2.0087170600891113,
1123
+ "rewards/margins": 1.0361051559448242,
1124
+ "rewards/rejected": -3.0448222160339355,
1125
+ "step": 670
1126
+ },
1127
+ {
1128
+ "epoch": 0.7118555352002094,
1129
+ "grad_norm": 7.572206476060471,
1130
+ "learning_rate": 1.1613595214152713e-06,
1131
+ "logits/chosen": -1.5004642009735107,
1132
+ "logits/rejected": -1.4637364149093628,
1133
+ "logps/chosen": -480.53875732421875,
1134
+ "logps/rejected": -548.7276611328125,
1135
+ "loss": 0.5205,
1136
+ "rewards/accuracies": 0.762499988079071,
1137
+ "rewards/chosen": -2.131697416305542,
1138
+ "rewards/margins": 0.8276796340942383,
1139
+ "rewards/rejected": -2.9593770503997803,
1140
+ "step": 680
1141
+ },
1142
+ {
1143
+ "epoch": 0.7223239989531536,
1144
+ "grad_norm": 4.010350785789718,
1145
+ "learning_rate": 1.0850520736699362e-06,
1146
+ "logits/chosen": -1.588531255722046,
1147
+ "logits/rejected": -1.503158688545227,
1148
+ "logps/chosen": -480.64715576171875,
1149
+ "logps/rejected": -513.88232421875,
1150
+ "loss": 0.4544,
1151
+ "rewards/accuracies": 0.7562500238418579,
1152
+ "rewards/chosen": -2.0653069019317627,
1153
+ "rewards/margins": 0.8280428647994995,
1154
+ "rewards/rejected": -2.8933494091033936,
1155
+ "step": 690
1156
+ },
1157
+ {
1158
+ "epoch": 0.7327924627060979,
1159
+ "grad_norm": 5.214297382993777,
1160
+ "learning_rate": 1.0106369933615043e-06,
1161
+ "logits/chosen": -1.4556739330291748,
1162
+ "logits/rejected": -1.4368045330047607,
1163
+ "logps/chosen": -476.2962951660156,
1164
+ "logps/rejected": -560.508056640625,
1165
+ "loss": 0.4715,
1166
+ "rewards/accuracies": 0.768750011920929,
1167
+ "rewards/chosen": -2.2485156059265137,
1168
+ "rewards/margins": 0.9335169792175293,
1169
+ "rewards/rejected": -3.182032585144043,
1170
+ "step": 700
1171
+ },
1172
+ {
1173
+ "epoch": 0.7327924627060979,
1174
+ "eval_logits/chosen": -1.4479634761810303,
1175
+ "eval_logits/rejected": -1.378204107284546,
1176
+ "eval_logps/chosen": -485.28936767578125,
1177
+ "eval_logps/rejected": -566.1954956054688,
1178
+ "eval_loss": 0.4955996572971344,
1179
+ "eval_rewards/accuracies": 0.7678571343421936,
1180
+ "eval_rewards/chosen": -2.190152883529663,
1181
+ "eval_rewards/margins": 1.0034549236297607,
1182
+ "eval_rewards/rejected": -3.193607807159424,
1183
+ "eval_runtime": 277.1042,
1184
+ "eval_samples_per_second": 7.218,
1185
+ "eval_steps_per_second": 0.227,
1186
+ "step": 700
1187
+ },
1188
+ {
1189
+ "epoch": 0.7432609264590422,
1190
+ "grad_norm": 5.527625380225802,
1191
+ "learning_rate": 9.382138040640714e-07,
1192
+ "logits/chosen": -1.4906096458435059,
1193
+ "logits/rejected": -1.3870160579681396,
1194
+ "logps/chosen": -493.5198669433594,
1195
+ "logps/rejected": -554.8124389648438,
1196
+ "loss": 0.5528,
1197
+ "rewards/accuracies": 0.7437499761581421,
1198
+ "rewards/chosen": -2.260972499847412,
1199
+ "rewards/margins": 0.9154504537582397,
1200
+ "rewards/rejected": -3.176422595977783,
1201
+ "step": 710
1202
+ },
1203
+ {
1204
+ "epoch": 0.7537293902119864,
1205
+ "grad_norm": 4.7649826972694145,
1206
+ "learning_rate": 8.678793653740633e-07,
1207
+ "logits/chosen": -1.6233956813812256,
1208
+ "logits/rejected": -1.5308845043182373,
1209
+ "logps/chosen": -514.6124267578125,
1210
+ "logps/rejected": -569.4010620117188,
1211
+ "loss": 0.4833,
1212
+ "rewards/accuracies": 0.8187500238418579,
1213
+ "rewards/chosen": -1.9400179386138916,
1214
+ "rewards/margins": 1.038830280303955,
1215
+ "rewards/rejected": -2.9788479804992676,
1216
+ "step": 720
1217
+ },
1218
+ {
1219
+ "epoch": 0.7641978539649307,
1220
+ "grad_norm": 5.43475547987397,
1221
+ "learning_rate": 7.997277433690984e-07,
1222
+ "logits/chosen": -1.5152695178985596,
1223
+ "logits/rejected": -1.423516869544983,
1224
+ "logps/chosen": -439.678955078125,
1225
+ "logps/rejected": -520.2861938476562,
1226
+ "loss": 0.5025,
1227
+ "rewards/accuracies": 0.7875000238418579,
1228
+ "rewards/chosen": -2.0016751289367676,
1229
+ "rewards/margins": 0.9704031944274902,
1230
+ "rewards/rejected": -2.972078561782837,
1231
+ "step": 730
1232
+ },
1233
+ {
1234
+ "epoch": 0.7746663177178749,
1235
+ "grad_norm": 6.294313395553081,
1236
+ "learning_rate": 7.338500848029603e-07,
1237
+ "logits/chosen": -1.503118872642517,
1238
+ "logits/rejected": -1.4224625825881958,
1239
+ "logps/chosen": -494.43505859375,
1240
+ "logps/rejected": -529.97607421875,
1241
+ "loss": 0.5112,
1242
+ "rewards/accuracies": 0.75,
1243
+ "rewards/chosen": -2.1212410926818848,
1244
+ "rewards/margins": 0.9084514379501343,
1245
+ "rewards/rejected": -3.0296926498413086,
1246
+ "step": 740
1247
+ },
1248
+ {
1249
+ "epoch": 0.7851347814708192,
1250
+ "grad_norm": 4.456878260756767,
1251
+ "learning_rate": 6.70334495204884e-07,
1252
+ "logits/chosen": -1.3774914741516113,
1253
+ "logits/rejected": -1.3466551303863525,
1254
+ "logps/chosen": -453.56060791015625,
1255
+ "logps/rejected": -562.9486083984375,
1256
+ "loss": 0.4843,
1257
+ "rewards/accuracies": 0.7124999761581421,
1258
+ "rewards/chosen": -2.2453207969665527,
1259
+ "rewards/margins": 1.0070064067840576,
1260
+ "rewards/rejected": -3.2523269653320312,
1261
+ "step": 750
1262
+ },
1263
+ {
1264
+ "epoch": 0.7956032452237635,
1265
+ "grad_norm": 4.6479649573926425,
1266
+ "learning_rate": 6.092659210462232e-07,
1267
+ "logits/chosen": -1.4880906343460083,
1268
+ "logits/rejected": -1.3924858570098877,
1269
+ "logps/chosen": -477.60284423828125,
1270
+ "logps/rejected": -531.3121337890625,
1271
+ "loss": 0.508,
1272
+ "rewards/accuracies": 0.7437499761581421,
1273
+ "rewards/chosen": -2.1819820404052734,
1274
+ "rewards/margins": 0.885240912437439,
1275
+ "rewards/rejected": -3.067223072052002,
1276
+ "step": 760
1277
+ },
1278
+ {
1279
+ "epoch": 0.8060717089767077,
1280
+ "grad_norm": 3.847763381152239,
1281
+ "learning_rate": 5.507260361320738e-07,
1282
+ "logits/chosen": -1.5711116790771484,
1283
+ "logits/rejected": -1.5567753314971924,
1284
+ "logps/chosen": -490.78106689453125,
1285
+ "logps/rejected": -591.5490112304688,
1286
+ "loss": 0.4728,
1287
+ "rewards/accuracies": 0.7437499761581421,
1288
+ "rewards/chosen": -2.039038896560669,
1289
+ "rewards/margins": 0.9788064956665039,
1290
+ "rewards/rejected": -3.017845630645752,
1291
+ "step": 770
1292
+ },
1293
+ {
1294
+ "epoch": 0.816540172729652,
1295
+ "grad_norm": 6.007803688457658,
1296
+ "learning_rate": 4.947931323697983e-07,
1297
+ "logits/chosen": -1.5654757022857666,
1298
+ "logits/rejected": -1.4224697351455688,
1299
+ "logps/chosen": -507.5931091308594,
1300
+ "logps/rejected": -524.8499145507812,
1301
+ "loss": 0.4961,
1302
+ "rewards/accuracies": 0.6937500238418579,
1303
+ "rewards/chosen": -1.997415542602539,
1304
+ "rewards/margins": 0.7785658240318298,
1305
+ "rewards/rejected": -2.7759814262390137,
1306
+ "step": 780
1307
+ },
1308
+ {
1309
+ "epoch": 0.8270086364825961,
1310
+ "grad_norm": 5.439843899605622,
1311
+ "learning_rate": 4.4154201506053985e-07,
1312
+ "logits/chosen": -1.4940153360366821,
1313
+ "logits/rejected": -1.4301998615264893,
1314
+ "logps/chosen": -463.53424072265625,
1315
+ "logps/rejected": -545.1070556640625,
1316
+ "loss": 0.5221,
1317
+ "rewards/accuracies": 0.762499988079071,
1318
+ "rewards/chosen": -2.205313205718994,
1319
+ "rewards/margins": 0.8524004817008972,
1320
+ "rewards/rejected": -3.057713747024536,
1321
+ "step": 790
1322
+ },
1323
+ {
1324
+ "epoch": 0.8374771002355405,
1325
+ "grad_norm": 6.835477486779152,
1326
+ "learning_rate": 3.910439028537638e-07,
1327
+ "logits/chosen": -1.4495314359664917,
1328
+ "logits/rejected": -1.433584451675415,
1329
+ "logps/chosen": -439.5552673339844,
1330
+ "logps/rejected": -554.7761840820312,
1331
+ "loss": 0.4898,
1332
+ "rewards/accuracies": 0.793749988079071,
1333
+ "rewards/chosen": -2.071547031402588,
1334
+ "rewards/margins": 1.1167417764663696,
1335
+ "rewards/rejected": -3.188288927078247,
1336
+ "step": 800
1337
+ },
1338
+ {
1339
+ "epoch": 0.8374771002355405,
1340
+ "eval_logits/chosen": -1.4946216344833374,
1341
+ "eval_logits/rejected": -1.427457332611084,
1342
+ "eval_logps/chosen": -470.2820739746094,
1343
+ "eval_logps/rejected": -547.9973754882812,
1344
+ "eval_loss": 0.4948446750640869,
1345
+ "eval_rewards/accuracies": 0.7698412537574768,
1346
+ "eval_rewards/chosen": -2.0400798320770264,
1347
+ "eval_rewards/margins": 0.9715465903282166,
1348
+ "eval_rewards/rejected": -3.0116262435913086,
1349
+ "eval_runtime": 277.1008,
1350
+ "eval_samples_per_second": 7.218,
1351
+ "eval_steps_per_second": 0.227,
1352
+ "step": 800
1353
+ },
1354
+ {
1355
+ "epoch": 0.8479455639884846,
1356
+ "grad_norm": 4.180571004247573,
1357
+ "learning_rate": 3.4336633249862084e-07,
1358
+ "logits/chosen": -1.5085508823394775,
1359
+ "logits/rejected": -1.4109275341033936,
1360
+ "logps/chosen": -499.6455993652344,
1361
+ "logps/rejected": -552.9725341796875,
1362
+ "loss": 0.5021,
1363
+ "rewards/accuracies": 0.768750011920929,
1364
+ "rewards/chosen": -2.0836286544799805,
1365
+ "rewards/margins": 0.9168750643730164,
1366
+ "rewards/rejected": -3.0005037784576416,
1367
+ "step": 810
1368
+ },
1369
+ {
1370
+ "epoch": 0.8584140277414289,
1371
+ "grad_norm": 5.175742678334051,
1372
+ "learning_rate": 2.98573068519539e-07,
1373
+ "logits/chosen": -1.548295021057129,
1374
+ "logits/rejected": -1.5127002000808716,
1375
+ "logps/chosen": -480.94427490234375,
1376
+ "logps/rejected": -558.2870483398438,
1377
+ "loss": 0.5078,
1378
+ "rewards/accuracies": 0.737500011920929,
1379
+ "rewards/chosen": -2.1707310676574707,
1380
+ "rewards/margins": 0.8862228393554688,
1381
+ "rewards/rejected": -3.0569539070129395,
1382
+ "step": 820
1383
+ },
1384
+ {
1385
+ "epoch": 0.8688824914943732,
1386
+ "grad_norm": 5.516648759407251,
1387
+ "learning_rate": 2.5672401793681854e-07,
1388
+ "logits/chosen": -1.4796596765518188,
1389
+ "logits/rejected": -1.461284875869751,
1390
+ "logps/chosen": -461.5376892089844,
1391
+ "logps/rejected": -552.8976440429688,
1392
+ "loss": 0.5097,
1393
+ "rewards/accuracies": 0.731249988079071,
1394
+ "rewards/chosen": -2.1345925331115723,
1395
+ "rewards/margins": 0.9002341032028198,
1396
+ "rewards/rejected": -3.0348267555236816,
1397
+ "step": 830
1398
+ },
1399
+ {
1400
+ "epoch": 0.8793509552473174,
1401
+ "grad_norm": 4.741723004470474,
1402
+ "learning_rate": 2.178751501463036e-07,
1403
+ "logits/chosen": -1.502801537513733,
1404
+ "logits/rejected": -1.4475767612457275,
1405
+ "logps/chosen": -501.8017578125,
1406
+ "logps/rejected": -600.8604125976562,
1407
+ "loss": 0.4766,
1408
+ "rewards/accuracies": 0.731249988079071,
1409
+ "rewards/chosen": -2.2499661445617676,
1410
+ "rewards/margins": 1.028209924697876,
1411
+ "rewards/rejected": -3.2781760692596436,
1412
+ "step": 840
1413
+ },
1414
+ {
1415
+ "epoch": 0.8898194190002617,
1416
+ "grad_norm": 5.522067224662803,
1417
+ "learning_rate": 1.820784220652766e-07,
1418
+ "logits/chosen": -1.454413652420044,
1419
+ "logits/rejected": -1.3597214221954346,
1420
+ "logps/chosen": -481.33905029296875,
1421
+ "logps/rejected": -549.5355224609375,
1422
+ "loss": 0.4602,
1423
+ "rewards/accuracies": 0.8187500238418579,
1424
+ "rewards/chosen": -2.1423401832580566,
1425
+ "rewards/margins": 0.996941864490509,
1426
+ "rewards/rejected": -3.139281988143921,
1427
+ "step": 850
1428
+ },
1429
+ {
1430
+ "epoch": 0.9002878827532059,
1431
+ "grad_norm": 5.0423547871768575,
1432
+ "learning_rate": 1.4938170864468636e-07,
1433
+ "logits/chosen": -1.5176475048065186,
1434
+ "logits/rejected": -1.414750337600708,
1435
+ "logps/chosen": -488.75177001953125,
1436
+ "logps/rejected": -550.3502197265625,
1437
+ "loss": 0.4715,
1438
+ "rewards/accuracies": 0.7875000238418579,
1439
+ "rewards/chosen": -2.1736044883728027,
1440
+ "rewards/margins": 0.8499178886413574,
1441
+ "rewards/rejected": -3.0235226154327393,
1442
+ "step": 860
1443
+ },
1444
+ {
1445
+ "epoch": 0.9107563465061502,
1446
+ "grad_norm": 4.698698474333166,
1447
+ "learning_rate": 1.1982873884064466e-07,
1448
+ "logits/chosen": -1.5513103008270264,
1449
+ "logits/rejected": -1.4094650745391846,
1450
+ "logps/chosen": -506.3067321777344,
1451
+ "logps/rejected": -559.7911376953125,
1452
+ "loss": 0.4896,
1453
+ "rewards/accuracies": 0.706250011920929,
1454
+ "rewards/chosen": -2.2394676208496094,
1455
+ "rewards/margins": 1.060282588005066,
1456
+ "rewards/rejected": -3.2997500896453857,
1457
+ "step": 870
1458
+ },
1459
+ {
1460
+ "epoch": 0.9212248102590945,
1461
+ "grad_norm": 4.851477312614225,
1462
+ "learning_rate": 9.345903713082305e-08,
1463
+ "logits/chosen": -1.5655453205108643,
1464
+ "logits/rejected": -1.4765007495880127,
1465
+ "logps/chosen": -480.64605712890625,
1466
+ "logps/rejected": -537.4286499023438,
1467
+ "loss": 0.4762,
1468
+ "rewards/accuracies": 0.7437499761581421,
1469
+ "rewards/chosen": -2.0842466354370117,
1470
+ "rewards/margins": 0.9646340608596802,
1471
+ "rewards/rejected": -3.0488810539245605,
1472
+ "step": 880
1473
+ },
1474
+ {
1475
+ "epoch": 0.9316932740120387,
1476
+ "grad_norm": 5.226575451994075,
1477
+ "learning_rate": 7.030787065396866e-08,
1478
+ "logits/chosen": -1.4315617084503174,
1479
+ "logits/rejected": -1.373170256614685,
1480
+ "logps/chosen": -466.0337829589844,
1481
+ "logps/rejected": -583.4358520507812,
1482
+ "loss": 0.4857,
1483
+ "rewards/accuracies": 0.793749988079071,
1484
+ "rewards/chosen": -2.1631832122802734,
1485
+ "rewards/margins": 1.2467734813690186,
1486
+ "rewards/rejected": -3.409956455230713,
1487
+ "step": 890
1488
+ },
1489
+ {
1490
+ "epoch": 0.942161737764983,
1491
+ "grad_norm": 5.514107153250047,
1492
+ "learning_rate": 5.0406202043228604e-08,
1493
+ "logits/chosen": -1.3586971759796143,
1494
+ "logits/rejected": -1.3444792032241821,
1495
+ "logps/chosen": -478.5252380371094,
1496
+ "logps/rejected": -628.2623291015625,
1497
+ "loss": 0.4785,
1498
+ "rewards/accuracies": 0.768750011920929,
1499
+ "rewards/chosen": -2.112252712249756,
1500
+ "rewards/margins": 1.2148876190185547,
1501
+ "rewards/rejected": -3.3271403312683105,
1502
+ "step": 900
1503
+ },
1504
+ {
1505
+ "epoch": 0.942161737764983,
1506
+ "eval_logits/chosen": -1.4777917861938477,
1507
+ "eval_logits/rejected": -1.4105483293533325,
1508
+ "eval_logps/chosen": -483.4023742675781,
1509
+ "eval_logps/rejected": -564.8470458984375,
1510
+ "eval_loss": 0.493284672498703,
1511
+ "eval_rewards/accuracies": 0.773809552192688,
1512
+ "eval_rewards/chosen": -2.1712825298309326,
1513
+ "eval_rewards/margins": 1.0088402032852173,
1514
+ "eval_rewards/rejected": -3.1801228523254395,
1515
+ "eval_runtime": 276.8996,
1516
+ "eval_samples_per_second": 7.223,
1517
+ "eval_steps_per_second": 0.228,
1518
+ "step": 900
1519
+ },
1520
+ {
1521
+ "epoch": 0.9526302015179272,
1522
+ "grad_norm": 7.959876344290962,
1523
+ "learning_rate": 3.378064801637687e-08,
1524
+ "logits/chosen": -1.476231336593628,
1525
+ "logits/rejected": -1.382854700088501,
1526
+ "logps/chosen": -463.75653076171875,
1527
+ "logps/rejected": -513.1383666992188,
1528
+ "loss": 0.5014,
1529
+ "rewards/accuracies": 0.762499988079071,
1530
+ "rewards/chosen": -2.1452784538269043,
1531
+ "rewards/margins": 0.9462319612503052,
1532
+ "rewards/rejected": -3.09151029586792,
1533
+ "step": 910
1534
+ },
1535
+ {
1536
+ "epoch": 0.9630986652708715,
1537
+ "grad_norm": 10.786910308077847,
1538
+ "learning_rate": 2.0453443778310766e-08,
1539
+ "logits/chosen": -1.4654277563095093,
1540
+ "logits/rejected": -1.371619462966919,
1541
+ "logps/chosen": -502.68450927734375,
1542
+ "logps/rejected": -547.508544921875,
1543
+ "loss": 0.5041,
1544
+ "rewards/accuracies": 0.7250000238418579,
1545
+ "rewards/chosen": -2.2171082496643066,
1546
+ "rewards/margins": 0.9030927419662476,
1547
+ "rewards/rejected": -3.1202011108398438,
1548
+ "step": 920
1549
+ },
1550
+ {
1551
+ "epoch": 0.9735671290238157,
1552
+ "grad_norm": 5.431830769761952,
1553
+ "learning_rate": 1.0442413283435759e-08,
1554
+ "logits/chosen": -1.4298330545425415,
1555
+ "logits/rejected": -1.3073384761810303,
1556
+ "logps/chosen": -518.3793334960938,
1557
+ "logps/rejected": -556.9884033203125,
1558
+ "loss": 0.4886,
1559
+ "rewards/accuracies": 0.731249988079071,
1560
+ "rewards/chosen": -2.1807284355163574,
1561
+ "rewards/margins": 0.9955088496208191,
1562
+ "rewards/rejected": -3.176237106323242,
1563
+ "step": 930
1564
+ },
1565
+ {
1566
+ "epoch": 0.98403559277676,
1567
+ "grad_norm": 5.391800635976396,
1568
+ "learning_rate": 3.760945397705828e-09,
1569
+ "logits/chosen": -1.457722544670105,
1570
+ "logits/rejected": -1.4312618970870972,
1571
+ "logps/chosen": -479.5771484375,
1572
+ "logps/rejected": -562.9434814453125,
1573
+ "loss": 0.4851,
1574
+ "rewards/accuracies": 0.699999988079071,
1575
+ "rewards/chosen": -2.1574504375457764,
1576
+ "rewards/margins": 0.8255201578140259,
1577
+ "rewards/rejected": -2.982970714569092,
1578
+ "step": 940
1579
+ },
1580
+ {
1581
+ "epoch": 0.9945040565297043,
1582
+ "grad_norm": 5.016778527628419,
1583
+ "learning_rate": 4.1797599220405605e-10,
1584
+ "logits/chosen": -1.4823999404907227,
1585
+ "logits/rejected": -1.3737461566925049,
1586
+ "logps/chosen": -480.68951416015625,
1587
+ "logps/rejected": -539.4514770507812,
1588
+ "loss": 0.4874,
1589
+ "rewards/accuracies": 0.706250011920929,
1590
+ "rewards/chosen": -2.134705066680908,
1591
+ "rewards/margins": 0.8970800638198853,
1592
+ "rewards/rejected": -3.031785249710083,
1593
+ "step": 950
1594
+ },
1595
+ {
1596
+ "epoch": 0.9997382884061764,
1597
+ "step": 955,
1598
+ "total_flos": 0.0,
1599
+ "train_loss": 0.5333428270529702,
1600
+ "train_runtime": 30179.1373,
1601
+ "train_samples_per_second": 2.026,
1602
+ "train_steps_per_second": 0.032
1603
+ }
1604
+ ],
1605
+ "logging_steps": 10,
1606
+ "max_steps": 955,
1607
+ "num_input_tokens_seen": 0,
1608
+ "num_train_epochs": 1,
1609
+ "save_steps": 100,
1610
+ "stateful_callbacks": {
1611
+ "TrainerControl": {
1612
+ "args": {
1613
+ "should_epoch_stop": false,
1614
+ "should_evaluate": false,
1615
+ "should_log": false,
1616
+ "should_save": true,
1617
+ "should_training_stop": false
1618
+ },
1619
+ "attributes": {}
1620
+ }
1621
+ },
1622
+ "total_flos": 0.0,
1623
+ "train_batch_size": 4,
1624
+ "trial_name": null,
1625
+ "trial_params": null
1626
+ }