Kimory-X committed on
Commit
dcf27ca
·
verified ·
1 Parent(s): 3621fb1

Model save

Browse files
Files changed (4) hide show
  1. README.md +90 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +2577 -0
README.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: mistralai/Mistral-7B-v0.1
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ library_name: peft
9
+ model-index:
10
+ - name: zephyr-7b-nca_pair-qlora-lr5e6-beta0.1
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/1014579852qq-tsinghua-university/huggingface/runs/p6zmvlxg)
18
+ # zephyr-7b-nca_pair-qlora-lr5e6-beta0.1
19
+
20
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unknown dataset.
21
+ It achieves the following results on the evaluation set:
22
+ - Loss: 1.3409
23
+ - Rewards/chosen: 0.1433
24
+ - Rewards/rejected: -0.3268
25
+ - Rewards/accuracies: 0.7475
26
+ - Rewards/margins: 0.4701
27
+ - Logps/rejected: -257.3707
28
+ - Logps/chosen: -270.7436
29
+ - Logits/rejected: -2.2339
30
+ - Logits/chosen: -2.3137
31
+
32
+ ## Model description
33
+
34
+ More information needed
35
+
36
+ ## Intended uses & limitations
37
+
38
+ More information needed
39
+
40
+ ## Training and evaluation data
41
+
42
+ More information needed
43
+
44
+ ## Training procedure
45
+
46
+ ### Training hyperparameters
47
+
48
+ The following hyperparameters were used during training:
49
+ - learning_rate: 5e-06
50
+ - train_batch_size: 2
51
+ - eval_batch_size: 4
52
+ - seed: 42
53
+ - distributed_type: multi-GPU
54
+ - num_devices: 5
55
+ - gradient_accumulation_steps: 4
56
+ - total_train_batch_size: 40
57
+ - total_eval_batch_size: 20
58
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
59
+ - lr_scheduler_type: cosine
60
+ - lr_scheduler_warmup_ratio: 0.1
61
+ - num_epochs: 1
62
+
63
+ ### Training results
64
+
65
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
66
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
67
+ | 1.3652 | 0.0654 | 100 | 1.3585 | 0.0179 | -0.2204 | 0.7200 | 0.2383 | -256.3067 | -271.9973 | -2.2040 | -2.2872 |
68
+ | 1.3457 | 0.1308 | 200 | 1.3529 | 0.1799 | -0.2015 | 0.7425 | 0.3814 | -256.1176 | -270.3774 | -2.2080 | -2.2912 |
69
+ | 1.3328 | 0.1963 | 300 | 1.3500 | 0.1269 | -0.2919 | 0.7150 | 0.4188 | -257.0218 | -270.9071 | -2.2303 | -2.3106 |
70
+ | 1.3452 | 0.2617 | 400 | 1.3536 | 0.1854 | -0.2395 | 0.7200 | 0.4249 | -256.4976 | -270.3225 | -2.2250 | -2.3062 |
71
+ | 1.3446 | 0.3271 | 500 | 1.3501 | 0.0859 | -0.3936 | 0.7275 | 0.4795 | -258.0389 | -271.3175 | -2.1984 | -2.2818 |
72
+ | 1.333 | 0.3925 | 600 | 1.3496 | 0.0493 | -0.3851 | 0.7450 | 0.4344 | -257.9544 | -271.6837 | -2.2107 | -2.2937 |
73
+ | 1.3577 | 0.4580 | 700 | 1.3457 | 0.1306 | -0.2688 | 0.7175 | 0.3994 | -256.7908 | -270.8706 | -2.2100 | -2.2934 |
74
+ | 1.343 | 0.5234 | 800 | 1.3449 | 0.0814 | -0.3810 | 0.7150 | 0.4623 | -257.9127 | -271.3629 | -2.2312 | -2.3121 |
75
+ | 1.3439 | 0.5888 | 900 | 1.3459 | 0.0385 | -0.4054 | 0.7250 | 0.4439 | -258.1573 | -271.7917 | -2.2327 | -2.3137 |
76
+ | 1.3388 | 0.6542 | 1000 | 1.3442 | 0.2150 | -0.2625 | 0.7325 | 0.4775 | -256.7277 | -270.0262 | -2.2387 | -2.3183 |
77
+ | 1.3186 | 0.7197 | 1100 | 1.3423 | 0.1242 | -0.3587 | 0.7325 | 0.4829 | -257.6895 | -270.9345 | -2.2306 | -2.3107 |
78
+ | 1.3299 | 0.7851 | 1200 | 1.3417 | 0.1468 | -0.3270 | 0.7425 | 0.4737 | -257.3728 | -270.7089 | -2.2275 | -2.3078 |
79
+ | 1.3248 | 0.8505 | 1300 | 1.3413 | 0.1555 | -0.3132 | 0.7525 | 0.4687 | -257.2347 | -270.6216 | -2.2306 | -2.3105 |
80
+ | 1.3398 | 0.9159 | 1400 | 1.3414 | 0.1409 | -0.3251 | 0.7475 | 0.4660 | -257.3535 | -270.7675 | -2.2317 | -2.3117 |
81
+ | 1.325 | 0.9814 | 1500 | 1.3409 | 0.1433 | -0.3268 | 0.7475 | 0.4701 | -257.3707 | -270.7436 | -2.2339 | -2.3137 |
82
+
83
+
84
+ ### Framework versions
85
+
86
+ - PEFT 0.10.0
87
+ - Transformers 4.43.1
88
+ - Pytorch 2.1.2+cu121
89
+ - Datasets 2.18.0
90
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9996728819103696,
3
+ "total_flos": 0.0,
4
+ "train_loss": 1.343712306771603,
5
+ "train_runtime": 13958.9948,
6
+ "train_samples": 61134,
7
+ "train_samples_per_second": 4.38,
8
+ "train_steps_per_second": 0.109
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9996728819103696,
3
+ "total_flos": 0.0,
4
+ "train_loss": 1.343712306771603,
5
+ "train_runtime": 13958.9948,
6
+ "train_samples": 61134,
7
+ "train_samples_per_second": 4.38,
8
+ "train_steps_per_second": 0.109
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2577 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9996728819103696,
5
+ "eval_steps": 100,
6
+ "global_step": 1528,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0006542361792607131,
13
+ "grad_norm": 7.53125,
14
+ "learning_rate": 3.267973856209151e-08,
15
+ "logits/chosen": -2.690979480743408,
16
+ "logits/rejected": -2.4915528297424316,
17
+ "logps/chosen": -306.9772644042969,
18
+ "logps/rejected": -274.77850341796875,
19
+ "loss": 1.3863,
20
+ "rewards/accuracies": 0.0,
21
+ "rewards/chosen": 0.0,
22
+ "rewards/margins": 0.0,
23
+ "rewards/rejected": 0.0,
24
+ "step": 1
25
+ },
26
+ {
27
+ "epoch": 0.006542361792607131,
28
+ "grad_norm": 6.75,
29
+ "learning_rate": 3.267973856209151e-07,
30
+ "logits/chosen": -2.5390172004699707,
31
+ "logits/rejected": -2.4508614540100098,
32
+ "logps/chosen": -286.64813232421875,
33
+ "logps/rejected": -271.6336669921875,
34
+ "loss": 1.3872,
35
+ "rewards/accuracies": 0.4305555522441864,
36
+ "rewards/chosen": 0.002395547926425934,
37
+ "rewards/margins": 0.0021986099891364574,
38
+ "rewards/rejected": 0.0001969372679013759,
39
+ "step": 10
40
+ },
41
+ {
42
+ "epoch": 0.013084723585214262,
43
+ "grad_norm": 7.4375,
44
+ "learning_rate": 6.535947712418302e-07,
45
+ "logits/chosen": -2.479840040206909,
46
+ "logits/rejected": -2.3548035621643066,
47
+ "logps/chosen": -268.228759765625,
48
+ "logps/rejected": -197.14610290527344,
49
+ "loss": 1.3853,
50
+ "rewards/accuracies": 0.5249999761581421,
51
+ "rewards/chosen": 0.003464195877313614,
52
+ "rewards/margins": -0.0019345780601724982,
53
+ "rewards/rejected": 0.005398774053901434,
54
+ "step": 20
55
+ },
56
+ {
57
+ "epoch": 0.019627085377821395,
58
+ "grad_norm": 8.4375,
59
+ "learning_rate": 9.80392156862745e-07,
60
+ "logits/chosen": -2.4206929206848145,
61
+ "logits/rejected": -2.365497350692749,
62
+ "logps/chosen": -258.31414794921875,
63
+ "logps/rejected": -255.288818359375,
64
+ "loss": 1.3826,
65
+ "rewards/accuracies": 0.637499988079071,
66
+ "rewards/chosen": 0.025201931595802307,
67
+ "rewards/margins": 0.022868353873491287,
68
+ "rewards/rejected": 0.0023335753940045834,
69
+ "step": 30
70
+ },
71
+ {
72
+ "epoch": 0.026169447170428524,
73
+ "grad_norm": 7.09375,
74
+ "learning_rate": 1.3071895424836604e-06,
75
+ "logits/chosen": -2.370481252670288,
76
+ "logits/rejected": -2.3122944831848145,
77
+ "logps/chosen": -273.23687744140625,
78
+ "logps/rejected": -237.98342895507812,
79
+ "loss": 1.3774,
80
+ "rewards/accuracies": 0.6000000238418579,
81
+ "rewards/chosen": 0.017533209174871445,
82
+ "rewards/margins": 0.031263187527656555,
83
+ "rewards/rejected": -0.01372998021543026,
84
+ "step": 40
85
+ },
86
+ {
87
+ "epoch": 0.03271180896303565,
88
+ "grad_norm": 7.25,
89
+ "learning_rate": 1.6339869281045753e-06,
90
+ "logits/chosen": -2.4228413105010986,
91
+ "logits/rejected": -2.365399122238159,
92
+ "logps/chosen": -237.292724609375,
93
+ "logps/rejected": -250.16238403320312,
94
+ "loss": 1.3682,
95
+ "rewards/accuracies": 0.7250000238418579,
96
+ "rewards/chosen": 0.0359283983707428,
97
+ "rewards/margins": 0.08779667317867279,
98
+ "rewards/rejected": -0.0518682599067688,
99
+ "step": 50
100
+ },
101
+ {
102
+ "epoch": 0.03925417075564279,
103
+ "grad_norm": 6.84375,
104
+ "learning_rate": 1.96078431372549e-06,
105
+ "logits/chosen": -2.5553746223449707,
106
+ "logits/rejected": -2.249553680419922,
107
+ "logps/chosen": -291.7508239746094,
108
+ "logps/rejected": -209.22256469726562,
109
+ "loss": 1.3681,
110
+ "rewards/accuracies": 0.6875,
111
+ "rewards/chosen": 0.006251047365367413,
112
+ "rewards/margins": 0.1306430548429489,
113
+ "rewards/rejected": -0.12439201027154922,
114
+ "step": 60
115
+ },
116
+ {
117
+ "epoch": 0.04579653254824992,
118
+ "grad_norm": 7.15625,
119
+ "learning_rate": 2.2875816993464053e-06,
120
+ "logits/chosen": -2.4841151237487793,
121
+ "logits/rejected": -2.4571692943573,
122
+ "logps/chosen": -233.838623046875,
123
+ "logps/rejected": -244.6779022216797,
124
+ "loss": 1.3704,
125
+ "rewards/accuracies": 0.7124999761581421,
126
+ "rewards/chosen": 0.05333448573946953,
127
+ "rewards/margins": 0.15633264183998108,
128
+ "rewards/rejected": -0.10299815982580185,
129
+ "step": 70
130
+ },
131
+ {
132
+ "epoch": 0.05233889434085705,
133
+ "grad_norm": 7.40625,
134
+ "learning_rate": 2.6143790849673208e-06,
135
+ "logits/chosen": -2.369415760040283,
136
+ "logits/rejected": -2.2743725776672363,
137
+ "logps/chosen": -234.2888946533203,
138
+ "logps/rejected": -199.08114624023438,
139
+ "loss": 1.3623,
140
+ "rewards/accuracies": 0.637499988079071,
141
+ "rewards/chosen": 0.011479836888611317,
142
+ "rewards/margins": 0.14200374484062195,
143
+ "rewards/rejected": -0.1305239051580429,
144
+ "step": 80
145
+ },
146
+ {
147
+ "epoch": 0.058881256133464184,
148
+ "grad_norm": 8.25,
149
+ "learning_rate": 2.9411764705882355e-06,
150
+ "logits/chosen": -2.3975484371185303,
151
+ "logits/rejected": -2.3238232135772705,
152
+ "logps/chosen": -267.9956359863281,
153
+ "logps/rejected": -265.9745178222656,
154
+ "loss": 1.3549,
155
+ "rewards/accuracies": 0.762499988079071,
156
+ "rewards/chosen": 0.1009618490934372,
157
+ "rewards/margins": 0.2633386254310608,
158
+ "rewards/rejected": -0.16237673163414001,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.0654236179260713,
163
+ "grad_norm": 6.96875,
164
+ "learning_rate": 3.2679738562091506e-06,
165
+ "logits/chosen": -2.496324062347412,
166
+ "logits/rejected": -2.4217727184295654,
167
+ "logps/chosen": -258.4588317871094,
168
+ "logps/rejected": -248.0790557861328,
169
+ "loss": 1.3652,
170
+ "rewards/accuracies": 0.6499999761581421,
171
+ "rewards/chosen": 0.04371471330523491,
172
+ "rewards/margins": 0.1967853307723999,
173
+ "rewards/rejected": -0.1530705988407135,
174
+ "step": 100
175
+ },
176
+ {
177
+ "epoch": 0.0654236179260713,
178
+ "eval_logits/chosen": -2.2871594429016113,
179
+ "eval_logits/rejected": -2.2039709091186523,
180
+ "eval_logps/chosen": -271.9973449707031,
181
+ "eval_logps/rejected": -256.3067321777344,
182
+ "eval_loss": 1.3585007190704346,
183
+ "eval_rewards/accuracies": 0.7200000286102295,
184
+ "eval_rewards/chosen": 0.01791691593825817,
185
+ "eval_rewards/margins": 0.23829618096351624,
186
+ "eval_rewards/rejected": -0.2203792780637741,
187
+ "eval_runtime": 193.0612,
188
+ "eval_samples_per_second": 10.359,
189
+ "eval_steps_per_second": 0.518,
190
+ "step": 100
191
+ },
192
+ {
193
+ "epoch": 0.07196597971867845,
194
+ "grad_norm": 7.78125,
195
+ "learning_rate": 3.5947712418300657e-06,
196
+ "logits/chosen": -2.355022668838501,
197
+ "logits/rejected": -2.3289151191711426,
198
+ "logps/chosen": -282.1571960449219,
199
+ "logps/rejected": -228.05319213867188,
200
+ "loss": 1.3483,
201
+ "rewards/accuracies": 0.699999988079071,
202
+ "rewards/chosen": 0.0913485437631607,
203
+ "rewards/margins": 0.3209895193576813,
204
+ "rewards/rejected": -0.229640930891037,
205
+ "step": 110
206
+ },
207
+ {
208
+ "epoch": 0.07850834151128558,
209
+ "grad_norm": 8.4375,
210
+ "learning_rate": 3.92156862745098e-06,
211
+ "logits/chosen": -2.3910622596740723,
212
+ "logits/rejected": -2.2716870307922363,
213
+ "logps/chosen": -274.3619079589844,
214
+ "logps/rejected": -280.36614990234375,
215
+ "loss": 1.3505,
216
+ "rewards/accuracies": 0.800000011920929,
217
+ "rewards/chosen": 0.11332492530345917,
218
+ "rewards/margins": 0.39289209246635437,
219
+ "rewards/rejected": -0.279567152261734,
220
+ "step": 120
221
+ },
222
+ {
223
+ "epoch": 0.08505070330389271,
224
+ "grad_norm": 6.65625,
225
+ "learning_rate": 4.2483660130718954e-06,
226
+ "logits/chosen": -2.4673023223876953,
227
+ "logits/rejected": -2.3496053218841553,
228
+ "logps/chosen": -233.6593017578125,
229
+ "logps/rejected": -225.4088134765625,
230
+ "loss": 1.3577,
231
+ "rewards/accuracies": 0.7250000238418579,
232
+ "rewards/chosen": 0.04373621195554733,
233
+ "rewards/margins": 0.27300602197647095,
234
+ "rewards/rejected": -0.22926978766918182,
235
+ "step": 130
236
+ },
237
+ {
238
+ "epoch": 0.09159306509649984,
239
+ "grad_norm": 8.8125,
240
+ "learning_rate": 4.5751633986928105e-06,
241
+ "logits/chosen": -2.488830804824829,
242
+ "logits/rejected": -2.363779067993164,
243
+ "logps/chosen": -295.23980712890625,
244
+ "logps/rejected": -280.0679016113281,
245
+ "loss": 1.3551,
246
+ "rewards/accuracies": 0.6499999761581421,
247
+ "rewards/chosen": 0.03703129664063454,
248
+ "rewards/margins": 0.1885422170162201,
249
+ "rewards/rejected": -0.15151092410087585,
250
+ "step": 140
251
+ },
252
+ {
253
+ "epoch": 0.09813542688910697,
254
+ "grad_norm": 7.21875,
255
+ "learning_rate": 4.901960784313726e-06,
256
+ "logits/chosen": -2.4635255336761475,
257
+ "logits/rejected": -2.3997857570648193,
258
+ "logps/chosen": -271.8360900878906,
259
+ "logps/rejected": -239.2571563720703,
260
+ "loss": 1.3443,
261
+ "rewards/accuracies": 0.7749999761581421,
262
+ "rewards/chosen": 0.13082675635814667,
263
+ "rewards/margins": 0.38594740629196167,
264
+ "rewards/rejected": -0.2551206648349762,
265
+ "step": 150
266
+ },
267
+ {
268
+ "epoch": 0.1046777886817141,
269
+ "grad_norm": 8.1875,
270
+ "learning_rate": 4.999680264259825e-06,
271
+ "logits/chosen": -2.5628502368927,
272
+ "logits/rejected": -2.309502601623535,
273
+ "logps/chosen": -289.554931640625,
274
+ "logps/rejected": -228.1037139892578,
275
+ "loss": 1.3507,
276
+ "rewards/accuracies": 0.6875,
277
+ "rewards/chosen": 0.056355126202106476,
278
+ "rewards/margins": 0.30590537190437317,
279
+ "rewards/rejected": -0.2495502233505249,
280
+ "step": 160
281
+ },
282
+ {
283
+ "epoch": 0.11122015047432122,
284
+ "grad_norm": 24.125,
285
+ "learning_rate": 4.998114408534616e-06,
286
+ "logits/chosen": -2.459836483001709,
287
+ "logits/rejected": -2.241123676300049,
288
+ "logps/chosen": -269.2346496582031,
289
+ "logps/rejected": -245.09005737304688,
290
+ "loss": 1.3566,
291
+ "rewards/accuracies": 0.7250000238418579,
292
+ "rewards/chosen": 0.11544966697692871,
293
+ "rewards/margins": 0.3507261276245117,
294
+ "rewards/rejected": -0.2352764904499054,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 0.11776251226692837,
299
+ "grad_norm": 7.28125,
300
+ "learning_rate": 4.995244522215781e-06,
301
+ "logits/chosen": -2.479318857192993,
302
+ "logits/rejected": -2.452930450439453,
303
+ "logps/chosen": -253.64907836914062,
304
+ "logps/rejected": -255.054443359375,
305
+ "loss": 1.3515,
306
+ "rewards/accuracies": 0.6499999761581421,
307
+ "rewards/chosen": 0.0358736589550972,
308
+ "rewards/margins": 0.2104395180940628,
309
+ "rewards/rejected": -0.174565851688385,
310
+ "step": 180
311
+ },
312
+ {
313
+ "epoch": 0.1243048740595355,
314
+ "grad_norm": 6.78125,
315
+ "learning_rate": 4.9910721034010655e-06,
316
+ "logits/chosen": -2.4955029487609863,
317
+ "logits/rejected": -2.3607935905456543,
318
+ "logps/chosen": -259.48529052734375,
319
+ "logps/rejected": -232.8231964111328,
320
+ "loss": 1.3516,
321
+ "rewards/accuracies": 0.6499999761581421,
322
+ "rewards/chosen": 0.06281731277704239,
323
+ "rewards/margins": 0.3066301941871643,
324
+ "rewards/rejected": -0.24381284415721893,
325
+ "step": 190
326
+ },
327
+ {
328
+ "epoch": 0.1308472358521426,
329
+ "grad_norm": 7.90625,
330
+ "learning_rate": 4.985599330117931e-06,
331
+ "logits/chosen": -2.4151864051818848,
332
+ "logits/rejected": -2.3472375869750977,
333
+ "logps/chosen": -251.25210571289062,
334
+ "logps/rejected": -235.2193603515625,
335
+ "loss": 1.3457,
336
+ "rewards/accuracies": 0.7124999761581421,
337
+ "rewards/chosen": 0.0700979083776474,
338
+ "rewards/margins": 0.3274100422859192,
339
+ "rewards/rejected": -0.25731217861175537,
340
+ "step": 200
341
+ },
342
+ {
343
+ "epoch": 0.1308472358521426,
344
+ "eval_logits/chosen": -2.2911951541900635,
345
+ "eval_logits/rejected": -2.2080154418945312,
346
+ "eval_logps/chosen": -270.37738037109375,
347
+ "eval_logps/rejected": -256.1176452636719,
348
+ "eval_loss": 1.3529404401779175,
349
+ "eval_rewards/accuracies": 0.7425000071525574,
350
+ "eval_rewards/chosen": 0.17991353571414948,
351
+ "eval_rewards/margins": 0.3813881278038025,
352
+ "eval_rewards/rejected": -0.20147459208965302,
353
+ "eval_runtime": 192.5507,
354
+ "eval_samples_per_second": 10.387,
355
+ "eval_steps_per_second": 0.519,
356
+ "step": 200
357
+ },
358
+ {
359
+ "epoch": 0.13738959764474976,
360
+ "grad_norm": 6.9375,
361
+ "learning_rate": 4.978829059186611e-06,
362
+ "logits/chosen": -2.512892246246338,
363
+ "logits/rejected": -2.4710421562194824,
364
+ "logps/chosen": -278.5118713378906,
365
+ "logps/rejected": -281.03924560546875,
366
+ "loss": 1.3592,
367
+ "rewards/accuracies": 0.737500011920929,
368
+ "rewards/chosen": 0.01397608406841755,
369
+ "rewards/margins": 0.33375468850135803,
370
+ "rewards/rejected": -0.31977859139442444,
371
+ "step": 210
372
+ },
373
+ {
374
+ "epoch": 0.1439319594373569,
375
+ "grad_norm": 7.8125,
376
+ "learning_rate": 4.97076482472884e-06,
377
+ "logits/chosen": -2.3786368370056152,
378
+ "logits/rejected": -2.4024887084960938,
379
+ "logps/chosen": -307.94610595703125,
380
+ "logps/rejected": -310.7808532714844,
381
+ "loss": 1.3364,
382
+ "rewards/accuracies": 0.762499988079071,
383
+ "rewards/chosen": 0.07198480516672134,
384
+ "rewards/margins": 0.43232816457748413,
385
+ "rewards/rejected": -0.3603433668613434,
386
+ "step": 220
387
+ },
388
+ {
389
+ "epoch": 0.15047432122996401,
390
+ "grad_norm": 7.96875,
391
+ "learning_rate": 4.961410836323014e-06,
392
+ "logits/chosen": -2.5079922676086426,
393
+ "logits/rejected": -2.3904035091400146,
394
+ "logps/chosen": -292.0177917480469,
395
+ "logps/rejected": -243.524658203125,
396
+ "loss": 1.3466,
397
+ "rewards/accuracies": 0.7250000238418579,
398
+ "rewards/chosen": 0.0814410150051117,
399
+ "rewards/margins": 0.4043430685997009,
400
+ "rewards/rejected": -0.322902113199234,
401
+ "step": 230
402
+ },
403
+ {
404
+ "epoch": 0.15701668302257116,
405
+ "grad_norm": 6.75,
406
+ "learning_rate": 4.950771976806769e-06,
407
+ "logits/chosen": -2.643059730529785,
408
+ "logits/rejected": -2.447086811065674,
409
+ "logps/chosen": -279.7219543457031,
410
+ "logps/rejected": -228.05068969726562,
411
+ "loss": 1.3531,
412
+ "rewards/accuracies": 0.7749999761581421,
413
+ "rewards/chosen": 0.1486392319202423,
414
+ "rewards/margins": 0.4861406683921814,
415
+ "rewards/rejected": -0.3375014662742615,
416
+ "step": 240
417
+ },
418
+ {
419
+ "epoch": 0.16355904481517827,
420
+ "grad_norm": 7.25,
421
+ "learning_rate": 4.938853799728112e-06,
422
+ "logits/chosen": -2.533738613128662,
423
+ "logits/rejected": -2.347072124481201,
424
+ "logps/chosen": -291.59527587890625,
425
+ "logps/rejected": -224.52340698242188,
426
+ "loss": 1.3444,
427
+ "rewards/accuracies": 0.737500011920929,
428
+ "rewards/chosen": 0.14931416511535645,
429
+ "rewards/margins": 0.5157926082611084,
430
+ "rewards/rejected": -0.36647850275039673,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 0.17010140660778542,
435
+ "grad_norm": 7.96875,
436
+ "learning_rate": 4.925662526446431e-06,
437
+ "logits/chosen": -2.4311368465423584,
438
+ "logits/rejected": -2.3289899826049805,
439
+ "logps/chosen": -253.0860137939453,
440
+ "logps/rejected": -219.7169952392578,
441
+ "loss": 1.3418,
442
+ "rewards/accuracies": 0.6625000238418579,
443
+ "rewards/chosen": 0.03077099286019802,
444
+ "rewards/margins": 0.30891746282577515,
445
+ "rewards/rejected": -0.27814650535583496,
446
+ "step": 260
447
+ },
448
+ {
449
+ "epoch": 0.17664376840039253,
450
+ "grad_norm": 6.90625,
451
+ "learning_rate": 4.911205042884912e-06,
452
+ "logits/chosen": -2.497413158416748,
453
+ "logits/rejected": -2.3586812019348145,
454
+ "logps/chosen": -294.7901916503906,
455
+ "logps/rejected": -242.64291381835938,
456
+ "loss": 1.3551,
457
+ "rewards/accuracies": 0.625,
458
+ "rewards/chosen": 0.0127085717394948,
459
+ "rewards/margins": 0.21187984943389893,
460
+ "rewards/rejected": -0.1991712599992752,
461
+ "step": 270
462
+ },
463
+ {
464
+ "epoch": 0.18318613019299967,
465
+ "grad_norm": 7.15625,
466
+ "learning_rate": 4.895488895936047e-06,
467
+ "logits/chosen": -2.4523603916168213,
468
+ "logits/rejected": -2.4156365394592285,
469
+ "logps/chosen": -240.6490020751953,
470
+ "logps/rejected": -219.9276580810547,
471
+ "loss": 1.3329,
472
+ "rewards/accuracies": 0.762499988079071,
473
+ "rewards/chosen": 0.1077442541718483,
474
+ "rewards/margins": 0.4334333539009094,
475
+ "rewards/rejected": -0.32568907737731934,
476
+ "step": 280
477
+ },
478
+ {
479
+ "epoch": 0.18972849198560682,
480
+ "grad_norm": 10.0,
481
+ "learning_rate": 4.8785222895221075e-06,
482
+ "logits/chosen": -2.671842336654663,
483
+ "logits/rejected": -2.547651767730713,
484
+ "logps/chosen": -322.6619567871094,
485
+ "logps/rejected": -319.7884216308594,
486
+ "loss": 1.3476,
487
+ "rewards/accuracies": 0.6875,
488
+ "rewards/chosen": 0.07628868520259857,
489
+ "rewards/margins": 0.34415873885154724,
490
+ "rewards/rejected": -0.26787006855010986,
491
+ "step": 290
492
+ },
493
+ {
494
+ "epoch": 0.19627085377821393,
495
+ "grad_norm": 7.8125,
496
+ "learning_rate": 4.860314080312651e-06,
497
+ "logits/chosen": -2.5084714889526367,
498
+ "logits/rejected": -2.4003777503967285,
499
+ "logps/chosen": -274.4216003417969,
500
+ "logps/rejected": -254.49710083007812,
501
+ "loss": 1.3328,
502
+ "rewards/accuracies": 0.737500011920929,
503
+ "rewards/chosen": 0.07708136737346649,
504
+ "rewards/margins": 0.42221707105636597,
505
+ "rewards/rejected": -0.3451356589794159,
506
+ "step": 300
507
+ },
508
+ {
509
+ "epoch": 0.19627085377821393,
510
+ "eval_logits/chosen": -2.3106164932250977,
511
+ "eval_logits/rejected": -2.230290412902832,
512
+ "eval_logps/chosen": -270.9070739746094,
513
+ "eval_logps/rejected": -257.02178955078125,
514
+ "eval_loss": 1.3500434160232544,
515
+ "eval_rewards/accuracies": 0.7149999737739563,
516
+ "eval_rewards/chosen": 0.12694190442562103,
517
+ "eval_rewards/margins": 0.41882869601249695,
518
+ "eval_rewards/rejected": -0.2918868362903595,
519
+ "eval_runtime": 192.4649,
520
+ "eval_samples_per_second": 10.392,
521
+ "eval_steps_per_second": 0.52,
522
+ "step": 300
523
+ },
524
+ {
525
+ "epoch": 0.20281321557082108,
526
+ "grad_norm": 8.125,
527
+ "learning_rate": 4.840873773101287e-06,
528
+ "logits/chosen": -2.4888222217559814,
529
+ "logits/rejected": -2.4138476848602295,
530
+ "logps/chosen": -264.6638488769531,
531
+ "logps/rejected": -239.2753448486328,
532
+ "loss": 1.3448,
533
+ "rewards/accuracies": 0.7875000238418579,
534
+ "rewards/chosen": 0.10051695257425308,
535
+ "rewards/margins": 0.5004209280014038,
536
+ "rewards/rejected": -0.39990395307540894,
537
+ "step": 310
538
+ },
539
+ {
540
+ "epoch": 0.2093555773634282,
541
+ "grad_norm": 7.6875,
542
+ "learning_rate": 4.820211515844116e-06,
543
+ "logits/chosen": -2.500939130783081,
544
+ "logits/rejected": -2.4757304191589355,
545
+ "logps/chosen": -203.4287109375,
546
+ "logps/rejected": -218.0804901123047,
547
+ "loss": 1.3531,
548
+ "rewards/accuracies": 0.675000011920929,
549
+ "rewards/chosen": 0.10015901178121567,
550
+ "rewards/margins": 0.3148805797100067,
551
+ "rewards/rejected": -0.21472156047821045,
552
+ "step": 320
553
+ },
554
+ {
555
+ "epoch": 0.21589793915603533,
556
+ "grad_norm": 7.96875,
557
+ "learning_rate": 4.798338094362439e-06,
558
+ "logits/chosen": -2.477666139602661,
559
+ "logits/rejected": -2.5014617443084717,
560
+ "logps/chosen": -288.17950439453125,
561
+ "logps/rejected": -281.34295654296875,
562
+ "loss": 1.3666,
563
+ "rewards/accuracies": 0.675000011920929,
564
+ "rewards/chosen": 0.07362223416566849,
565
+ "rewards/margins": 0.274558961391449,
566
+ "rewards/rejected": -0.2009367197751999,
567
+ "step": 330
568
+ },
569
+ {
570
+ "epoch": 0.22244030094864245,
571
+ "grad_norm": 6.75,
572
+ "learning_rate": 4.775264926712489e-06,
573
+ "logits/chosen": -2.4466490745544434,
574
+ "logits/rejected": -2.4048807621002197,
575
+ "logps/chosen": -246.18392944335938,
576
+ "logps/rejected": -259.27392578125,
577
+ "loss": 1.3409,
578
+ "rewards/accuracies": 0.762499988079071,
579
+ "rewards/chosen": 0.11778600513935089,
580
+ "rewards/margins": 0.4433900713920593,
581
+ "rewards/rejected": -0.32560408115386963,
582
+ "step": 340
583
+ },
584
+ {
585
+ "epoch": 0.2289826627412496,
586
+ "grad_norm": 6.5,
587
+ "learning_rate": 4.751004057225147e-06,
588
+ "logits/chosen": -2.4351608753204346,
589
+ "logits/rejected": -2.3736894130706787,
590
+ "logps/chosen": -275.3951721191406,
591
+ "logps/rejected": -281.2572021484375,
592
+ "loss": 1.344,
593
+ "rewards/accuracies": 0.762499988079071,
594
+ "rewards/chosen": 0.12299034744501114,
595
+ "rewards/margins": 0.42316460609436035,
596
+ "rewards/rejected": -0.3001742362976074,
597
+ "step": 350
598
+ },
599
+ {
600
+ "epoch": 0.23552502453385674,
601
+ "grad_norm": 7.875,
602
+ "learning_rate": 4.725568150218719e-06,
603
+ "logits/chosen": -2.353898763656616,
604
+ "logits/rejected": -2.3495678901672363,
605
+ "logps/chosen": -247.7540283203125,
606
+ "logps/rejected": -262.0528564453125,
607
+ "loss": 1.339,
608
+ "rewards/accuracies": 0.7749999761581421,
609
+ "rewards/chosen": 0.10569906234741211,
610
+ "rewards/margins": 0.41347068548202515,
611
+ "rewards/rejected": -0.3077716529369354,
612
+ "step": 360
613
+ },
614
+ {
615
+ "epoch": 0.24206738632646385,
616
+ "grad_norm": 8.6875,
617
+ "learning_rate": 4.6989704833880936e-06,
618
+ "logits/chosen": -2.455857515335083,
619
+ "logits/rejected": -2.2684264183044434,
620
+ "logps/chosen": -278.91278076171875,
621
+ "logps/rejected": -248.13729858398438,
622
+ "loss": 1.3587,
623
+ "rewards/accuracies": 0.637499988079071,
624
+ "rewards/chosen": 0.05706097558140755,
625
+ "rewards/margins": 0.2867721915245056,
626
+ "rewards/rejected": -0.22971120476722717,
627
+ "step": 370
628
+ },
629
+ {
630
+ "epoch": 0.248609748119071,
631
+ "grad_norm": 7.5,
632
+ "learning_rate": 4.671224940873704e-06,
633
+ "logits/chosen": -2.5477356910705566,
634
+ "logits/rejected": -2.4833083152770996,
635
+ "logps/chosen": -284.73394775390625,
636
+ "logps/rejected": -268.12493896484375,
637
+ "loss": 1.3407,
638
+ "rewards/accuracies": 0.762499988079071,
639
+ "rewards/chosen": 0.14602813124656677,
640
+ "rewards/margins": 0.4975086748600006,
641
+ "rewards/rejected": -0.35148054361343384,
642
+ "step": 380
643
+ },
644
+ {
645
+ "epoch": 0.25515210991167814,
646
+ "grad_norm": 8.0625,
647
+ "learning_rate": 4.642346006013925e-06,
648
+ "logits/chosen": -2.563493251800537,
649
+ "logits/rejected": -2.495145320892334,
650
+ "logps/chosen": -258.86444091796875,
651
+ "logps/rejected": -258.3479309082031,
652
+ "loss": 1.3374,
653
+ "rewards/accuracies": 0.7124999761581421,
654
+ "rewards/chosen": 0.12897971272468567,
655
+ "rewards/margins": 0.4424809515476227,
656
+ "rewards/rejected": -0.3135012090206146,
657
+ "step": 390
658
+ },
659
+ {
660
+ "epoch": 0.2616944717042852,
661
+ "grad_norm": 7.6875,
662
+ "learning_rate": 4.612348753784682e-06,
663
+ "logits/chosen": -2.5260744094848633,
664
+ "logits/rejected": -2.3809266090393066,
665
+ "logps/chosen": -296.6476745605469,
666
+ "logps/rejected": -242.137451171875,
667
+ "loss": 1.3452,
668
+ "rewards/accuracies": 0.699999988079071,
669
+ "rewards/chosen": 0.08820326626300812,
670
+ "rewards/margins": 0.4365547299385071,
671
+ "rewards/rejected": -0.34835144877433777,
672
+ "step": 400
673
+ },
674
+ {
675
+ "epoch": 0.2616944717042852,
676
+ "eval_logits/chosen": -2.3061885833740234,
677
+ "eval_logits/rejected": -2.224980115890503,
678
+ "eval_logps/chosen": -270.3225402832031,
679
+ "eval_logps/rejected": -256.4975891113281,
680
+ "eval_loss": 1.3535617589950562,
681
+ "eval_rewards/accuracies": 0.7200000286102295,
682
+ "eval_rewards/chosen": 0.185396209359169,
683
+ "eval_rewards/margins": 0.42486390471458435,
684
+ "eval_rewards/rejected": -0.23946763575077057,
685
+ "eval_runtime": 192.5082,
686
+ "eval_samples_per_second": 10.389,
687
+ "eval_steps_per_second": 0.519,
688
+ "step": 400
689
+ },
690
+ {
691
+ "epoch": 0.26823683349689237,
692
+ "grad_norm": 7.71875,
693
+ "learning_rate": 4.5812488429302245e-06,
694
+ "logits/chosen": -2.4703800678253174,
695
+ "logits/rejected": -2.384880542755127,
696
+ "logps/chosen": -242.77249145507812,
697
+ "logps/rejected": -216.26919555664062,
698
+ "loss": 1.3521,
699
+ "rewards/accuracies": 0.675000011920929,
700
+ "rewards/chosen": 0.08403290808200836,
701
+ "rewards/margins": 0.23067112267017365,
702
+ "rewards/rejected": -0.14663821458816528,
703
+ "step": 410
704
+ },
705
+ {
706
+ "epoch": 0.2747791952894995,
707
+ "grad_norm": 7.625,
708
+ "learning_rate": 4.54906250778917e-06,
709
+ "logits/chosen": -2.494356632232666,
710
+ "logits/rejected": -2.4741995334625244,
711
+ "logps/chosen": -307.1471252441406,
712
+ "logps/rejected": -280.8088073730469,
713
+ "loss": 1.3253,
714
+ "rewards/accuracies": 0.675000011920929,
715
+ "rewards/chosen": 0.11699722707271576,
716
+ "rewards/margins": 0.491929829120636,
717
+ "rewards/rejected": -0.37493258714675903,
718
+ "step": 420
719
+ },
720
+ {
721
+ "epoch": 0.28132155708210665,
722
+ "grad_norm": 6.53125,
723
+ "learning_rate": 4.515806549820084e-06,
724
+ "logits/chosen": -2.4651083946228027,
725
+ "logits/rejected": -2.3330154418945312,
726
+ "logps/chosen": -281.57110595703125,
727
+ "logps/rejected": -254.07498168945312,
728
+ "loss": 1.3406,
729
+ "rewards/accuracies": 0.7124999761581421,
730
+ "rewards/chosen": 0.16895495355129242,
731
+ "rewards/margins": 0.4621726870536804,
732
+ "rewards/rejected": -0.2932177186012268,
733
+ "step": 430
734
+ },
735
+ {
736
+ "epoch": 0.2878639188747138,
737
+ "grad_norm": 8.875,
738
+ "learning_rate": 4.48149832883101e-06,
739
+ "logits/chosen": -2.4727940559387207,
740
+ "logits/rejected": -2.314194917678833,
741
+ "logps/chosen": -266.56134033203125,
742
+ "logps/rejected": -236.39089965820312,
743
+ "loss": 1.3585,
744
+ "rewards/accuracies": 0.6000000238418579,
745
+ "rewards/chosen": -0.08974994719028473,
746
+ "rewards/margins": 0.26218581199645996,
747
+ "rewards/rejected": -0.3519357144832611,
748
+ "step": 440
749
+ },
750
+ {
751
+ "epoch": 0.2944062806673209,
752
+ "grad_norm": 9.125,
753
+ "learning_rate": 4.446155753917559e-06,
754
+ "logits/chosen": -2.4373867511749268,
755
+ "logits/rejected": -2.3501367568969727,
756
+ "logps/chosen": -249.449462890625,
757
+ "logps/rejected": -280.8139343261719,
758
+ "loss": 1.355,
759
+ "rewards/accuracies": 0.7250000238418579,
760
+ "rewards/chosen": 0.09939811378717422,
761
+ "rewards/margins": 0.3732451796531677,
762
+ "rewards/rejected": -0.2738470435142517,
763
+ "step": 450
764
+ },
765
+ {
766
+ "epoch": 0.30094864245992803,
767
+ "grad_norm": 7.375,
768
+ "learning_rate": 4.409797274114245e-06,
769
+ "logits/chosen": -2.5222246646881104,
770
+ "logits/rejected": -2.3621227741241455,
771
+ "logps/chosen": -293.97747802734375,
772
+ "logps/rejected": -264.7182312011719,
773
+ "loss": 1.3389,
774
+ "rewards/accuracies": 0.75,
775
+ "rewards/chosen": 0.11550019681453705,
776
+ "rewards/margins": 0.4369579255580902,
777
+ "rewards/rejected": -0.32145771384239197,
778
+ "step": 460
779
+ },
780
+ {
781
+ "epoch": 0.30749100425253517,
782
+ "grad_norm": 8.0,
783
+ "learning_rate": 4.372441868763981e-06,
784
+ "logits/chosen": -2.4279232025146484,
785
+ "logits/rejected": -2.3788094520568848,
786
+ "logps/chosen": -269.70782470703125,
787
+ "logps/rejected": -271.08380126953125,
788
+ "loss": 1.3499,
789
+ "rewards/accuracies": 0.737500011920929,
790
+ "rewards/chosen": 0.1701391041278839,
791
+ "rewards/margins": 0.48491889238357544,
792
+ "rewards/rejected": -0.31477978825569153,
793
+ "step": 470
794
+ },
795
+ {
796
+ "epoch": 0.3140333660451423,
797
+ "grad_norm": 6.8125,
798
+ "learning_rate": 4.334109037610757e-06,
799
+ "logits/chosen": -2.47053599357605,
800
+ "logits/rejected": -2.4681105613708496,
801
+ "logps/chosen": -248.77597045898438,
802
+ "logps/rejected": -264.0259704589844,
803
+ "loss": 1.3588,
804
+ "rewards/accuracies": 0.7124999761581421,
805
+ "rewards/chosen": 0.08740128576755524,
806
+ "rewards/margins": 0.4287898540496826,
807
+ "rewards/rejected": -0.3413885235786438,
808
+ "step": 480
809
+ },
810
+ {
811
+ "epoch": 0.3205757278377494,
812
+ "grad_norm": 9.0625,
813
+ "learning_rate": 4.294818790620644e-06,
814
+ "logits/chosen": -2.388354539871216,
815
+ "logits/rejected": -2.2699804306030273,
816
+ "logps/chosen": -230.1956329345703,
817
+ "logps/rejected": -222.83560180664062,
818
+ "loss": 1.3372,
819
+ "rewards/accuracies": 0.7250000238418579,
820
+ "rewards/chosen": 0.11192689836025238,
821
+ "rewards/margins": 0.4855028986930847,
822
+ "rewards/rejected": -0.37357598543167114,
823
+ "step": 490
824
+ },
825
+ {
826
+ "epoch": 0.32711808963035655,
827
+ "grad_norm": 6.46875,
828
+ "learning_rate": 4.2545916375364835e-06,
829
+ "logits/chosen": -2.4573752880096436,
830
+ "logits/rejected": -2.384742259979248,
831
+ "logps/chosen": -253.0285186767578,
832
+ "logps/rejected": -235.6699676513672,
833
+ "loss": 1.3446,
834
+ "rewards/accuracies": 0.7250000238418579,
835
+ "rewards/chosen": 0.12988470494747162,
836
+ "rewards/margins": 0.4586367607116699,
837
+ "rewards/rejected": -0.3287521004676819,
838
+ "step": 500
839
+ },
840
+ {
841
+ "epoch": 0.32711808963035655,
842
+ "eval_logits/chosen": -2.2817647457122803,
843
+ "eval_logits/rejected": -2.1983530521392822,
844
+ "eval_logps/chosen": -271.3174743652344,
845
+ "eval_logps/rejected": -258.03887939453125,
846
+ "eval_loss": 1.3500897884368896,
847
+ "eval_rewards/accuracies": 0.7275000214576721,
848
+ "eval_rewards/chosen": 0.08590395003557205,
849
+ "eval_rewards/margins": 0.47949859499931335,
850
+ "eval_rewards/rejected": -0.3935946226119995,
851
+ "eval_runtime": 192.5361,
852
+ "eval_samples_per_second": 10.388,
853
+ "eval_steps_per_second": 0.519,
854
+ "step": 500
855
+ },
856
+ {
857
+ "epoch": 0.3336604514229637,
858
+ "grad_norm": 6.875,
859
+ "learning_rate": 4.213448577171676e-06,
860
+ "logits/chosen": -2.4477853775024414,
861
+ "logits/rejected": -2.2752411365509033,
862
+ "logps/chosen": -286.5871887207031,
863
+ "logps/rejected": -277.1395568847656,
864
+ "loss": 1.3551,
865
+ "rewards/accuracies": 0.699999988079071,
866
+ "rewards/chosen": 0.09675732254981995,
867
+ "rewards/margins": 0.36776870489120483,
868
+ "rewards/rejected": -0.2710114121437073,
869
+ "step": 510
870
+ },
871
+ {
872
+ "epoch": 0.34020281321557083,
873
+ "grad_norm": 6.71875,
874
+ "learning_rate": 4.171411086448674e-06,
875
+ "logits/chosen": -2.4915995597839355,
876
+ "logits/rejected": -2.40187668800354,
877
+ "logps/chosen": -264.9870910644531,
878
+ "logps/rejected": -248.1427001953125,
879
+ "loss": 1.3515,
880
+ "rewards/accuracies": 0.7124999761581421,
881
+ "rewards/chosen": 0.009617133066058159,
882
+ "rewards/margins": 0.3053894639015198,
883
+ "rewards/rejected": -0.29577234387397766,
884
+ "step": 520
885
+ },
886
+ {
887
+ "epoch": 0.346745175008178,
888
+ "grad_norm": 6.84375,
889
+ "learning_rate": 4.128501109187903e-06,
890
+ "logits/chosen": -2.5452780723571777,
891
+ "logits/rejected": -2.388065814971924,
892
+ "logps/chosen": -256.1615905761719,
893
+ "logps/rejected": -243.15542602539062,
894
+ "loss": 1.3448,
895
+ "rewards/accuracies": 0.737500011920929,
896
+ "rewards/chosen": 0.09919309616088867,
897
+ "rewards/margins": 0.45780619978904724,
898
+ "rewards/rejected": -0.35861313343048096,
899
+ "step": 530
900
+ },
901
+ {
902
+ "epoch": 0.35328753680078506,
903
+ "grad_norm": 8.0,
904
+ "learning_rate": 4.084741044652956e-06,
905
+ "logits/chosen": -2.5606534481048584,
906
+ "logits/rejected": -2.3131465911865234,
907
+ "logps/chosen": -287.9478759765625,
908
+ "logps/rejected": -224.75527954101562,
909
+ "loss": 1.3668,
910
+ "rewards/accuracies": 0.737500011920929,
911
+ "rewards/chosen": 0.17254912853240967,
912
+ "rewards/margins": 0.42019981145858765,
913
+ "rewards/rejected": -0.24765071272850037,
914
+ "step": 540
915
+ },
916
+ {
917
+ "epoch": 0.3598298985933922,
918
+ "grad_norm": 6.90625,
919
+ "learning_rate": 4.040153735858041e-06,
920
+ "logits/chosen": -2.4491143226623535,
921
+ "logits/rejected": -2.3129725456237793,
922
+ "logps/chosen": -274.43658447265625,
923
+ "logps/rejected": -268.6435241699219,
924
+ "loss": 1.3476,
925
+ "rewards/accuracies": 0.75,
926
+ "rewards/chosen": 0.14739489555358887,
927
+ "rewards/margins": 0.41326436400413513,
928
+ "rewards/rejected": -0.26586946845054626,
929
+ "step": 550
930
+ },
931
+ {
932
+ "epoch": 0.36637226038599935,
933
+ "grad_norm": 6.84375,
934
+ "learning_rate": 3.9947624576437975e-06,
935
+ "logits/chosen": -2.47575306892395,
936
+ "logits/rejected": -2.3158023357391357,
937
+ "logps/chosen": -230.30819702148438,
938
+ "logps/rejected": -211.89633178710938,
939
+ "loss": 1.3509,
940
+ "rewards/accuracies": 0.7124999761581421,
941
+ "rewards/chosen": 0.07401107251644135,
942
+ "rewards/margins": 0.36348801851272583,
943
+ "rewards/rejected": -0.2894769012928009,
944
+ "step": 560
945
+ },
946
+ {
947
+ "epoch": 0.3729146221786065,
948
+ "grad_norm": 6.90625,
949
+ "learning_rate": 3.948590904527689e-06,
950
+ "logits/chosen": -2.395169496536255,
951
+ "logits/rejected": -2.330249309539795,
952
+ "logps/chosen": -243.09487915039062,
953
+ "logps/rejected": -263.39227294921875,
954
+ "loss": 1.3349,
955
+ "rewards/accuracies": 0.7250000238418579,
956
+ "rewards/chosen": -0.01647457666695118,
957
+ "rewards/margins": 0.5050023794174194,
958
+ "rewards/rejected": -0.5214769244194031,
959
+ "step": 570
960
+ },
961
+ {
962
+ "epoch": 0.37945698397121363,
963
+ "grad_norm": 6.46875,
964
+ "learning_rate": 3.901663178335318e-06,
965
+ "logits/chosen": -2.5840940475463867,
966
+ "logits/rejected": -2.4302070140838623,
967
+ "logps/chosen": -292.9772644042969,
968
+ "logps/rejected": -280.25726318359375,
969
+ "loss": 1.3405,
970
+ "rewards/accuracies": 0.675000011920929,
971
+ "rewards/chosen": 0.11058641970157623,
972
+ "rewards/margins": 0.506061851978302,
973
+ "rewards/rejected": -0.39547544717788696,
974
+ "step": 580
975
+ },
976
+ {
977
+ "epoch": 0.3859993457638207,
978
+ "grad_norm": 7.59375,
979
+ "learning_rate": 3.854003775619142e-06,
980
+ "logits/chosen": -2.3781380653381348,
981
+ "logits/rejected": -2.372018337249756,
982
+ "logps/chosen": -263.7474365234375,
983
+ "logps/rejected": -244.7537384033203,
984
+ "loss": 1.3416,
985
+ "rewards/accuracies": 0.7250000238418579,
986
+ "rewards/chosen": 0.06603378802537918,
987
+ "rewards/margins": 0.4166053831577301,
988
+ "rewards/rejected": -0.3505716323852539,
989
+ "step": 590
990
+ },
991
+ {
992
+ "epoch": 0.39254170755642787,
993
+ "grad_norm": 8.0625,
994
+ "learning_rate": 3.805637574871115e-06,
995
+ "logits/chosen": -2.455042600631714,
996
+ "logits/rejected": -2.3687422275543213,
997
+ "logps/chosen": -266.13043212890625,
998
+ "logps/rejected": -225.39614868164062,
999
+ "loss": 1.333,
1000
+ "rewards/accuracies": 0.75,
1001
+ "rewards/chosen": 0.10043742507696152,
1002
+ "rewards/margins": 0.5269637703895569,
1003
+ "rewards/rejected": -0.4265263080596924,
1004
+ "step": 600
1005
+ },
1006
+ {
1007
+ "epoch": 0.39254170755642787,
1008
+ "eval_logits/chosen": -2.293658494949341,
1009
+ "eval_logits/rejected": -2.2107114791870117,
1010
+ "eval_logps/chosen": -271.6837463378906,
1011
+ "eval_logps/rejected": -257.95440673828125,
1012
+ "eval_loss": 1.3495758771896362,
1013
+ "eval_rewards/accuracies": 0.7450000047683716,
1014
+ "eval_rewards/chosen": 0.04927777126431465,
1015
+ "eval_rewards/margins": 0.43442490696907043,
1016
+ "eval_rewards/rejected": -0.38514718413352966,
1017
+ "eval_runtime": 192.5013,
1018
+ "eval_samples_per_second": 10.39,
1019
+ "eval_steps_per_second": 0.519,
1020
+ "step": 600
1021
+ },
1022
+ {
1023
+ "epoch": 0.399084069349035,
1024
+ "grad_norm": 6.71875,
1025
+ "learning_rate": 3.7565898235359717e-06,
1026
+ "logits/chosen": -2.531266689300537,
1027
+ "logits/rejected": -2.4104650020599365,
1028
+ "logps/chosen": -268.50482177734375,
1029
+ "logps/rejected": -252.98202514648438,
1030
+ "loss": 1.3522,
1031
+ "rewards/accuracies": 0.75,
1032
+ "rewards/chosen": 0.11455075442790985,
1033
+ "rewards/margins": 0.33470815420150757,
1034
+ "rewards/rejected": -0.2201574295759201,
1035
+ "step": 610
1036
+ },
1037
+ {
1038
+ "epoch": 0.40562643114164215,
1039
+ "grad_norm": 7.0,
1040
+ "learning_rate": 3.7068861248319127e-06,
1041
+ "logits/chosen": -2.420405626296997,
1042
+ "logits/rejected": -2.36627197265625,
1043
+ "logps/chosen": -252.94091796875,
1044
+ "logps/rejected": -260.6941833496094,
1045
+ "loss": 1.3502,
1046
+ "rewards/accuracies": 0.7749999761581421,
1047
+ "rewards/chosen": 0.004491160623729229,
1048
+ "rewards/margins": 0.4471139907836914,
1049
+ "rewards/rejected": -0.4426228106021881,
1050
+ "step": 620
1051
+ },
1052
+ {
1053
+ "epoch": 0.41216879293424924,
1054
+ "grad_norm": 8.125,
1055
+ "learning_rate": 3.6565524243855695e-06,
1056
+ "logits/chosen": -2.482597589492798,
1057
+ "logits/rejected": -2.381808042526245,
1058
+ "logps/chosen": -270.79833984375,
1059
+ "logps/rejected": -254.92868041992188,
1060
+ "loss": 1.3455,
1061
+ "rewards/accuracies": 0.75,
1062
+ "rewards/chosen": 0.18158364295959473,
1063
+ "rewards/margins": 0.5938194394111633,
1064
+ "rewards/rejected": -0.41223573684692383,
1065
+ "step": 630
1066
+ },
1067
+ {
1068
+ "epoch": 0.4187111547268564,
1069
+ "grad_norm": 8.125,
1070
+ "learning_rate": 3.6056149966882325e-06,
1071
+ "logits/chosen": -2.4741575717926025,
1072
+ "logits/rejected": -2.4443914890289307,
1073
+ "logps/chosen": -257.9027099609375,
1074
+ "logps/rejected": -263.8404235839844,
1075
+ "loss": 1.3501,
1076
+ "rewards/accuracies": 0.7749999761581421,
1077
+ "rewards/chosen": 0.12312284857034683,
1078
+ "rewards/margins": 0.4808879792690277,
1079
+ "rewards/rejected": -0.35776516795158386,
1080
+ "step": 640
1081
+ },
1082
+ {
1083
+ "epoch": 0.4252535165194635,
1084
+ "grad_norm": 7.84375,
1085
+ "learning_rate": 3.554100431380414e-06,
1086
+ "logits/chosen": -2.493617534637451,
1087
+ "logits/rejected": -2.3935816287994385,
1088
+ "logps/chosen": -249.92294311523438,
1089
+ "logps/rejected": -238.17404174804688,
1090
+ "loss": 1.3504,
1091
+ "rewards/accuracies": 0.75,
1092
+ "rewards/chosen": 0.04006017744541168,
1093
+ "rewards/margins": 0.48104342818260193,
1094
+ "rewards/rejected": -0.44098323583602905,
1095
+ "step": 650
1096
+ },
1097
+ {
1098
+ "epoch": 0.43179587831207067,
1099
+ "grad_norm": 9.0625,
1100
+ "learning_rate": 3.5020356193718934e-06,
1101
+ "logits/chosen": -2.4255988597869873,
1102
+ "logits/rejected": -2.3747265338897705,
1103
+ "logps/chosen": -217.7563934326172,
1104
+ "logps/rejected": -220.0769500732422,
1105
+ "loss": 1.3667,
1106
+ "rewards/accuracies": 0.7875000238418579,
1107
+ "rewards/chosen": 0.10521771758794785,
1108
+ "rewards/margins": 0.3761609196662903,
1109
+ "rewards/rejected": -0.27094319462776184,
1110
+ "step": 660
1111
+ },
1112
+ {
1113
+ "epoch": 0.4383382401046778,
1114
+ "grad_norm": 7.96875,
1115
+ "learning_rate": 3.4494477388045035e-06,
1116
+ "logits/chosen": -2.491364002227783,
1117
+ "logits/rejected": -2.235619068145752,
1118
+ "logps/chosen": -301.46600341796875,
1119
+ "logps/rejected": -238.8917999267578,
1120
+ "loss": 1.3419,
1121
+ "rewards/accuracies": 0.75,
1122
+ "rewards/chosen": 0.07640339434146881,
1123
+ "rewards/margins": 0.46867623925209045,
1124
+ "rewards/rejected": -0.39227285981178284,
1125
+ "step": 670
1126
+ },
1127
+ {
1128
+ "epoch": 0.4448806018972849,
1129
+ "grad_norm": 7.15625,
1130
+ "learning_rate": 3.3963642408649783e-06,
1131
+ "logits/chosen": -2.4565513134002686,
1132
+ "logits/rejected": -2.3826098442077637,
1133
+ "logps/chosen": -282.40228271484375,
1134
+ "logps/rejected": -249.9719696044922,
1135
+ "loss": 1.3423,
1136
+ "rewards/accuracies": 0.7749999761581421,
1137
+ "rewards/chosen": 0.1387428194284439,
1138
+ "rewards/margins": 0.49391037225723267,
1139
+ "rewards/rejected": -0.35516756772994995,
1140
+ "step": 680
1141
+ },
1142
+ {
1143
+ "epoch": 0.45142296368989204,
1144
+ "grad_norm": 8.5,
1145
+ "learning_rate": 3.3428128354552727e-06,
1146
+ "logits/chosen": -2.3810625076293945,
1147
+ "logits/rejected": -2.34759521484375,
1148
+ "logps/chosen": -252.3175048828125,
1149
+ "logps/rejected": -237.6392364501953,
1150
+ "loss": 1.3457,
1151
+ "rewards/accuracies": 0.699999988079071,
1152
+ "rewards/chosen": 0.10267229378223419,
1153
+ "rewards/margins": 0.4029284119606018,
1154
+ "rewards/rejected": -0.30025607347488403,
1155
+ "step": 690
1156
+ },
1157
+ {
1158
+ "epoch": 0.4579653254824992,
1159
+ "grad_norm": 6.375,
1160
+ "learning_rate": 3.2888214767278246e-06,
1161
+ "logits/chosen": -2.5027060508728027,
1162
+ "logits/rejected": -2.3413376808166504,
1163
+ "logps/chosen": -307.34686279296875,
1164
+ "logps/rejected": -240.319091796875,
1165
+ "loss": 1.3577,
1166
+ "rewards/accuracies": 0.675000011920929,
1167
+ "rewards/chosen": 0.04387027025222778,
1168
+ "rewards/margins": 0.33918899297714233,
1169
+ "rewards/rejected": -0.29531875252723694,
1170
+ "step": 700
1171
+ },
1172
+ {
1173
+ "epoch": 0.4579653254824992,
1174
+ "eval_logits/chosen": -2.2934322357177734,
1175
+ "eval_logits/rejected": -2.2099828720092773,
1176
+ "eval_logps/chosen": -270.8705749511719,
1177
+ "eval_logps/rejected": -256.79083251953125,
1178
+ "eval_loss": 1.345709204673767,
1179
+ "eval_rewards/accuracies": 0.7174999713897705,
1180
+ "eval_rewards/chosen": 0.13059137761592865,
1181
+ "eval_rewards/margins": 0.39938145875930786,
1182
+ "eval_rewards/rejected": -0.268790066242218,
1183
+ "eval_runtime": 192.4798,
1184
+ "eval_samples_per_second": 10.391,
1185
+ "eval_steps_per_second": 0.52,
1186
+ "step": 700
1187
+ },
1188
+ {
1189
+ "epoch": 0.46450768727510633,
1190
+ "grad_norm": 7.21875,
1191
+ "learning_rate": 3.2344183484933247e-06,
1192
+ "logits/chosen": -2.443279266357422,
1193
+ "logits/rejected": -2.3943705558776855,
1194
+ "logps/chosen": -256.82928466796875,
1195
+ "logps/rejected": -235.6039276123047,
1196
+ "loss": 1.3379,
1197
+ "rewards/accuracies": 0.699999988079071,
1198
+ "rewards/chosen": 0.02693643607199192,
1199
+ "rewards/margins": 0.40261945128440857,
1200
+ "rewards/rejected": -0.3756829798221588,
1201
+ "step": 710
1202
+ },
1203
+ {
1204
+ "epoch": 0.47105004906771347,
1205
+ "grad_norm": 9.625,
1206
+ "learning_rate": 3.179631849508597e-06,
1207
+ "logits/chosen": -2.445950984954834,
1208
+ "logits/rejected": -2.4440605640411377,
1209
+ "logps/chosen": -249.5127410888672,
1210
+ "logps/rejected": -268.5820007324219,
1211
+ "loss": 1.3417,
1212
+ "rewards/accuracies": 0.699999988079071,
1213
+ "rewards/chosen": 0.09297636151313782,
1214
+ "rewards/margins": 0.33975750207901,
1215
+ "rewards/rejected": -0.2467811554670334,
1216
+ "step": 720
1217
+ },
1218
+ {
1219
+ "epoch": 0.47759241086032056,
1220
+ "grad_norm": 7.59375,
1221
+ "learning_rate": 3.1244905786522796e-06,
1222
+ "logits/chosen": -2.4762067794799805,
1223
+ "logits/rejected": -2.4062716960906982,
1224
+ "logps/chosen": -264.5029602050781,
1225
+ "logps/rejected": -255.11361694335938,
1226
+ "loss": 1.3587,
1227
+ "rewards/accuracies": 0.737500011920929,
1228
+ "rewards/chosen": 0.11461961269378662,
1229
+ "rewards/margins": 0.3867848515510559,
1230
+ "rewards/rejected": -0.2721652388572693,
1231
+ "step": 730
1232
+ },
1233
+ {
1234
+ "epoch": 0.4841347726529277,
1235
+ "grad_norm": 7.4375,
1236
+ "learning_rate": 3.0690233199960393e-06,
1237
+ "logits/chosen": -2.3173129558563232,
1238
+ "logits/rejected": -2.2710585594177246,
1239
+ "logps/chosen": -227.1941375732422,
1240
+ "logps/rejected": -248.1180877685547,
1241
+ "loss": 1.3402,
1242
+ "rewards/accuracies": 0.800000011920929,
1243
+ "rewards/chosen": 0.1001671776175499,
1244
+ "rewards/margins": 0.45164409279823303,
1245
+ "rewards/rejected": -0.35147690773010254,
1246
+ "step": 740
1247
+ },
1248
+ {
1249
+ "epoch": 0.49067713444553485,
1250
+ "grad_norm": 7.4375,
1251
+ "learning_rate": 3.0132590277791163e-06,
1252
+ "logits/chosen": -2.524819850921631,
1253
+ "logits/rejected": -2.406615734100342,
1254
+ "logps/chosen": -263.93133544921875,
1255
+ "logps/rejected": -243.890625,
1256
+ "loss": 1.3328,
1257
+ "rewards/accuracies": 0.7749999761581421,
1258
+ "rewards/chosen": 0.14402124285697937,
1259
+ "rewards/margins": 0.5698949098587036,
1260
+ "rewards/rejected": -0.42587366700172424,
1261
+ "step": 750
1262
+ },
1263
+ {
1264
+ "epoch": 0.497219496238142,
1265
+ "grad_norm": 7.96875,
1266
+ "learning_rate": 2.9572268112940354e-06,
1267
+ "logits/chosen": -2.495116710662842,
1268
+ "logits/rejected": -2.4868061542510986,
1269
+ "logps/chosen": -254.06405639648438,
1270
+ "logps/rejected": -259.20379638671875,
1271
+ "loss": 1.3341,
1272
+ "rewards/accuracies": 0.7124999761581421,
1273
+ "rewards/chosen": 0.02839145064353943,
1274
+ "rewards/margins": 0.419182687997818,
1275
+ "rewards/rejected": -0.39079123735427856,
1276
+ "step": 760
1277
+ },
1278
+ {
1279
+ "epoch": 0.5037618580307491,
1280
+ "grad_norm": 8.1875,
1281
+ "learning_rate": 2.9009559196913882e-06,
1282
+ "logits/chosen": -2.5047004222869873,
1283
+ "logits/rejected": -2.386676788330078,
1284
+ "logps/chosen": -291.96478271484375,
1285
+ "logps/rejected": -252.67971801757812,
1286
+ "loss": 1.3422,
1287
+ "rewards/accuracies": 0.7250000238418579,
1288
+ "rewards/chosen": 0.1507759988307953,
1289
+ "rewards/margins": 0.5127144455909729,
1290
+ "rewards/rejected": -0.3619384467601776,
1291
+ "step": 770
1292
+ },
1293
+ {
1294
+ "epoch": 0.5103042198233563,
1295
+ "grad_norm": 7.84375,
1296
+ "learning_rate": 2.844475726711595e-06,
1297
+ "logits/chosen": -2.4655020236968994,
1298
+ "logits/rejected": -2.294151782989502,
1299
+ "logps/chosen": -249.9043731689453,
1300
+ "logps/rejected": -220.50216674804688,
1301
+ "loss": 1.3446,
1302
+ "rewards/accuracies": 0.675000011920929,
1303
+ "rewards/chosen": 0.06338523328304291,
1304
+ "rewards/margins": 0.47769251465797424,
1305
+ "rewards/rejected": -0.41430729627609253,
1306
+ "step": 780
1307
+ },
1308
+ {
1309
+ "epoch": 0.5168465816159633,
1310
+ "grad_norm": 6.0625,
1311
+ "learning_rate": 2.7878157153516446e-06,
1312
+ "logits/chosen": -2.4090731143951416,
1313
+ "logits/rejected": -2.3185999393463135,
1314
+ "logps/chosen": -270.53857421875,
1315
+ "logps/rejected": -269.4087829589844,
1316
+ "loss": 1.33,
1317
+ "rewards/accuracies": 0.7749999761581421,
1318
+ "rewards/chosen": 0.11772044748067856,
1319
+ "rewards/margins": 0.4702891707420349,
1320
+ "rewards/rejected": -0.35256871581077576,
1321
+ "step": 790
1322
+ },
1323
+ {
1324
+ "epoch": 0.5233889434085705,
1325
+ "grad_norm": 7.75,
1326
+ "learning_rate": 2.731005462474787e-06,
1327
+ "logits/chosen": -2.5132055282592773,
1328
+ "logits/rejected": -2.4322190284729004,
1329
+ "logps/chosen": -271.2453308105469,
1330
+ "logps/rejected": -266.37701416015625,
1331
+ "loss": 1.343,
1332
+ "rewards/accuracies": 0.7124999761581421,
1333
+ "rewards/chosen": 0.21996447443962097,
1334
+ "rewards/margins": 0.497943639755249,
1335
+ "rewards/rejected": -0.27797916531562805,
1336
+ "step": 800
1337
+ },
1338
+ {
1339
+ "epoch": 0.5233889434085705,
1340
+ "eval_logits/chosen": -2.3120627403259277,
1341
+ "eval_logits/rejected": -2.2312381267547607,
1342
+ "eval_logps/chosen": -271.3629150390625,
1343
+ "eval_logps/rejected": -257.9126892089844,
1344
+ "eval_loss": 1.3448688983917236,
1345
+ "eval_rewards/accuracies": 0.7149999737739563,
1346
+ "eval_rewards/chosen": 0.08135941624641418,
1347
+ "eval_rewards/margins": 0.4623354375362396,
1348
+ "eval_rewards/rejected": -0.38097602128982544,
1349
+ "eval_runtime": 192.5442,
1350
+ "eval_samples_per_second": 10.387,
1351
+ "eval_steps_per_second": 0.519,
1352
+ "step": 800
1353
+ },
1354
+ {
1355
+ "epoch": 0.5299313052011776,
1356
+ "grad_norm": 8.25,
1357
+ "learning_rate": 2.67407462337124e-06,
1358
+ "logits/chosen": -2.516996383666992,
1359
+ "logits/rejected": -2.399326801300049,
1360
+ "logps/chosen": -245.3385772705078,
1361
+ "logps/rejected": -241.2891082763672,
1362
+ "loss": 1.334,
1363
+ "rewards/accuracies": 0.75,
1364
+ "rewards/chosen": 0.0016076326137408614,
1365
+ "rewards/margins": 0.4354288578033447,
1366
+ "rewards/rejected": -0.43382126092910767,
1367
+ "step": 810
1368
+ },
1369
+ {
1370
+ "epoch": 0.5364736669937847,
1371
+ "grad_norm": 7.0,
1372
+ "learning_rate": 2.617052916277952e-06,
1373
+ "logits/chosen": -2.5313477516174316,
1374
+ "logits/rejected": -2.338625431060791,
1375
+ "logps/chosen": -253.85226440429688,
1376
+ "logps/rejected": -227.67898559570312,
1377
+ "loss": 1.3429,
1378
+ "rewards/accuracies": 0.762499988079071,
1379
+ "rewards/chosen": 0.07979673147201538,
1380
+ "rewards/margins": 0.5423867106437683,
1381
+ "rewards/rejected": -0.4625900387763977,
1382
+ "step": 820
1383
+ },
1384
+ {
1385
+ "epoch": 0.5430160287863919,
1386
+ "grad_norm": 7.1875,
1387
+ "learning_rate": 2.5599701068654985e-06,
1388
+ "logits/chosen": -2.5015134811401367,
1389
+ "logits/rejected": -2.4792141914367676,
1390
+ "logps/chosen": -250.22543334960938,
1391
+ "logps/rejected": -245.0906219482422,
1392
+ "loss": 1.3437,
1393
+ "rewards/accuracies": 0.762499988079071,
1394
+ "rewards/chosen": 0.13883474469184875,
1395
+ "rewards/margins": 0.5313934087753296,
1396
+ "rewards/rejected": -0.39255863428115845,
1397
+ "step": 830
1398
+ },
1399
+ {
1400
+ "epoch": 0.549558390578999,
1401
+ "grad_norm": 7.15625,
1402
+ "learning_rate": 2.5028559927002326e-06,
1403
+ "logits/chosen": -2.562939405441284,
1404
+ "logits/rejected": -2.4629690647125244,
1405
+ "logps/chosen": -295.15142822265625,
1406
+ "logps/rejected": -249.9164276123047,
1407
+ "loss": 1.3382,
1408
+ "rewards/accuracies": 0.8374999761581421,
1409
+ "rewards/chosen": 0.11802862584590912,
1410
+ "rewards/margins": 0.46983498334884644,
1411
+ "rewards/rejected": -0.3518063426017761,
1412
+ "step": 840
1413
+ },
1414
+ {
1415
+ "epoch": 0.5561007523716062,
1416
+ "grad_norm": 10.4375,
1417
+ "learning_rate": 2.4457403876897756e-06,
1418
+ "logits/chosen": -2.3556885719299316,
1419
+ "logits/rejected": -2.300297737121582,
1420
+ "logps/chosen": -241.96975708007812,
1421
+ "logps/rejected": -196.66481018066406,
1422
+ "loss": 1.3402,
1423
+ "rewards/accuracies": 0.6499999761581421,
1424
+ "rewards/chosen": 0.06055239588022232,
1425
+ "rewards/margins": 0.29153305292129517,
1426
+ "rewards/rejected": -0.23098066449165344,
1427
+ "step": 850
1428
+ },
1429
+ {
1430
+ "epoch": 0.5626431141642133,
1431
+ "grad_norm": 7.84375,
1432
+ "learning_rate": 2.388653106519975e-06,
1433
+ "logits/chosen": -2.504599094390869,
1434
+ "logits/rejected": -2.3005897998809814,
1435
+ "logps/chosen": -257.0250549316406,
1436
+ "logps/rejected": -192.78662109375,
1437
+ "loss": 1.3466,
1438
+ "rewards/accuracies": 0.699999988079071,
1439
+ "rewards/chosen": 0.04565539211034775,
1440
+ "rewards/margins": 0.4032720923423767,
1441
+ "rewards/rejected": -0.35761672258377075,
1442
+ "step": 860
1443
+ },
1444
+ {
1445
+ "epoch": 0.5691854759568205,
1446
+ "grad_norm": 7.21875,
1447
+ "learning_rate": 2.331623949091467e-06,
1448
+ "logits/chosen": -2.5008959770202637,
1449
+ "logits/rejected": -2.472198009490967,
1450
+ "logps/chosen": -277.51580810546875,
1451
+ "logps/rejected": -266.95245361328125,
1452
+ "loss": 1.3361,
1453
+ "rewards/accuracies": 0.699999988079071,
1454
+ "rewards/chosen": 0.13331273198127747,
1455
+ "rewards/margins": 0.45952072739601135,
1456
+ "rewards/rejected": -0.3262080252170563,
1457
+ "step": 870
1458
+ },
1459
+ {
1460
+ "epoch": 0.5757278377494276,
1461
+ "grad_norm": 7.75,
1462
+ "learning_rate": 2.2746826849639513e-06,
1463
+ "logits/chosen": -2.505166530609131,
1464
+ "logits/rejected": -2.3743433952331543,
1465
+ "logps/chosen": -281.8514709472656,
1466
+ "logps/rejected": -239.01443481445312,
1467
+ "loss": 1.3391,
1468
+ "rewards/accuracies": 0.699999988079071,
1469
+ "rewards/chosen": 0.11163483560085297,
1470
+ "rewards/margins": 0.4709078371524811,
1471
+ "rewards/rejected": -0.35927295684814453,
1472
+ "step": 880
1473
+ },
1474
+ {
1475
+ "epoch": 0.5822701995420346,
1476
+ "grad_norm": 7.1875,
1477
+ "learning_rate": 2.2178590378162957e-06,
1478
+ "logits/chosen": -2.5283544063568115,
1479
+ "logits/rejected": -2.402369260787964,
1480
+ "logps/chosen": -319.1461486816406,
1481
+ "logps/rejected": -279.74462890625,
1482
+ "loss": 1.3521,
1483
+ "rewards/accuracies": 0.737500011920929,
1484
+ "rewards/chosen": 0.02043941244482994,
1485
+ "rewards/margins": 0.45712727308273315,
1486
+ "rewards/rejected": -0.4366879463195801,
1487
+ "step": 890
1488
+ },
1489
+ {
1490
+ "epoch": 0.5888125613346418,
1491
+ "grad_norm": 7.15625,
1492
+ "learning_rate": 2.1611826699306104e-06,
1493
+ "logits/chosen": -2.4243669509887695,
1494
+ "logits/rejected": -2.32483172416687,
1495
+ "logps/chosen": -271.6533203125,
1496
+ "logps/rejected": -232.5511474609375,
1497
+ "loss": 1.3439,
1498
+ "rewards/accuracies": 0.699999988079071,
1499
+ "rewards/chosen": 0.13815268874168396,
1500
+ "rewards/margins": 0.49229878187179565,
1501
+ "rewards/rejected": -0.35414618253707886,
1502
+ "step": 900
1503
+ },
1504
+ {
1505
+ "epoch": 0.5888125613346418,
1506
+ "eval_logits/chosen": -2.313720941543579,
1507
+ "eval_logits/rejected": -2.2327077388763428,
1508
+ "eval_logps/chosen": -271.791748046875,
1509
+ "eval_logps/rejected": -258.1572570800781,
1510
+ "eval_loss": 1.3458954095840454,
1511
+ "eval_rewards/accuracies": 0.7250000238418579,
1512
+ "eval_rewards/chosen": 0.038476575165987015,
1513
+ "eval_rewards/margins": 0.443908154964447,
1514
+ "eval_rewards/rejected": -0.4054316282272339,
1515
+ "eval_runtime": 192.5142,
1516
+ "eval_samples_per_second": 10.389,
1517
+ "eval_steps_per_second": 0.519,
1518
+ "step": 900
1519
+ },
1520
+ {
1521
+ "epoch": 0.5953549231272489,
1522
+ "grad_norm": 8.0625,
1523
+ "learning_rate": 2.1046831667083483e-06,
1524
+ "logits/chosen": -2.4782989025115967,
1525
+ "logits/rejected": -2.3582215309143066,
1526
+ "logps/chosen": -341.8036193847656,
1527
+ "logps/rejected": -267.46795654296875,
1528
+ "loss": 1.3463,
1529
+ "rewards/accuracies": 0.8125,
1530
+ "rewards/chosen": 0.08111786842346191,
1531
+ "rewards/margins": 0.4356920123100281,
1532
+ "rewards/rejected": -0.35457414388656616,
1533
+ "step": 910
1534
+ },
1535
+ {
1536
+ "epoch": 0.6018972849198561,
1537
+ "grad_norm": 7.09375,
1538
+ "learning_rate": 2.048390021226559e-06,
1539
+ "logits/chosen": -2.445460557937622,
1540
+ "logits/rejected": -2.3675286769866943,
1541
+ "logps/chosen": -262.48907470703125,
1542
+ "logps/rejected": -266.7914733886719,
1543
+ "loss": 1.3489,
1544
+ "rewards/accuracies": 0.762499988079071,
1545
+ "rewards/chosen": 0.1052175760269165,
1546
+ "rewards/margins": 0.4468202590942383,
1547
+ "rewards/rejected": -0.341602623462677,
1548
+ "step": 920
1549
+ },
1550
+ {
1551
+ "epoch": 0.6084396467124632,
1552
+ "grad_norm": 7.25,
1553
+ "learning_rate": 1.9923326188423212e-06,
1554
+ "logits/chosen": -2.5053839683532715,
1555
+ "logits/rejected": -2.4001247882843018,
1556
+ "logps/chosen": -236.89382934570312,
1557
+ "logps/rejected": -205.64291381835938,
1558
+ "loss": 1.3523,
1559
+ "rewards/accuracies": 0.675000011920929,
1560
+ "rewards/chosen": 0.08720345795154572,
1561
+ "rewards/margins": 0.3279283344745636,
1562
+ "rewards/rejected": -0.24072487652301788,
1563
+ "step": 930
1564
+ },
1565
+ {
1566
+ "epoch": 0.6149820085050703,
1567
+ "grad_norm": 7.4375,
1568
+ "learning_rate": 1.936540221853415e-06,
1569
+ "logits/chosen": -2.4638781547546387,
1570
+ "logits/rejected": -2.3390696048736572,
1571
+ "logps/chosen": -283.2751770019531,
1572
+ "logps/rejected": -238.634765625,
1573
+ "loss": 1.3297,
1574
+ "rewards/accuracies": 0.75,
1575
+ "rewards/chosen": 0.058457307517528534,
1576
+ "rewards/margins": 0.6001859307289124,
1577
+ "rewards/rejected": -0.5417286157608032,
1578
+ "step": 940
1579
+ },
1580
+ {
1581
+ "epoch": 0.6215243702976775,
1582
+ "grad_norm": 6.71875,
1583
+ "learning_rate": 1.8810419542232245e-06,
1584
+ "logits/chosen": -2.5065360069274902,
1585
+ "logits/rejected": -2.3907275199890137,
1586
+ "logps/chosen": -273.1107177734375,
1587
+ "logps/rejected": -269.38140869140625,
1588
+ "loss": 1.3301,
1589
+ "rewards/accuracies": 0.762499988079071,
1590
+ "rewards/chosen": 0.07303529977798462,
1591
+ "rewards/margins": 0.37555113434791565,
1592
+ "rewards/rejected": -0.30251583456993103,
1593
+ "step": 950
1594
+ },
1595
+ {
1596
+ "epoch": 0.6280667320902846,
1597
+ "grad_norm": 6.71875,
1598
+ "learning_rate": 1.8258667863778573e-06,
1599
+ "logits/chosen": -2.415342092514038,
1600
+ "logits/rejected": -2.3436694145202637,
1601
+ "logps/chosen": -305.41937255859375,
1602
+ "logps/rejected": -254.4953155517578,
1603
+ "loss": 1.3252,
1604
+ "rewards/accuracies": 0.699999988079071,
1605
+ "rewards/chosen": 0.03756101056933403,
1606
+ "rewards/margins": 0.33726876974105835,
1607
+ "rewards/rejected": -0.2997077405452728,
1608
+ "step": 960
1609
+ },
1610
+ {
1611
+ "epoch": 0.6346090938828918,
1612
+ "grad_norm": 7.09375,
1613
+ "learning_rate": 1.7710435200834126e-06,
1614
+ "logits/chosen": -2.424187660217285,
1615
+ "logits/rejected": -2.4013800621032715,
1616
+ "logps/chosen": -253.3881378173828,
1617
+ "logps/rejected": -242.24853515625,
1618
+ "loss": 1.3294,
1619
+ "rewards/accuracies": 0.737500011920929,
1620
+ "rewards/chosen": 0.09904266893863678,
1621
+ "rewards/margins": 0.44573745131492615,
1622
+ "rewards/rejected": -0.3466947376728058,
1623
+ "step": 970
1624
+ },
1625
+ {
1626
+ "epoch": 0.6411514556754988,
1627
+ "grad_norm": 10.1875,
1628
+ "learning_rate": 1.7166007734112808e-06,
1629
+ "logits/chosen": -2.512207269668579,
1630
+ "logits/rejected": -2.3953731060028076,
1631
+ "logps/chosen": -282.1387939453125,
1632
+ "logps/rejected": -238.2812042236328,
1633
+ "loss": 1.3447,
1634
+ "rewards/accuracies": 0.75,
1635
+ "rewards/chosen": 0.10133460909128189,
1636
+ "rewards/margins": 0.4784146845340729,
1637
+ "rewards/rejected": -0.3770800232887268,
1638
+ "step": 980
1639
+ },
1640
+ {
1641
+ "epoch": 0.647693817468106,
1642
+ "grad_norm": 6.84375,
1643
+ "learning_rate": 1.6625669657993483e-06,
1644
+ "logits/chosen": -2.588611125946045,
1645
+ "logits/rejected": -2.5047097206115723,
1646
+ "logps/chosen": -316.20037841796875,
1647
+ "logps/rejected": -254.8091278076172,
1648
+ "loss": 1.3237,
1649
+ "rewards/accuracies": 0.862500011920929,
1650
+ "rewards/chosen": 0.18245932459831238,
1651
+ "rewards/margins": 0.5891641974449158,
1652
+ "rewards/rejected": -0.40670496225357056,
1653
+ "step": 990
1654
+ },
1655
+ {
1656
+ "epoch": 0.6542361792607131,
1657
+ "grad_norm": 6.90625,
1658
+ "learning_rate": 1.6089703032168736e-06,
1659
+ "logits/chosen": -2.417908191680908,
1660
+ "logits/rejected": -2.41746187210083,
1661
+ "logps/chosen": -254.3357391357422,
1662
+ "logps/rejected": -241.4535675048828,
1663
+ "loss": 1.3388,
1664
+ "rewards/accuracies": 0.7250000238418579,
1665
+ "rewards/chosen": 0.06749463826417923,
1666
+ "rewards/margins": 0.35123783349990845,
1667
+ "rewards/rejected": -0.2837432026863098,
1668
+ "step": 1000
1669
+ },
1670
+ {
1671
+ "epoch": 0.6542361792607131,
1672
+ "eval_logits/chosen": -2.318289041519165,
1673
+ "eval_logits/rejected": -2.238693952560425,
1674
+ "eval_logps/chosen": -270.0261535644531,
1675
+ "eval_logps/rejected": -256.7276611328125,
1676
+ "eval_loss": 1.344198226928711,
1677
+ "eval_rewards/accuracies": 0.7325000166893005,
1678
+ "eval_rewards/chosen": 0.215036541223526,
1679
+ "eval_rewards/margins": 0.4775146245956421,
1680
+ "eval_rewards/rejected": -0.2624781131744385,
1681
+ "eval_runtime": 192.5272,
1682
+ "eval_samples_per_second": 10.388,
1683
+ "eval_steps_per_second": 0.519,
1684
+ "step": 1000
1685
+ },
1686
+ {
1687
+ "epoch": 0.6607785410533202,
1688
+ "grad_norm": 7.8125,
1689
+ "learning_rate": 1.55583876344081e-06,
1690
+ "logits/chosen": -2.4103312492370605,
1691
+ "logits/rejected": -2.1900105476379395,
1692
+ "logps/chosen": -260.7649230957031,
1693
+ "logps/rejected": -209.84048461914062,
1694
+ "loss": 1.3325,
1695
+ "rewards/accuracies": 0.762499988079071,
1696
+ "rewards/chosen": 0.10994074493646622,
1697
+ "rewards/margins": 0.5784383416175842,
1698
+ "rewards/rejected": -0.46849751472473145,
1699
+ "step": 1010
1700
+ },
1701
+ {
1702
+ "epoch": 0.6673209028459274,
1703
+ "grad_norm": 7.3125,
1704
+ "learning_rate": 1.5032000814512372e-06,
1705
+ "logits/chosen": -2.4817819595336914,
1706
+ "logits/rejected": -2.4498400688171387,
1707
+ "logps/chosen": -240.53134155273438,
1708
+ "logps/rejected": -238.8572998046875,
1709
+ "loss": 1.3307,
1710
+ "rewards/accuracies": 0.6875,
1711
+ "rewards/chosen": -0.0427989736199379,
1712
+ "rewards/margins": 0.3493194580078125,
1713
+ "rewards/rejected": -0.3921184539794922,
1714
+ "step": 1020
1715
+ },
1716
+ {
1717
+ "epoch": 0.6738632646385345,
1718
+ "grad_norm": 5.59375,
1719
+ "learning_rate": 1.4510817349535323e-06,
1720
+ "logits/chosen": -2.453907012939453,
1721
+ "logits/rejected": -2.399127244949341,
1722
+ "logps/chosen": -262.92559814453125,
1723
+ "logps/rejected": -249.8006134033203,
1724
+ "loss": 1.3411,
1725
+ "rewards/accuracies": 0.762499988079071,
1726
+ "rewards/chosen": 0.13903044164180756,
1727
+ "rewards/margins": 0.5323747396469116,
1728
+ "rewards/rejected": -0.39334431290626526,
1729
+ "step": 1030
1730
+ },
1731
+ {
1732
+ "epoch": 0.6804056264311417,
1733
+ "grad_norm": 8.25,
1734
+ "learning_rate": 1.3995109300348537e-06,
1735
+ "logits/chosen": -2.5314221382141113,
1736
+ "logits/rejected": -2.424898624420166,
1737
+ "logps/chosen": -296.252197265625,
1738
+ "logps/rejected": -279.55096435546875,
1739
+ "loss": 1.3372,
1740
+ "rewards/accuracies": 0.699999988079071,
1741
+ "rewards/chosen": -0.001908986596390605,
1742
+ "rewards/margins": 0.4961877763271332,
1743
+ "rewards/rejected": -0.49809688329696655,
1744
+ "step": 1040
1745
+ },
1746
+ {
1747
+ "epoch": 0.6869479882237488,
1748
+ "grad_norm": 7.03125,
1749
+ "learning_rate": 1.348514586962389e-06,
1750
+ "logits/chosen": -2.543003559112549,
1751
+ "logits/rejected": -2.466203212738037,
1752
+ "logps/chosen": -244.98037719726562,
1753
+ "logps/rejected": -231.51449584960938,
1754
+ "loss": 1.3373,
1755
+ "rewards/accuracies": 0.762499988079071,
1756
+ "rewards/chosen": 0.1332973688840866,
1757
+ "rewards/margins": 0.5175554752349854,
1758
+ "rewards/rejected": -0.38425812125205994,
1759
+ "step": 1050
1760
+ },
1761
+ {
1762
+ "epoch": 0.693490350016356,
1763
+ "grad_norm": 6.625,
1764
+ "learning_rate": 1.2981193261308284e-06,
1765
+ "logits/chosen": -2.485957622528076,
1766
+ "logits/rejected": -2.440640687942505,
1767
+ "logps/chosen": -243.56118774414062,
1768
+ "logps/rejected": -264.2271728515625,
1769
+ "loss": 1.3184,
1770
+ "rewards/accuracies": 0.800000011920929,
1771
+ "rewards/chosen": 0.058649301528930664,
1772
+ "rewards/margins": 0.4494194984436035,
1773
+ "rewards/rejected": -0.39077019691467285,
1774
+ "step": 1060
1775
+ },
1776
+ {
1777
+ "epoch": 0.700032711808963,
1778
+ "grad_norm": 7.625,
1779
+ "learning_rate": 1.2483514541663501e-06,
1780
+ "logits/chosen": -2.567582607269287,
1781
+ "logits/rejected": -2.398489475250244,
1782
+ "logps/chosen": -268.79248046875,
1783
+ "logps/rejected": -236.4168701171875,
1784
+ "loss": 1.3346,
1785
+ "rewards/accuracies": 0.7875000238418579,
1786
+ "rewards/chosen": 0.1122722402215004,
1787
+ "rewards/margins": 0.54461270570755,
1788
+ "rewards/rejected": -0.43234047293663025,
1789
+ "step": 1070
1790
+ },
1791
+ {
1792
+ "epoch": 0.7065750736015701,
1793
+ "grad_norm": 9.875,
1794
+ "learning_rate": 1.1992369501944096e-06,
1795
+ "logits/chosen": -2.6160035133361816,
1796
+ "logits/rejected": -2.485849380493164,
1797
+ "logps/chosen": -301.0981140136719,
1798
+ "logps/rejected": -261.76226806640625,
1799
+ "loss": 1.3344,
1800
+ "rewards/accuracies": 0.7749999761581421,
1801
+ "rewards/chosen": 0.12587206065654755,
1802
+ "rewards/margins": 0.49982064962387085,
1803
+ "rewards/rejected": -0.3739486336708069,
1804
+ "step": 1080
1805
+ },
1806
+ {
1807
+ "epoch": 0.7131174353941773,
1808
+ "grad_norm": 5.15625,
1809
+ "learning_rate": 1.1508014522784803e-06,
1810
+ "logits/chosen": -2.5031826496124268,
1811
+ "logits/rejected": -2.380375862121582,
1812
+ "logps/chosen": -259.3973693847656,
1813
+ "logps/rejected": -242.9427490234375,
1814
+ "loss": 1.3325,
1815
+ "rewards/accuracies": 0.75,
1816
+ "rewards/chosen": 0.098679319024086,
1817
+ "rewards/margins": 0.5007960796356201,
1818
+ "rewards/rejected": -0.40211671590805054,
1819
+ "step": 1090
1820
+ },
1821
+ {
1822
+ "epoch": 0.7196597971867844,
1823
+ "grad_norm": 6.3125,
1824
+ "learning_rate": 1.1030702440368319e-06,
1825
+ "logits/chosen": -2.4038383960723877,
1826
+ "logits/rejected": -2.402168035507202,
1827
+ "logps/chosen": -282.268798828125,
1828
+ "logps/rejected": -266.29132080078125,
1829
+ "loss": 1.3186,
1830
+ "rewards/accuracies": 0.8125,
1831
+ "rewards/chosen": 0.16258536279201508,
1832
+ "rewards/margins": 0.7569273114204407,
1833
+ "rewards/rejected": -0.5943418741226196,
1834
+ "step": 1100
1835
+ },
1836
+ {
1837
+ "epoch": 0.7196597971867844,
1838
+ "eval_logits/chosen": -2.3106985092163086,
1839
+ "eval_logits/rejected": -2.2305588722229004,
1840
+ "eval_logps/chosen": -270.9344787597656,
1841
+ "eval_logps/rejected": -257.6895446777344,
1842
+ "eval_loss": 1.3422751426696777,
1843
+ "eval_rewards/accuracies": 0.7325000166893005,
1844
+ "eval_rewards/chosen": 0.12420222908258438,
1845
+ "eval_rewards/margins": 0.48286232352256775,
1846
+ "eval_rewards/rejected": -0.3586600422859192,
1847
+ "eval_runtime": 192.5343,
1848
+ "eval_samples_per_second": 10.388,
1849
+ "eval_steps_per_second": 0.519,
1850
+ "step": 1100
1851
+ },
1852
+ {
1853
+ "epoch": 0.7262021589793916,
1854
+ "grad_norm": 8.0625,
1855
+ "learning_rate": 1.0560682414443315e-06,
1856
+ "logits/chosen": -2.492471694946289,
1857
+ "logits/rejected": -2.3619418144226074,
1858
+ "logps/chosen": -296.5441589355469,
1859
+ "logps/rejected": -249.51174926757812,
1860
+ "loss": 1.333,
1861
+ "rewards/accuracies": 0.800000011920929,
1862
+ "rewards/chosen": 0.08190511167049408,
1863
+ "rewards/margins": 0.522915244102478,
1864
+ "rewards/rejected": -0.44101008772850037,
1865
+ "step": 1110
1866
+ },
1867
+ {
1868
+ "epoch": 0.7327445207719987,
1869
+ "grad_norm": 7.03125,
1870
+ "learning_rate": 1.009819979826156e-06,
1871
+ "logits/chosen": -2.49381947517395,
1872
+ "logits/rejected": -2.4158735275268555,
1873
+ "logps/chosen": -277.04742431640625,
1874
+ "logps/rejected": -277.2239074707031,
1875
+ "loss": 1.3209,
1876
+ "rewards/accuracies": 0.800000011920929,
1877
+ "rewards/chosen": 0.1789507120847702,
1878
+ "rewards/margins": 0.5798290967941284,
1879
+ "rewards/rejected": -0.4008784294128418,
1880
+ "step": 1120
1881
+ },
1882
+ {
1883
+ "epoch": 0.7392868825646058,
1884
+ "grad_norm": 9.875,
1885
+ "learning_rate": 9.643496010502054e-07,
1886
+ "logits/chosen": -2.383232831954956,
1887
+ "logits/rejected": -2.3699889183044434,
1888
+ "logps/chosen": -282.37872314453125,
1889
+ "logps/rejected": -288.27764892578125,
1890
+ "loss": 1.3569,
1891
+ "rewards/accuracies": 0.699999988079071,
1892
+ "rewards/chosen": 0.049896519631147385,
1893
+ "rewards/margins": 0.3302707076072693,
1894
+ "rewards/rejected": -0.280374139547348,
1895
+ "step": 1130
1896
+ },
1897
+ {
1898
+ "epoch": 0.745829244357213,
1899
+ "grad_norm": 7.75,
1900
+ "learning_rate": 9.196808409249086e-07,
1901
+ "logits/chosen": -2.4595417976379395,
1902
+ "logits/rejected": -2.346752882003784,
1903
+ "logps/chosen": -263.95172119140625,
1904
+ "logps/rejected": -211.8850555419922,
1905
+ "loss": 1.3457,
1906
+ "rewards/accuracies": 0.7124999761581421,
1907
+ "rewards/chosen": 0.07734758406877518,
1908
+ "rewards/margins": 0.34833234548568726,
1909
+ "rewards/rejected": -0.2709847092628479,
1910
+ "step": 1140
1911
+ },
1912
+ {
1913
+ "epoch": 0.7523716061498201,
1914
+ "grad_norm": 8.25,
1915
+ "learning_rate": 8.758370168089797e-07,
1916
+ "logits/chosen": -2.46266770362854,
1917
+ "logits/rejected": -2.4414191246032715,
1918
+ "logps/chosen": -258.66815185546875,
1919
+ "logps/rejected": -272.05242919921875,
1920
+ "loss": 1.3453,
1921
+ "rewards/accuracies": 0.75,
1922
+ "rewards/chosen": 0.12471766769886017,
1923
+ "rewards/margins": 0.5029908418655396,
1924
+ "rewards/rejected": -0.3782731890678406,
1925
+ "step": 1150
1926
+ },
1927
+ {
1928
+ "epoch": 0.7589139679424273,
1929
+ "grad_norm": 7.90625,
1930
+ "learning_rate": 8.328410154396318e-07,
1931
+ "logits/chosen": -2.473407030105591,
1932
+ "logits/rejected": -2.4644668102264404,
1933
+ "logps/chosen": -270.7806396484375,
1934
+ "logps/rejected": -228.4518280029297,
1935
+ "loss": 1.3378,
1936
+ "rewards/accuracies": 0.7124999761581421,
1937
+ "rewards/chosen": 0.08448322862386703,
1938
+ "rewards/margins": 0.4490872323513031,
1939
+ "rewards/rejected": -0.36460399627685547,
1940
+ "step": 1160
1941
+ },
1942
+ {
1943
+ "epoch": 0.7654563297350343,
1944
+ "grad_norm": 6.28125,
1945
+ "learning_rate": 7.907152809855529e-07,
1946
+ "logits/chosen": -2.4258522987365723,
1947
+ "logits/rejected": -2.3456382751464844,
1948
+ "logps/chosen": -260.29730224609375,
1949
+ "logps/rejected": -251.9761199951172,
1950
+ "loss": 1.3517,
1951
+ "rewards/accuracies": 0.625,
1952
+ "rewards/chosen": 0.03364313393831253,
1953
+ "rewards/margins": 0.29857534170150757,
1954
+ "rewards/rejected": -0.26493218541145325,
1955
+ "step": 1170
1956
+ },
1957
+ {
1958
+ "epoch": 0.7719986915276414,
1959
+ "grad_norm": 11.1875,
1960
+ "learning_rate": 7.494818033309207e-07,
1961
+ "logits/chosen": -2.4767394065856934,
1962
+ "logits/rejected": -2.3014492988586426,
1963
+ "logps/chosen": -238.3695526123047,
1964
+ "logps/rejected": -213.4428253173828,
1965
+ "loss": 1.3469,
1966
+ "rewards/accuracies": 0.7250000238418579,
1967
+ "rewards/chosen": 0.12686462700366974,
1968
+ "rewards/margins": 0.3980763554573059,
1969
+ "rewards/rejected": -0.27121174335479736,
1970
+ "step": 1180
1971
+ },
1972
+ {
1973
+ "epoch": 0.7785410533202486,
1974
+ "grad_norm": 7.1875,
1975
+ "learning_rate": 7.091621065965521e-07,
1976
+ "logits/chosen": -2.454749584197998,
1977
+ "logits/rejected": -2.4374210834503174,
1978
+ "logps/chosen": -276.31488037109375,
1979
+ "logps/rejected": -310.75,
1980
+ "loss": 1.3416,
1981
+ "rewards/accuracies": 0.6625000238418579,
1982
+ "rewards/chosen": 0.041704945266246796,
1983
+ "rewards/margins": 0.38392168283462524,
1984
+ "rewards/rejected": -0.34221673011779785,
1985
+ "step": 1190
1986
+ },
1987
+ {
1988
+ "epoch": 0.7850834151128557,
1989
+ "grad_norm": 7.34375,
1990
+ "learning_rate": 6.697772379041823e-07,
1991
+ "logits/chosen": -2.4566853046417236,
1992
+ "logits/rejected": -2.416715145111084,
1993
+ "logps/chosen": -245.9681396484375,
1994
+ "logps/rejected": -241.95858764648438,
1995
+ "loss": 1.3299,
1996
+ "rewards/accuracies": 0.6875,
1997
+ "rewards/chosen": 0.12603041529655457,
1998
+ "rewards/margins": 0.42295509576797485,
1999
+ "rewards/rejected": -0.2969246804714203,
2000
+ "step": 1200
2001
+ },
2002
+ {
2003
+ "epoch": 0.7850834151128557,
2004
+ "eval_logits/chosen": -2.3078200817108154,
2005
+ "eval_logits/rejected": -2.2274787425994873,
2006
+ "eval_logps/chosen": -270.70892333984375,
2007
+ "eval_logps/rejected": -257.372802734375,
2008
+ "eval_loss": 1.3416602611541748,
2009
+ "eval_rewards/accuracies": 0.7425000071525574,
2010
+ "eval_rewards/chosen": 0.14675647020339966,
2011
+ "eval_rewards/margins": 0.4737465977668762,
2012
+ "eval_rewards/rejected": -0.3269902169704437,
2013
+ "eval_runtime": 192.5364,
2014
+ "eval_samples_per_second": 10.388,
2015
+ "eval_steps_per_second": 0.519,
2016
+ "step": 1200
2017
+ },
2018
+ {
2019
+ "epoch": 0.7916257769054629,
2020
+ "grad_norm": 5.8125,
2021
+ "learning_rate": 6.313477563897466e-07,
2022
+ "logits/chosen": -2.516838550567627,
2023
+ "logits/rejected": -2.4841699600219727,
2024
+ "logps/chosen": -243.35372924804688,
2025
+ "logps/rejected": -246.02450561523438,
2026
+ "loss": 1.3459,
2027
+ "rewards/accuracies": 0.762499988079071,
2028
+ "rewards/chosen": 0.11812669038772583,
2029
+ "rewards/margins": 0.5516510009765625,
2030
+ "rewards/rejected": -0.43352431058883667,
2031
+ "step": 1210
2032
+ },
2033
+ {
2034
+ "epoch": 0.79816813869807,
2035
+ "grad_norm": 21.5,
2036
+ "learning_rate": 5.9389372247138e-07,
2037
+ "logits/chosen": -2.493438482284546,
2038
+ "logits/rejected": -2.3540115356445312,
2039
+ "logps/chosen": -215.5032501220703,
2040
+ "logps/rejected": -212.7133026123047,
2041
+ "loss": 1.3261,
2042
+ "rewards/accuracies": 0.8374999761581421,
2043
+ "rewards/chosen": 0.16954867541790009,
2044
+ "rewards/margins": 0.5659071803092957,
2045
+ "rewards/rejected": -0.39635851979255676,
2046
+ "step": 1220
2047
+ },
2048
+ {
2049
+ "epoch": 0.8047105004906772,
2050
+ "grad_norm": 13.0625,
2051
+ "learning_rate": 5.574346873777714e-07,
2052
+ "logits/chosen": -2.4212934970855713,
2053
+ "logits/rejected": -2.4491488933563232,
2054
+ "logps/chosen": -249.1747589111328,
2055
+ "logps/rejected": -282.01385498046875,
2056
+ "loss": 1.3411,
2057
+ "rewards/accuracies": 0.75,
2058
+ "rewards/chosen": 0.17586642503738403,
2059
+ "rewards/margins": 0.467965304851532,
2060
+ "rewards/rejected": -0.29209887981414795,
2061
+ "step": 1230
2062
+ },
2063
+ {
2064
+ "epoch": 0.8112528622832843,
2065
+ "grad_norm": 7.5625,
2066
+ "learning_rate": 5.219896829422927e-07,
2067
+ "logits/chosen": -2.491405487060547,
2068
+ "logits/rejected": -2.3750882148742676,
2069
+ "logps/chosen": -252.0865020751953,
2070
+ "logps/rejected": -231.3611602783203,
2071
+ "loss": 1.3306,
2072
+ "rewards/accuracies": 0.7250000238418579,
2073
+ "rewards/chosen": 0.12007620185613632,
2074
+ "rewards/margins": 0.5494218468666077,
2075
+ "rewards/rejected": -0.42934560775756836,
2076
+ "step": 1240
2077
+ },
2078
+ {
2079
+ "epoch": 0.8177952240758914,
2080
+ "grad_norm": 7.5,
2081
+ "learning_rate": 4.875772116682817e-07,
2082
+ "logits/chosen": -2.359609842300415,
2083
+ "logits/rejected": -2.25789475440979,
2084
+ "logps/chosen": -228.32534790039062,
2085
+ "logps/rejected": -250.2344512939453,
2086
+ "loss": 1.3465,
2087
+ "rewards/accuracies": 0.699999988079071,
2088
+ "rewards/chosen": 0.0579691156744957,
2089
+ "rewards/margins": 0.4779117703437805,
2090
+ "rewards/rejected": -0.41994261741638184,
2091
+ "step": 1250
2092
+ },
2093
+ {
2094
+ "epoch": 0.8243375858684985,
2095
+ "grad_norm": 8.125,
2096
+ "learning_rate": 4.542152370706149e-07,
2097
+ "logits/chosen": -2.395474672317505,
2098
+ "logits/rejected": -2.3001809120178223,
2099
+ "logps/chosen": -254.91268920898438,
2100
+ "logps/rejected": -258.6715087890625,
2101
+ "loss": 1.331,
2102
+ "rewards/accuracies": 0.675000011920929,
2103
+ "rewards/chosen": 0.06895018368959427,
2104
+ "rewards/margins": 0.4956684112548828,
2105
+ "rewards/rejected": -0.4267183244228363,
2106
+ "step": 1260
2107
+ },
2108
+ {
2109
+ "epoch": 0.8308799476611056,
2110
+ "grad_norm": 6.25,
2111
+ "learning_rate": 4.2192117429865067e-07,
2112
+ "logits/chosen": -2.28416109085083,
2113
+ "logits/rejected": -2.2375693321228027,
2114
+ "logps/chosen": -239.7650909423828,
2115
+ "logps/rejected": -263.7681579589844,
2116
+ "loss": 1.3323,
2117
+ "rewards/accuracies": 0.6625000238418579,
2118
+ "rewards/chosen": 0.11223635822534561,
2119
+ "rewards/margins": 0.42569345235824585,
2120
+ "rewards/rejected": -0.3134571313858032,
2121
+ "step": 1270
2122
+ },
2123
+ {
2124
+ "epoch": 0.8374223094537128,
2125
+ "grad_norm": 6.96875,
2126
+ "learning_rate": 3.907118810454172e-07,
2127
+ "logits/chosen": -2.401803970336914,
2128
+ "logits/rejected": -2.3998618125915527,
2129
+ "logps/chosen": -274.4956359863281,
2130
+ "logps/rejected": -298.844970703125,
2131
+ "loss": 1.3443,
2132
+ "rewards/accuracies": 0.7250000238418579,
2133
+ "rewards/chosen": 0.061115562915802,
2134
+ "rewards/margins": 0.4573536813259125,
2135
+ "rewards/rejected": -0.39623817801475525,
2136
+ "step": 1280
2137
+ },
2138
+ {
2139
+ "epoch": 0.8439646712463199,
2140
+ "grad_norm": 6.46875,
2141
+ "learning_rate": 3.6060364874779455e-07,
2142
+ "logits/chosen": -2.399519443511963,
2143
+ "logits/rejected": -2.386014461517334,
2144
+ "logps/chosen": -262.71044921875,
2145
+ "logps/rejected": -251.5861358642578,
2146
+ "loss": 1.351,
2147
+ "rewards/accuracies": 0.637499988079071,
2148
+ "rewards/chosen": 0.02161099575459957,
2149
+ "rewards/margins": 0.339490681886673,
2150
+ "rewards/rejected": -0.31787967681884766,
2151
+ "step": 1290
2152
+ },
2153
+ {
2154
+ "epoch": 0.850507033038927,
2155
+ "grad_norm": 6.25,
2156
+ "learning_rate": 3.3161219408229026e-07,
2157
+ "logits/chosen": -2.5390801429748535,
2158
+ "logits/rejected": -2.4665403366088867,
2159
+ "logps/chosen": -261.7703552246094,
2160
+ "logps/rejected": -245.41683959960938,
2161
+ "loss": 1.3248,
2162
+ "rewards/accuracies": 0.7124999761581421,
2163
+ "rewards/chosen": 0.11996345221996307,
2164
+ "rewards/margins": 0.4374065399169922,
2165
+ "rewards/rejected": -0.31744304299354553,
2166
+ "step": 1300
2167
+ },
2168
+ {
2169
+ "epoch": 0.850507033038927,
2170
+ "eval_logits/chosen": -2.3104758262634277,
2171
+ "eval_logits/rejected": -2.2305808067321777,
2172
+ "eval_logps/chosen": -270.62164306640625,
2173
+ "eval_logps/rejected": -257.2347106933594,
2174
+ "eval_loss": 1.341292381286621,
2175
+ "eval_rewards/accuracies": 0.7524999976158142,
2176
+ "eval_rewards/chosen": 0.15548919141292572,
2177
+ "eval_rewards/margins": 0.4686690866947174,
2178
+ "eval_rewards/rejected": -0.3131798803806305,
2179
+ "eval_runtime": 192.5429,
2180
+ "eval_samples_per_second": 10.387,
2181
+ "eval_steps_per_second": 0.519,
2182
+ "step": 1300
2183
+ },
2184
+ {
2185
+ "epoch": 0.8570493948315342,
2186
+ "grad_norm": 6.34375,
2187
+ "learning_rate": 3.0375265076083796e-07,
2188
+ "logits/chosen": -2.5264501571655273,
2189
+ "logits/rejected": -2.514683246612549,
2190
+ "logps/chosen": -277.8392028808594,
2191
+ "logps/rejected": -238.16134643554688,
2192
+ "loss": 1.3384,
2193
+ "rewards/accuracies": 0.699999988079071,
2194
+ "rewards/chosen": 0.1312626153230667,
2195
+ "rewards/margins": 0.3692128658294678,
2196
+ "rewards/rejected": -0.23795023560523987,
2197
+ "step": 1310
2198
+ },
2199
+ {
2200
+ "epoch": 0.8635917566241413,
2201
+ "grad_norm": 7.1875,
2202
+ "learning_rate": 2.7703956163091153e-07,
2203
+ "logits/chosen": -2.4872682094573975,
2204
+ "logits/rejected": -2.399378538131714,
2205
+ "logps/chosen": -252.8599853515625,
2206
+ "logps/rejected": -243.26425170898438,
2207
+ "loss": 1.3411,
2208
+ "rewards/accuracies": 0.824999988079071,
2209
+ "rewards/chosen": 0.09871190041303635,
2210
+ "rewards/margins": 0.4442734718322754,
2211
+ "rewards/rejected": -0.3455616533756256,
2212
+ "step": 1320
2213
+ },
2214
+ {
2215
+ "epoch": 0.8701341184167485,
2216
+ "grad_norm": 7.71875,
2217
+ "learning_rate": 2.514868710840723e-07,
2218
+ "logits/chosen": -2.482853889465332,
2219
+ "logits/rejected": -2.420342445373535,
2220
+ "logps/chosen": -310.83123779296875,
2221
+ "logps/rejected": -267.9164123535156,
2222
+ "loss": 1.3399,
2223
+ "rewards/accuracies": 0.625,
2224
+ "rewards/chosen": 0.10147074609994888,
2225
+ "rewards/margins": 0.3591291904449463,
2226
+ "rewards/rejected": -0.2576584219932556,
2227
+ "step": 1330
2228
+ },
2229
+ {
2230
+ "epoch": 0.8766764802093556,
2231
+ "grad_norm": 7.09375,
2232
+ "learning_rate": 2.271079177769117e-07,
2233
+ "logits/chosen": -2.4147746562957764,
2234
+ "logits/rejected": -2.420971632003784,
2235
+ "logps/chosen": -284.0067138671875,
2236
+ "logps/rejected": -307.20989990234375,
2237
+ "loss": 1.3301,
2238
+ "rewards/accuracies": 0.6875,
2239
+ "rewards/chosen": 0.12000395357608795,
2240
+ "rewards/margins": 0.36496061086654663,
2241
+ "rewards/rejected": -0.24495668709278107,
2242
+ "step": 1340
2243
+ },
2244
+ {
2245
+ "epoch": 0.8832188420019627,
2246
+ "grad_norm": 6.53125,
2247
+ "learning_rate": 2.0391542766819456e-07,
2248
+ "logits/chosen": -2.473135471343994,
2249
+ "logits/rejected": -2.308255195617676,
2250
+ "logps/chosen": -278.90167236328125,
2251
+ "logps/rejected": -262.17694091796875,
2252
+ "loss": 1.3217,
2253
+ "rewards/accuracies": 0.800000011920929,
2254
+ "rewards/chosen": 0.08379217237234116,
2255
+ "rewards/margins": 0.525702953338623,
2256
+ "rewards/rejected": -0.4419107437133789,
2257
+ "step": 1350
2258
+ },
2259
+ {
2260
+ "epoch": 0.8897612037945698,
2261
+ "grad_norm": 8.1875,
2262
+ "learning_rate": 1.8192150737583264e-07,
2263
+ "logits/chosen": -2.471273899078369,
2264
+ "logits/rejected": -2.355404853820801,
2265
+ "logps/chosen": -262.5338439941406,
2266
+ "logps/rejected": -240.73681640625,
2267
+ "loss": 1.3361,
2268
+ "rewards/accuracies": 0.675000011920929,
2269
+ "rewards/chosen": 0.20122098922729492,
2270
+ "rewards/margins": 0.45535793900489807,
2271
+ "rewards/rejected": -0.25413697957992554,
2272
+ "step": 1360
2273
+ },
2274
+ {
2275
+ "epoch": 0.8963035655871769,
2276
+ "grad_norm": 8.75,
2277
+ "learning_rate": 1.61137637857158e-07,
2278
+ "logits/chosen": -2.5680222511291504,
2279
+ "logits/rejected": -2.452162027359009,
2280
+ "logps/chosen": -331.45050048828125,
2281
+ "logps/rejected": -277.1683654785156,
2282
+ "loss": 1.3416,
2283
+ "rewards/accuracies": 0.7749999761581421,
2284
+ "rewards/chosen": 0.10738255083560944,
2285
+ "rewards/margins": 0.42853283882141113,
2286
+ "rewards/rejected": -0.3211502432823181,
2287
+ "step": 1370
2288
+ },
2289
+ {
2290
+ "epoch": 0.9028459273797841,
2291
+ "grad_norm": 6.6875,
2292
+ "learning_rate": 1.415746684157951e-07,
2293
+ "logits/chosen": -2.469827890396118,
2294
+ "logits/rejected": -2.433368444442749,
2295
+ "logps/chosen": -222.18994140625,
2296
+ "logps/rejected": -224.70907592773438,
2297
+ "loss": 1.3334,
2298
+ "rewards/accuracies": 0.75,
2299
+ "rewards/chosen": 0.13235194981098175,
2300
+ "rewards/margins": 0.4484756886959076,
2301
+ "rewards/rejected": -0.3161238133907318,
2302
+ "step": 1380
2303
+ },
2304
+ {
2305
+ "epoch": 0.9093882891723912,
2306
+ "grad_norm": 7.03125,
2307
+ "learning_rate": 1.232428110382586e-07,
2308
+ "logits/chosen": -2.569434642791748,
2309
+ "logits/rejected": -2.4922871589660645,
2310
+ "logps/chosen": -249.40328979492188,
2311
+ "logps/rejected": -248.40054321289062,
2312
+ "loss": 1.3386,
2313
+ "rewards/accuracies": 0.675000011920929,
2314
+ "rewards/chosen": -0.005796324461698532,
2315
+ "rewards/margins": 0.4979252815246582,
2316
+ "rewards/rejected": -0.5037215948104858,
2317
+ "step": 1390
2318
+ },
2319
+ {
2320
+ "epoch": 0.9159306509649984,
2321
+ "grad_norm": 6.9375,
2322
+ "learning_rate": 1.0615163506323856e-07,
2323
+ "logits/chosen": -2.4826645851135254,
2324
+ "logits/rejected": -2.3935964107513428,
2325
+ "logps/chosen": -256.60003662109375,
2326
+ "logps/rejected": -247.3723602294922,
2327
+ "loss": 1.3398,
2328
+ "rewards/accuracies": 0.6875,
2329
+ "rewards/chosen": 0.10640044510364532,
2330
+ "rewards/margins": 0.3504992723464966,
2331
+ "rewards/rejected": -0.24409881234169006,
2332
+ "step": 1400
2333
+ },
2334
+ {
2335
+ "epoch": 0.9159306509649984,
2336
+ "eval_logits/chosen": -2.3116648197174072,
2337
+ "eval_logits/rejected": -2.2316980361938477,
2338
+ "eval_logps/chosen": -270.7674560546875,
2339
+ "eval_logps/rejected": -257.3534851074219,
2340
+ "eval_loss": 1.3414233922958374,
2341
+ "eval_rewards/accuracies": 0.7475000023841858,
2342
+ "eval_rewards/chosen": 0.14090880751609802,
2343
+ "eval_rewards/margins": 0.46596458554267883,
2344
+ "eval_rewards/rejected": -0.3250557780265808,
2345
+ "eval_runtime": 192.5164,
2346
+ "eval_samples_per_second": 10.389,
2347
+ "eval_steps_per_second": 0.519,
2348
+ "step": 1400
2349
+ },
2350
+ {
2351
+ "epoch": 0.9224730127576055,
2352
+ "grad_norm": 6.5625,
2353
+ "learning_rate": 9.031006218634342e-08,
2354
+ "logits/chosen": -2.4582056999206543,
2355
+ "logits/rejected": -2.4208710193634033,
2356
+ "logps/chosen": -294.5101623535156,
2357
+ "logps/rejected": -322.93707275390625,
2358
+ "loss": 1.3372,
2359
+ "rewards/accuracies": 0.7124999761581421,
2360
+ "rewards/chosen": 0.17300447821617126,
2361
+ "rewards/margins": 0.528673529624939,
2362
+ "rewards/rejected": -0.3556690812110901,
2363
+ "step": 1410
2364
+ },
2365
+ {
2366
+ "epoch": 0.9290153745502127,
2367
+ "grad_norm": 7.46875,
2368
+ "learning_rate": 7.572636180292831e-08,
2369
+ "logits/chosen": -2.4200243949890137,
2370
+ "logits/rejected": -2.357060194015503,
2371
+ "logps/chosen": -254.03231811523438,
2372
+ "logps/rejected": -243.3819580078125,
2373
+ "loss": 1.3276,
2374
+ "rewards/accuracies": 0.7250000238418579,
2375
+ "rewards/chosen": 0.08049002289772034,
2376
+ "rewards/margins": 0.46086424589157104,
2377
+ "rewards/rejected": -0.3803742825984955,
2378
+ "step": 1420
2379
+ },
2380
+ {
2381
+ "epoch": 0.9355577363428198,
2382
+ "grad_norm": 7.4375,
2383
+ "learning_rate": 6.240814669141559e-08,
2384
+ "logits/chosen": -2.4541358947753906,
2385
+ "logits/rejected": -2.4223341941833496,
2386
+ "logps/chosen": -266.42156982421875,
2387
+ "logps/rejected": -304.84600830078125,
2388
+ "loss": 1.3296,
2389
+ "rewards/accuracies": 0.7250000238418579,
2390
+ "rewards/chosen": 0.07571353018283844,
2391
+ "rewards/margins": 0.49027562141418457,
2392
+ "rewards/rejected": -0.4145621359348297,
2393
+ "step": 1430
2394
+ },
2395
+ {
2396
+ "epoch": 0.9421000981354269,
2397
+ "grad_norm": 8.25,
2398
+ "learning_rate": 5.036236903938285e-08,
2399
+ "logits/chosen": -2.5123915672302246,
2400
+ "logits/rejected": -2.4232895374298096,
2401
+ "logps/chosen": -289.10601806640625,
2402
+ "logps/rejected": -232.24868774414062,
2403
+ "loss": 1.3312,
2404
+ "rewards/accuracies": 0.7250000238418579,
2405
+ "rewards/chosen": 0.08486177772283554,
2406
+ "rewards/margins": 0.33679136633872986,
2407
+ "rewards/rejected": -0.2519296109676361,
2408
+ "step": 1440
2409
+ },
2410
+ {
2411
+ "epoch": 0.948642459928034,
2412
+ "grad_norm": 6.90625,
2413
+ "learning_rate": 3.959531681447859e-08,
2414
+ "logits/chosen": -2.3544886112213135,
2415
+ "logits/rejected": -2.306253433227539,
2416
+ "logps/chosen": -259.6851501464844,
2417
+ "logps/rejected": -238.7523956298828,
2418
+ "loss": 1.3431,
2419
+ "rewards/accuracies": 0.699999988079071,
2420
+ "rewards/chosen": 0.04705673083662987,
2421
+ "rewards/margins": 0.3753374218940735,
2422
+ "rewards/rejected": -0.3282806873321533,
2423
+ "step": 1450
2424
+ },
2425
+ {
2426
+ "epoch": 0.9551848217206411,
2427
+ "grad_norm": 7.03125,
2428
+ "learning_rate": 3.0112610482064544e-08,
2429
+ "logits/chosen": -2.586221933364868,
2430
+ "logits/rejected": -2.4770185947418213,
2431
+ "logps/chosen": -299.3462829589844,
2432
+ "logps/rejected": -263.6593017578125,
2433
+ "loss": 1.342,
2434
+ "rewards/accuracies": 0.7124999761581421,
2435
+ "rewards/chosen": 0.11202242225408554,
2436
+ "rewards/margins": 0.4375079572200775,
2437
+ "rewards/rejected": -0.3254855275154114,
2438
+ "step": 1460
2439
+ },
2440
+ {
2441
+ "epoch": 0.9617271835132483,
2442
+ "grad_norm": 6.96875,
2443
+ "learning_rate": 2.1919200071301715e-08,
2444
+ "logits/chosen": -2.5899202823638916,
2445
+ "logits/rejected": -2.45597767829895,
2446
+ "logps/chosen": -292.31683349609375,
2447
+ "logps/rejected": -233.8472900390625,
2448
+ "loss": 1.3513,
2449
+ "rewards/accuracies": 0.7250000238418579,
2450
+ "rewards/chosen": 0.09856925159692764,
2451
+ "rewards/margins": 0.4128998816013336,
2452
+ "rewards/rejected": -0.31433066725730896,
2453
+ "step": 1470
2454
+ },
2455
+ {
2456
+ "epoch": 0.9682695453058554,
2457
+ "grad_norm": 6.78125,
2458
+ "learning_rate": 1.50193625912029e-08,
2459
+ "logits/chosen": -2.3694183826446533,
2460
+ "logits/rejected": -2.3877651691436768,
2461
+ "logps/chosen": -229.5559539794922,
2462
+ "logps/rejected": -265.0757141113281,
2463
+ "loss": 1.3336,
2464
+ "rewards/accuracies": 0.737500011920929,
2465
+ "rewards/chosen": -0.014900955371558666,
2466
+ "rewards/margins": 0.37212496995925903,
2467
+ "rewards/rejected": -0.38702592253685,
2468
+ "step": 1480
2469
+ },
2470
+ {
2471
+ "epoch": 0.9748119070984625,
2472
+ "grad_norm": 8.25,
2473
+ "learning_rate": 9.416699798010521e-09,
2474
+ "logits/chosen": -2.3672139644622803,
2475
+ "logits/rejected": -2.2716662883758545,
2476
+ "logps/chosen": -253.46322631835938,
2477
+ "logps/rejected": -208.71768188476562,
2478
+ "loss": 1.3194,
2479
+ "rewards/accuracies": 0.7875000238418579,
2480
+ "rewards/chosen": 0.11514721810817719,
2481
+ "rewards/margins": 0.5433107018470764,
2482
+ "rewards/rejected": -0.42816343903541565,
2483
+ "step": 1490
2484
+ },
2485
+ {
2486
+ "epoch": 0.9813542688910697,
2487
+ "grad_norm": 6.5,
2488
+ "learning_rate": 5.114136315058083e-09,
2489
+ "logits/chosen": -2.3912510871887207,
2490
+ "logits/rejected": -2.3454089164733887,
2491
+ "logps/chosen": -262.8589782714844,
2492
+ "logps/rejected": -278.1678161621094,
2493
+ "loss": 1.325,
2494
+ "rewards/accuracies": 0.800000011920929,
2495
+ "rewards/chosen": 0.12093720585107803,
2496
+ "rewards/margins": 0.5334186553955078,
2497
+ "rewards/rejected": -0.41248148679733276,
2498
+ "step": 1500
2499
+ },
2500
+ {
2501
+ "epoch": 0.9813542688910697,
2502
+ "eval_logits/chosen": -2.313671827316284,
2503
+ "eval_logits/rejected": -2.23390793800354,
2504
+ "eval_logps/chosen": -270.74359130859375,
2505
+ "eval_logps/rejected": -257.3707275390625,
2506
+ "eval_loss": 1.3408766984939575,
2507
+ "eval_rewards/accuracies": 0.7475000023841858,
2508
+ "eval_rewards/chosen": 0.14329300820827484,
2509
+ "eval_rewards/margins": 0.47007519006729126,
2510
+ "eval_rewards/rejected": -0.3267821967601776,
2511
+ "eval_runtime": 192.5128,
2512
+ "eval_samples_per_second": 10.389,
2513
+ "eval_steps_per_second": 0.519,
2514
+ "step": 1500
2515
+ },
2516
+ {
2517
+ "epoch": 0.9878966306836768,
2518
+ "grad_norm": 7.09375,
2519
+ "learning_rate": 2.113918106098345e-09,
2520
+ "logits/chosen": -2.456124782562256,
2521
+ "logits/rejected": -2.4603703022003174,
2522
+ "logps/chosen": -279.5404052734375,
2523
+ "logps/rejected": -286.7142333984375,
2524
+ "loss": 1.3363,
2525
+ "rewards/accuracies": 0.675000011920929,
2526
+ "rewards/chosen": 0.1024564728140831,
2527
+ "rewards/margins": 0.3759761452674866,
2528
+ "rewards/rejected": -0.2735196650028229,
2529
+ "step": 1510
2530
+ },
2531
+ {
2532
+ "epoch": 0.994438992476284,
2533
+ "grad_norm": 8.6875,
2534
+ "learning_rate": 4.176113028983575e-10,
2535
+ "logits/chosen": -2.407866954803467,
2536
+ "logits/rejected": -2.355156421661377,
2537
+ "logps/chosen": -266.18231201171875,
2538
+ "logps/rejected": -265.04022216796875,
2539
+ "loss": 1.337,
2540
+ "rewards/accuracies": 0.675000011920929,
2541
+ "rewards/chosen": 0.07989536225795746,
2542
+ "rewards/margins": 0.4557567238807678,
2543
+ "rewards/rejected": -0.37586134672164917,
2544
+ "step": 1520
2545
+ },
2546
+ {
2547
+ "epoch": 0.9996728819103696,
2548
+ "step": 1528,
2549
+ "total_flos": 0.0,
2550
+ "train_loss": 1.343712306771603,
2551
+ "train_runtime": 13958.9948,
2552
+ "train_samples_per_second": 4.38,
2553
+ "train_steps_per_second": 0.109
2554
+ }
2555
+ ],
2556
+ "logging_steps": 10,
2557
+ "max_steps": 1528,
2558
+ "num_input_tokens_seen": 0,
2559
+ "num_train_epochs": 1,
2560
+ "save_steps": 100,
2561
+ "stateful_callbacks": {
2562
+ "TrainerControl": {
2563
+ "args": {
2564
+ "should_epoch_stop": false,
2565
+ "should_evaluate": false,
2566
+ "should_log": false,
2567
+ "should_save": true,
2568
+ "should_training_stop": true
2569
+ },
2570
+ "attributes": {}
2571
+ }
2572
+ },
2573
+ "total_flos": 0.0,
2574
+ "train_batch_size": 2,
2575
+ "trial_name": null,
2576
+ "trial_params": null
2577
+ }