Kimory-X committed on
Commit
87c424a
·
verified ·
1 Parent(s): 00838a7

Model save

Browse files
Files changed (4) hide show
  1. README.md +90 -0
  2. all_results.json +9 -0
  3. train_results.json +9 -0
  4. trainer_state.json +2562 -0
README.md ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: mistralai/Mistral-7B-v0.1
4
+ tags:
5
+ - trl
6
+ - dpo
7
+ - generated_from_trainer
8
+ library_name: peft
9
+ model-index:
10
+ - name: zephyr-7b-mypo3_sim-qlora-lr5e6-beta0.30
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/1014579852qq-tsinghua-university/huggingface/runs/wnnhas0b)
18
+ # zephyr-7b-mypo3_sim-qlora-lr5e6-beta0.30
19
+
20
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on an unknown dataset.
21
+ It achieves the following results on the evaluation set:
22
+ - Loss: 1.3541
23
+ - Rewards/chosen: -0.0637
24
+ - Rewards/rejected: -0.3677
25
+ - Rewards/accuracies: 0.7025
26
+ - Rewards/margins: 0.3039
27
+ - Logps/rejected: -2.2834
28
+ - Logps/chosen: -1.1380
29
+ - Logits/rejected: -1.9704
30
+ - Logits/chosen: -2.0513
31
+
32
+ ## Model description
33
+
34
+ More information needed
35
+
36
+ ## Intended uses & limitations
37
+
38
+ More information needed
39
+
40
+ ## Training and evaluation data
41
+
42
+ More information needed
43
+
44
+ ## Training procedure
45
+
46
+ ### Training hyperparameters
47
+
48
+ The following hyperparameters were used during training:
49
+ - learning_rate: 5e-06
50
+ - train_batch_size: 2
51
+ - eval_batch_size: 4
52
+ - seed: 42
53
+ - distributed_type: multi-GPU
54
+ - num_devices: 5
55
+ - gradient_accumulation_steps: 4
56
+ - total_train_batch_size: 40
57
+ - total_eval_batch_size: 20
58
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
59
+ - lr_scheduler_type: cosine
60
+ - lr_scheduler_warmup_ratio: 0.1
61
+ - num_epochs: 1
62
+
63
+ ### Training results
64
+
65
+ | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
66
+ |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
67
+ | 1.3799 | 0.0654 | 100 | 1.3804 | -0.0062 | -0.0408 | 0.6600 | 0.0346 | -1.1940 | -0.9462 | -2.1974 | -2.2810 |
68
+ | 1.3728 | 0.1308 | 200 | 1.3734 | -0.0308 | -0.1119 | 0.6900 | 0.0811 | -1.4310 | -1.0283 | -2.2618 | -2.3330 |
69
+ | 1.3605 | 0.1963 | 300 | 1.3670 | -0.0656 | -0.2070 | 0.7200 | 0.1414 | -1.7478 | -1.1442 | -2.1971 | -2.2674 |
70
+ | 1.3607 | 0.2617 | 400 | 1.3637 | -0.0644 | -0.2551 | 0.6975 | 0.1908 | -1.9084 | -1.1401 | -2.2602 | -2.3277 |
71
+ | 1.3642 | 0.3271 | 500 | 1.3625 | -0.0744 | -0.3109 | 0.6875 | 0.2366 | -2.0943 | -1.1734 | -2.1841 | -2.2534 |
72
+ | 1.3489 | 0.3925 | 600 | 1.3649 | -0.1095 | -0.4197 | 0.6850 | 0.3101 | -2.4568 | -1.2906 | -2.0263 | -2.1039 |
73
+ | 1.3735 | 0.4580 | 700 | 1.3653 | -0.1046 | -0.4143 | 0.7000 | 0.3097 | -2.4389 | -1.2743 | -1.9237 | -2.0155 |
74
+ | 1.3606 | 0.5234 | 800 | 1.3592 | -0.0745 | -0.3701 | 0.6950 | 0.2956 | -2.2915 | -1.1739 | -1.9493 | -2.0356 |
75
+ | 1.3462 | 0.5888 | 900 | 1.3568 | -0.0854 | -0.3668 | 0.7050 | 0.2815 | -2.2807 | -1.2100 | -1.9785 | -2.0609 |
76
+ | 1.3527 | 0.6542 | 1000 | 1.3548 | -0.0626 | -0.3514 | 0.7050 | 0.2888 | -2.2291 | -1.1342 | -1.9978 | -2.0771 |
77
+ | 1.3483 | 0.7197 | 1100 | 1.3558 | -0.0665 | -0.3741 | 0.7025 | 0.3076 | -2.3048 | -1.1471 | -1.9802 | -2.0598 |
78
+ | 1.3558 | 0.7851 | 1200 | 1.3542 | -0.0628 | -0.3646 | 0.7050 | 0.3018 | -2.2733 | -1.1348 | -1.9719 | -2.0522 |
79
+ | 1.3515 | 0.8505 | 1300 | 1.3543 | -0.0644 | -0.3702 | 0.7050 | 0.3058 | -2.2918 | -1.1402 | -1.9694 | -2.0505 |
80
+ | 1.3572 | 0.9159 | 1400 | 1.3540 | -0.0639 | -0.3674 | 0.7075 | 0.3035 | -2.2825 | -1.1385 | -1.9716 | -2.0522 |
81
+ | 1.3527 | 0.9814 | 1500 | 1.3541 | -0.0637 | -0.3677 | 0.7025 | 0.3039 | -2.2834 | -1.1380 | -1.9704 | -2.0513 |
82
+
83
+
84
+ ### Framework versions
85
+
86
+ - PEFT 0.10.0
87
+ - Transformers 4.43.1
88
+ - Pytorch 2.1.2+cu121
89
+ - Datasets 2.18.0
90
+ - Tokenizers 0.19.1
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9996728819103696,
3
+ "total_flos": 0.0,
4
+ "train_loss": 1.3600662709530735,
5
+ "train_runtime": 13966.5548,
6
+ "train_samples": 61134,
7
+ "train_samples_per_second": 4.377,
8
+ "train_steps_per_second": 0.109
9
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 0.9996728819103696,
3
+ "total_flos": 0.0,
4
+ "train_loss": 1.3600662709530735,
5
+ "train_runtime": 13966.5548,
6
+ "train_samples": 61134,
7
+ "train_samples_per_second": 4.377,
8
+ "train_steps_per_second": 0.109
9
+ }
trainer_state.json ADDED
@@ -0,0 +1,2562 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9996728819103696,
5
+ "eval_steps": 100,
6
+ "global_step": 1528,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.006542361792607131,
13
+ "grad_norm": 0.2294921875,
14
+ "learning_rate": 3.267973856209151e-07,
15
+ "logits/chosen": -2.5530099868774414,
16
+ "logits/rejected": -2.454012632369995,
17
+ "logps/chosen": -0.937676727771759,
18
+ "logps/rejected": -1.109251856803894,
19
+ "loss": 1.3863,
20
+ "rewards/accuracies": 0.4124999940395355,
21
+ "rewards/chosen": -9.286254498874769e-05,
22
+ "rewards/margins": -7.532918971264735e-05,
23
+ "rewards/rejected": -1.7533380741951987e-05,
24
+ "step": 10
25
+ },
26
+ {
27
+ "epoch": 0.013084723585214262,
28
+ "grad_norm": 0.1435546875,
29
+ "learning_rate": 6.535947712418302e-07,
30
+ "logits/chosen": -2.480818510055542,
31
+ "logits/rejected": -2.3557686805725098,
32
+ "logps/chosen": -0.9598627090454102,
33
+ "logps/rejected": -0.9752475619316101,
34
+ "loss": 1.3863,
35
+ "rewards/accuracies": 0.5,
36
+ "rewards/chosen": -7.735938197583891e-06,
37
+ "rewards/margins": 0.00011255677964072675,
38
+ "rewards/rejected": -0.00012029272329527885,
39
+ "step": 20
40
+ },
41
+ {
42
+ "epoch": 0.019627085377821395,
43
+ "grad_norm": 0.1630859375,
44
+ "learning_rate": 9.80392156862745e-07,
45
+ "logits/chosen": -2.4215729236602783,
46
+ "logits/rejected": -2.3662164211273193,
47
+ "logps/chosen": -0.9298206567764282,
48
+ "logps/rejected": -1.1405647993087769,
49
+ "loss": 1.3862,
50
+ "rewards/accuracies": 0.550000011920929,
51
+ "rewards/chosen": -0.00022919184993952513,
52
+ "rewards/margins": 0.0003895942063536495,
53
+ "rewards/rejected": -0.0006187860853970051,
54
+ "step": 30
55
+ },
56
+ {
57
+ "epoch": 0.026169447170428524,
58
+ "grad_norm": 0.2197265625,
59
+ "learning_rate": 1.3071895424836604e-06,
60
+ "logits/chosen": -2.3722825050354004,
61
+ "logits/rejected": -2.3141565322875977,
62
+ "logps/chosen": -1.0184338092803955,
63
+ "logps/rejected": -1.0292493104934692,
64
+ "loss": 1.386,
65
+ "rewards/accuracies": 0.5375000238418579,
66
+ "rewards/chosen": 0.00016174458141904324,
67
+ "rewards/margins": 0.00046144178486429155,
68
+ "rewards/rejected": -0.00029969721799716353,
69
+ "step": 40
70
+ },
71
+ {
72
+ "epoch": 0.03271180896303565,
73
+ "grad_norm": 0.1962890625,
74
+ "learning_rate": 1.6339869281045753e-06,
75
+ "logits/chosen": -2.4260201454162598,
76
+ "logits/rejected": -2.3685925006866455,
77
+ "logps/chosen": -0.840053915977478,
78
+ "logps/rejected": -1.0239614248275757,
79
+ "loss": 1.3853,
80
+ "rewards/accuracies": 0.762499988079071,
81
+ "rewards/chosen": 0.0067680226638913155,
82
+ "rewards/margins": 0.005519701633602381,
83
+ "rewards/rejected": 0.0012483217287808657,
84
+ "step": 50
85
+ },
86
+ {
87
+ "epoch": 0.03925417075564279,
88
+ "grad_norm": 0.251953125,
89
+ "learning_rate": 1.96078431372549e-06,
90
+ "logits/chosen": -2.5593574047088623,
91
+ "logits/rejected": -2.252335786819458,
92
+ "logps/chosen": -0.8980849981307983,
93
+ "logps/rejected": -1.0699880123138428,
94
+ "loss": 1.385,
95
+ "rewards/accuracies": 0.6625000238418579,
96
+ "rewards/chosen": 0.0039029642939567566,
97
+ "rewards/margins": 0.0045288316905498505,
98
+ "rewards/rejected": -0.0006258675130084157,
99
+ "step": 60
100
+ },
101
+ {
102
+ "epoch": 0.04579653254824992,
103
+ "grad_norm": 0.271484375,
104
+ "learning_rate": 2.2875816993464053e-06,
105
+ "logits/chosen": -2.476616382598877,
106
+ "logits/rejected": -2.4485809803009033,
107
+ "logps/chosen": -0.987118124961853,
108
+ "logps/rejected": -1.2191505432128906,
109
+ "loss": 1.3832,
110
+ "rewards/accuracies": 0.637499988079071,
111
+ "rewards/chosen": -0.01106469426304102,
112
+ "rewards/margins": 0.01529990416020155,
113
+ "rewards/rejected": -0.02636459842324257,
114
+ "step": 70
115
+ },
116
+ {
117
+ "epoch": 0.05233889434085705,
118
+ "grad_norm": 0.30078125,
119
+ "learning_rate": 2.6143790849673208e-06,
120
+ "logits/chosen": -2.353388786315918,
121
+ "logits/rejected": -2.254671335220337,
122
+ "logps/chosen": -1.0187981128692627,
123
+ "logps/rejected": -1.1903177499771118,
124
+ "loss": 1.3827,
125
+ "rewards/accuracies": 0.5874999761581421,
126
+ "rewards/chosen": -0.03571772575378418,
127
+ "rewards/margins": 0.013263550586998463,
128
+ "rewards/rejected": -0.04898127168416977,
129
+ "step": 80
130
+ },
131
+ {
132
+ "epoch": 0.058881256133464184,
133
+ "grad_norm": 0.33984375,
134
+ "learning_rate": 2.9411764705882355e-06,
135
+ "logits/chosen": -2.3827168941497803,
136
+ "logits/rejected": -2.307100772857666,
137
+ "logps/chosen": -0.9496015310287476,
138
+ "logps/rejected": -1.1855210065841675,
139
+ "loss": 1.3815,
140
+ "rewards/accuracies": 0.699999988079071,
141
+ "rewards/chosen": -0.018529271706938744,
142
+ "rewards/margins": 0.020624618977308273,
143
+ "rewards/rejected": -0.03915388882160187,
144
+ "step": 90
145
+ },
146
+ {
147
+ "epoch": 0.0654236179260713,
148
+ "grad_norm": 0.2333984375,
149
+ "learning_rate": 3.2679738562091506e-06,
150
+ "logits/chosen": -2.4862723350524902,
151
+ "logits/rejected": -2.4109530448913574,
152
+ "logps/chosen": -0.9562190175056458,
153
+ "logps/rejected": -1.266717553138733,
154
+ "loss": 1.3799,
155
+ "rewards/accuracies": 0.5625,
156
+ "rewards/chosen": -0.022339094430208206,
157
+ "rewards/margins": 0.035685621201992035,
158
+ "rewards/rejected": -0.05802471563220024,
159
+ "step": 100
160
+ },
161
+ {
162
+ "epoch": 0.0654236179260713,
163
+ "eval_logits/chosen": -2.2809672355651855,
164
+ "eval_logits/rejected": -2.1974074840545654,
165
+ "eval_logps/chosen": -0.9461590647697449,
166
+ "eval_logps/rejected": -1.194018840789795,
167
+ "eval_loss": 1.3803561925888062,
168
+ "eval_rewards/accuracies": 0.6600000262260437,
169
+ "eval_rewards/chosen": -0.006188389845192432,
170
+ "eval_rewards/margins": 0.034646403044462204,
171
+ "eval_rewards/rejected": -0.04083478823304176,
172
+ "eval_runtime": 193.2216,
173
+ "eval_samples_per_second": 10.351,
174
+ "eval_steps_per_second": 0.518,
175
+ "step": 100
176
+ },
177
+ {
178
+ "epoch": 0.07196597971867845,
179
+ "grad_norm": 0.80859375,
180
+ "learning_rate": 3.5947712418300657e-06,
181
+ "logits/chosen": -2.346219301223755,
182
+ "logits/rejected": -2.3201358318328857,
183
+ "logps/chosen": -1.0299032926559448,
184
+ "logps/rejected": -1.1942877769470215,
185
+ "loss": 1.3801,
186
+ "rewards/accuracies": 0.6625000238418579,
187
+ "rewards/chosen": -0.01987696811556816,
188
+ "rewards/margins": 0.039442867040634155,
189
+ "rewards/rejected": -0.05931984260678291,
190
+ "step": 110
191
+ },
192
+ {
193
+ "epoch": 0.07850834151128558,
194
+ "grad_norm": 0.427734375,
195
+ "learning_rate": 3.92156862745098e-06,
196
+ "logits/chosen": -2.382317543029785,
197
+ "logits/rejected": -2.2658298015594482,
198
+ "logps/chosen": -0.9744423627853394,
199
+ "logps/rejected": -1.2910969257354736,
200
+ "loss": 1.3775,
201
+ "rewards/accuracies": 0.737500011920929,
202
+ "rewards/chosen": -0.018291741609573364,
203
+ "rewards/margins": 0.041886262595653534,
204
+ "rewards/rejected": -0.0601780004799366,
205
+ "step": 120
206
+ },
207
+ {
208
+ "epoch": 0.08505070330389271,
209
+ "grad_norm": 1.59375,
210
+ "learning_rate": 4.2483660130718954e-06,
211
+ "logits/chosen": -2.4718565940856934,
212
+ "logits/rejected": -2.3606743812561035,
213
+ "logps/chosen": -1.1133302450180054,
214
+ "logps/rejected": -1.388488531112671,
215
+ "loss": 1.3854,
216
+ "rewards/accuracies": 0.612500011920929,
217
+ "rewards/chosen": -0.06918147951364517,
218
+ "rewards/margins": 0.031170833855867386,
219
+ "rewards/rejected": -0.10035230964422226,
220
+ "step": 130
221
+ },
222
+ {
223
+ "epoch": 0.09159306509649984,
224
+ "grad_norm": 0.5234375,
225
+ "learning_rate": 4.5751633986928105e-06,
226
+ "logits/chosen": -2.500530242919922,
227
+ "logits/rejected": -2.386270523071289,
228
+ "logps/chosen": -0.9971791505813599,
229
+ "logps/rejected": -1.2755845785140991,
230
+ "loss": 1.375,
231
+ "rewards/accuracies": 0.6625000238418579,
232
+ "rewards/chosen": -0.0014015674823895097,
233
+ "rewards/margins": 0.06071232631802559,
234
+ "rewards/rejected": -0.06211389973759651,
235
+ "step": 140
236
+ },
237
+ {
238
+ "epoch": 0.09813542688910697,
239
+ "grad_norm": 0.41015625,
240
+ "learning_rate": 4.901960784313726e-06,
241
+ "logits/chosen": -2.470759391784668,
242
+ "logits/rejected": -2.4136905670166016,
243
+ "logps/chosen": -1.0106420516967773,
244
+ "logps/rejected": -1.3554531335830688,
245
+ "loss": 1.3771,
246
+ "rewards/accuracies": 0.762499988079071,
247
+ "rewards/chosen": -0.013763192109763622,
248
+ "rewards/margins": 0.06293424218893051,
249
+ "rewards/rejected": -0.076697438955307,
250
+ "step": 150
251
+ },
252
+ {
253
+ "epoch": 0.1046777886817141,
254
+ "grad_norm": 0.6953125,
255
+ "learning_rate": 4.999680264259825e-06,
256
+ "logits/chosen": -2.5543665885925293,
257
+ "logits/rejected": -2.3195226192474365,
258
+ "logps/chosen": -0.8638038635253906,
259
+ "logps/rejected": -1.1180086135864258,
260
+ "loss": 1.3734,
261
+ "rewards/accuracies": 0.7124999761581421,
262
+ "rewards/chosen": 9.241541556548327e-05,
263
+ "rewards/margins": 0.05282307416200638,
264
+ "rewards/rejected": -0.052730657160282135,
265
+ "step": 160
266
+ },
267
+ {
268
+ "epoch": 0.11122015047432122,
269
+ "grad_norm": 0.6171875,
270
+ "learning_rate": 4.998114408534616e-06,
271
+ "logits/chosen": -2.4441475868225098,
272
+ "logits/rejected": -2.248377561569214,
273
+ "logps/chosen": -0.9274452328681946,
274
+ "logps/rejected": -1.4115386009216309,
275
+ "loss": 1.3736,
276
+ "rewards/accuracies": 0.6875,
277
+ "rewards/chosen": -0.019874438643455505,
278
+ "rewards/margins": 0.09404312819242477,
279
+ "rewards/rejected": -0.11391757428646088,
280
+ "step": 170
281
+ },
282
+ {
283
+ "epoch": 0.11776251226692837,
284
+ "grad_norm": 0.53515625,
285
+ "learning_rate": 4.995244522215781e-06,
286
+ "logits/chosen": -2.4685444831848145,
287
+ "logits/rejected": -2.441394329071045,
288
+ "logps/chosen": -1.172699213027954,
289
+ "logps/rejected": -1.4395049810409546,
290
+ "loss": 1.373,
291
+ "rewards/accuracies": 0.612500011920929,
292
+ "rewards/chosen": -0.07093976438045502,
293
+ "rewards/margins": 0.05027998611330986,
294
+ "rewards/rejected": -0.12121973931789398,
295
+ "step": 180
296
+ },
297
+ {
298
+ "epoch": 0.1243048740595355,
299
+ "grad_norm": 1.25,
300
+ "learning_rate": 4.9910721034010655e-06,
301
+ "logits/chosen": -2.487239360809326,
302
+ "logits/rejected": -2.382844924926758,
303
+ "logps/chosen": -0.9922649264335632,
304
+ "logps/rejected": -1.4326521158218384,
305
+ "loss": 1.3774,
306
+ "rewards/accuracies": 0.675000011920929,
307
+ "rewards/chosen": -0.02712084725499153,
308
+ "rewards/margins": 0.08723781257867813,
309
+ "rewards/rejected": -0.11435864865779877,
310
+ "step": 190
311
+ },
312
+ {
313
+ "epoch": 0.1308472358521426,
314
+ "grad_norm": 0.333984375,
315
+ "learning_rate": 4.985599330117931e-06,
316
+ "logits/chosen": -2.43881893157959,
317
+ "logits/rejected": -2.389829635620117,
318
+ "logps/chosen": -0.9726101160049438,
319
+ "logps/rejected": -1.330091953277588,
320
+ "loss": 1.3728,
321
+ "rewards/accuracies": 0.699999988079071,
322
+ "rewards/chosen": -0.026006106287240982,
323
+ "rewards/margins": 0.07863597571849823,
324
+ "rewards/rejected": -0.10464207828044891,
325
+ "step": 200
326
+ },
327
+ {
328
+ "epoch": 0.1308472358521426,
329
+ "eval_logits/chosen": -2.332975387573242,
330
+ "eval_logits/rejected": -2.261765241622925,
331
+ "eval_logps/chosen": -1.0283117294311523,
332
+ "eval_logps/rejected": -1.4310046434402466,
333
+ "eval_loss": 1.373443603515625,
334
+ "eval_rewards/accuracies": 0.6899999976158142,
335
+ "eval_rewards/chosen": -0.030834216624498367,
336
+ "eval_rewards/margins": 0.08109636604785919,
337
+ "eval_rewards/rejected": -0.11193057149648666,
338
+ "eval_runtime": 192.5903,
339
+ "eval_samples_per_second": 10.385,
340
+ "eval_steps_per_second": 0.519,
341
+ "step": 200
342
+ },
343
+ {
344
+ "epoch": 0.13738959764474976,
345
+ "grad_norm": 0.33984375,
346
+ "learning_rate": 4.978829059186611e-06,
347
+ "logits/chosen": -2.504112482070923,
348
+ "logits/rejected": -2.4700379371643066,
349
+ "logps/chosen": -1.1215603351593018,
350
+ "logps/rejected": -1.4604188203811646,
351
+ "loss": 1.3733,
352
+ "rewards/accuracies": 0.7250000238418579,
353
+ "rewards/chosen": -0.042933590710163116,
354
+ "rewards/margins": 0.06642533093690872,
355
+ "rewards/rejected": -0.10935892164707184,
356
+ "step": 210
357
+ },
358
+ {
359
+ "epoch": 0.1439319594373569,
360
+ "grad_norm": 0.6484375,
361
+ "learning_rate": 4.97076482472884e-06,
362
+ "logits/chosen": -2.3071188926696777,
363
+ "logits/rejected": -2.3290657997131348,
364
+ "logps/chosen": -1.1499958038330078,
365
+ "logps/rejected": -1.5844924449920654,
366
+ "loss": 1.3679,
367
+ "rewards/accuracies": 0.6875,
368
+ "rewards/chosen": -0.054123081266880035,
369
+ "rewards/margins": 0.1018981784582138,
370
+ "rewards/rejected": -0.15602126717567444,
371
+ "step": 220
372
+ },
373
+ {
374
+ "epoch": 0.15047432122996401,
375
+ "grad_norm": 0.44140625,
376
+ "learning_rate": 4.961410836323014e-06,
377
+ "logits/chosen": -2.4832520484924316,
378
+ "logits/rejected": -2.376649856567383,
379
+ "logps/chosen": -1.0944504737854004,
380
+ "logps/rejected": -1.3434072732925415,
381
+ "loss": 1.3733,
382
+ "rewards/accuracies": 0.7124999761581421,
383
+ "rewards/chosen": -0.04292039945721626,
384
+ "rewards/margins": 0.057192735373973846,
385
+ "rewards/rejected": -0.10011313855648041,
386
+ "step": 230
387
+ },
388
+ {
389
+ "epoch": 0.15701668302257116,
390
+ "grad_norm": 0.453125,
391
+ "learning_rate": 4.950771976806769e-06,
392
+ "logits/chosen": -2.599358081817627,
393
+ "logits/rejected": -2.4423699378967285,
394
+ "logps/chosen": -0.8888974189758301,
395
+ "logps/rejected": -1.405906081199646,
396
+ "loss": 1.3718,
397
+ "rewards/accuracies": 0.762499988079071,
398
+ "rewards/chosen": -0.01539047621190548,
399
+ "rewards/margins": 0.09842763096094131,
400
+ "rewards/rejected": -0.11381809413433075,
401
+ "step": 240
402
+ },
403
+ {
404
+ "epoch": 0.16355904481517827,
405
+ "grad_norm": 0.58984375,
406
+ "learning_rate": 4.938853799728112e-06,
407
+ "logits/chosen": -2.4943268299102783,
408
+ "logits/rejected": -2.328460216522217,
409
+ "logps/chosen": -1.0880366563796997,
410
+ "logps/rejected": -1.6129534244537354,
411
+ "loss": 1.3694,
412
+ "rewards/accuracies": 0.699999988079071,
413
+ "rewards/chosen": -0.044886864721775055,
414
+ "rewards/margins": 0.12210796028375626,
415
+ "rewards/rejected": -0.1669948399066925,
416
+ "step": 250
417
+ },
418
+ {
419
+ "epoch": 0.17010140660778542,
420
+ "grad_norm": 0.578125,
421
+ "learning_rate": 4.925662526446431e-06,
422
+ "logits/chosen": -2.386685848236084,
423
+ "logits/rejected": -2.312121868133545,
424
+ "logps/chosen": -1.2352960109710693,
425
+ "logps/rejected": -1.6638290882110596,
426
+ "loss": 1.3658,
427
+ "rewards/accuracies": 0.699999988079071,
428
+ "rewards/chosen": -0.07152961194515228,
429
+ "rewards/margins": 0.11174267530441284,
430
+ "rewards/rejected": -0.18327227234840393,
431
+ "step": 260
432
+ },
433
+ {
434
+ "epoch": 0.17664376840039253,
435
+ "grad_norm": 0.78515625,
436
+ "learning_rate": 4.911205042884912e-06,
437
+ "logits/chosen": -2.4438509941101074,
438
+ "logits/rejected": -2.317202091217041,
439
+ "logps/chosen": -1.4406167268753052,
440
+ "logps/rejected": -1.5704759359359741,
441
+ "loss": 1.3685,
442
+ "rewards/accuracies": 0.6499999761581421,
443
+ "rewards/chosen": -0.13397307693958282,
444
+ "rewards/margins": 0.051210492849349976,
445
+ "rewards/rejected": -0.1851835697889328,
446
+ "step": 270
447
+ },
448
+ {
449
+ "epoch": 0.18318613019299967,
450
+ "grad_norm": 0.3828125,
451
+ "learning_rate": 4.895488895936047e-06,
452
+ "logits/chosen": -2.3865578174591064,
453
+ "logits/rejected": -2.3630199432373047,
454
+ "logps/chosen": -1.1934704780578613,
455
+ "logps/rejected": -1.8741092681884766,
456
+ "loss": 1.362,
457
+ "rewards/accuracies": 0.737500011920929,
458
+ "rewards/chosen": -0.09065061807632446,
459
+ "rewards/margins": 0.14752301573753357,
460
+ "rewards/rejected": -0.23817363381385803,
461
+ "step": 280
462
+ },
463
+ {
464
+ "epoch": 0.18972849198560682,
465
+ "grad_norm": 0.4609375,
466
+ "learning_rate": 4.8785222895221075e-06,
467
+ "logits/chosen": -2.611616611480713,
468
+ "logits/rejected": -2.496882438659668,
469
+ "logps/chosen": -1.1736793518066406,
470
+ "logps/rejected": -1.4841904640197754,
471
+ "loss": 1.3704,
472
+ "rewards/accuracies": 0.6499999761581421,
473
+ "rewards/chosen": -0.09048116952180862,
474
+ "rewards/margins": 0.07914741337299347,
475
+ "rewards/rejected": -0.1696285754442215,
476
+ "step": 290
477
+ },
478
+ {
479
+ "epoch": 0.19627085377821393,
480
+ "grad_norm": 0.4609375,
481
+ "learning_rate": 4.860314080312651e-06,
482
+ "logits/chosen": -2.4393398761749268,
483
+ "logits/rejected": -2.359809398651123,
484
+ "logps/chosen": -1.144270896911621,
485
+ "logps/rejected": -1.7192535400390625,
486
+ "loss": 1.3605,
487
+ "rewards/accuracies": 0.737500011920929,
488
+ "rewards/chosen": -0.07278771698474884,
489
+ "rewards/margins": 0.1271217167377472,
490
+ "rewards/rejected": -0.19990943372249603,
491
+ "step": 300
492
+ },
493
+ {
494
+ "epoch": 0.19627085377821393,
495
+ "eval_logits/chosen": -2.26737117767334,
496
+ "eval_logits/rejected": -2.197129011154175,
497
+ "eval_logps/chosen": -1.1442248821258545,
498
+ "eval_logps/rejected": -1.7477951049804688,
499
+ "eval_loss": 1.3670034408569336,
500
+ "eval_rewards/accuracies": 0.7200000286102295,
501
+ "eval_rewards/chosen": -0.06560814380645752,
502
+ "eval_rewards/margins": 0.1413595825433731,
503
+ "eval_rewards/rejected": -0.20696774125099182,
504
+ "eval_runtime": 192.5244,
505
+ "eval_samples_per_second": 10.388,
506
+ "eval_steps_per_second": 0.519,
507
+ "step": 300
508
+ },
509
+ {
510
+ "epoch": 0.20281321557082108,
511
+ "grad_norm": 0.92578125,
512
+ "learning_rate": 4.840873773101287e-06,
513
+ "logits/chosen": -2.416212558746338,
514
+ "logits/rejected": -2.349287748336792,
515
+ "logps/chosen": -1.2875757217407227,
516
+ "logps/rejected": -1.9493656158447266,
517
+ "loss": 1.3632,
518
+ "rewards/accuracies": 0.7124999761581421,
519
+ "rewards/chosen": -0.09226052463054657,
520
+ "rewards/margins": 0.16571097075939178,
521
+ "rewards/rejected": -0.25797149538993835,
522
+ "step": 310
523
+ },
524
+ {
525
+ "epoch": 0.2093555773634282,
526
+ "grad_norm": 0.5546875,
527
+ "learning_rate": 4.820211515844116e-06,
528
+ "logits/chosen": -2.4312744140625,
529
+ "logits/rejected": -2.428727865219116,
530
+ "logps/chosen": -1.1892143487930298,
531
+ "logps/rejected": -1.7102506160736084,
532
+ "loss": 1.3675,
533
+ "rewards/accuracies": 0.6499999761581421,
534
+ "rewards/chosen": -0.08930884301662445,
535
+ "rewards/margins": 0.11253657191991806,
536
+ "rewards/rejected": -0.2018454372882843,
537
+ "step": 320
538
+ },
539
+ {
540
+ "epoch": 0.21589793915603533,
541
+ "grad_norm": 1.09375,
542
+ "learning_rate": 4.798338094362439e-06,
543
+ "logits/chosen": -2.4368550777435303,
544
+ "logits/rejected": -2.468419075012207,
545
+ "logps/chosen": -1.2478764057159424,
546
+ "logps/rejected": -1.4515265226364136,
547
+ "loss": 1.3713,
548
+ "rewards/accuracies": 0.75,
549
+ "rewards/chosen": -0.09131525456905365,
550
+ "rewards/margins": 0.06188271567225456,
551
+ "rewards/rejected": -0.1531979739665985,
552
+ "step": 330
553
+ },
554
+ {
555
+ "epoch": 0.22244030094864245,
556
+ "grad_norm": 2.1875,
557
+ "learning_rate": 4.775264926712489e-06,
558
+ "logits/chosen": -2.4024834632873535,
559
+ "logits/rejected": -2.3669588565826416,
560
+ "logps/chosen": -1.008465051651001,
561
+ "logps/rejected": -1.6137611865997314,
562
+ "loss": 1.367,
563
+ "rewards/accuracies": 0.699999988079071,
564
+ "rewards/chosen": -0.05111456662416458,
565
+ "rewards/margins": 0.1272425502538681,
566
+ "rewards/rejected": -0.17835713922977448,
567
+ "step": 340
568
+ },
569
+ {
570
+ "epoch": 0.2289826627412496,
571
+ "grad_norm": 0.59375,
572
+ "learning_rate": 4.751004057225147e-06,
573
+ "logits/chosen": -2.4048571586608887,
574
+ "logits/rejected": -2.3667616844177246,
575
+ "logps/chosen": -1.0595513582229614,
576
+ "logps/rejected": -1.6971559524536133,
577
+ "loss": 1.3641,
578
+ "rewards/accuracies": 0.7250000238418579,
579
+ "rewards/chosen": -0.04516686871647835,
580
+ "rewards/margins": 0.1346011608839035,
581
+ "rewards/rejected": -0.17976802587509155,
582
+ "step": 350
583
+ },
584
+ {
585
+ "epoch": 0.23552502453385674,
586
+ "grad_norm": 1.3203125,
587
+ "learning_rate": 4.725568150218719e-06,
588
+ "logits/chosen": -2.3158469200134277,
589
+ "logits/rejected": -2.3420870304107666,
590
+ "logps/chosen": -1.2580420970916748,
591
+ "logps/rejected": -1.8185100555419922,
592
+ "loss": 1.3619,
593
+ "rewards/accuracies": 0.675000011920929,
594
+ "rewards/chosen": -0.09195388108491898,
595
+ "rewards/margins": 0.12128813564777374,
596
+ "rewards/rejected": -0.21324202418327332,
597
+ "step": 360
598
+ },
599
+ {
600
+ "epoch": 0.24206738632646385,
601
+ "grad_norm": 0.625,
602
+ "learning_rate": 4.6989704833880936e-06,
603
+ "logits/chosen": -2.413306474685669,
604
+ "logits/rejected": -2.269974708557129,
605
+ "logps/chosen": -1.261281967163086,
606
+ "logps/rejected": -1.7907254695892334,
607
+ "loss": 1.3625,
608
+ "rewards/accuracies": 0.625,
609
+ "rewards/chosen": -0.09402702748775482,
610
+ "rewards/margins": 0.1201835423707962,
611
+ "rewards/rejected": -0.21421055495738983,
612
+ "step": 370
613
+ },
614
+ {
615
+ "epoch": 0.248609748119071,
616
+ "grad_norm": 1.5078125,
617
+ "learning_rate": 4.671224940873704e-06,
618
+ "logits/chosen": -2.5112624168395996,
619
+ "logits/rejected": -2.4713196754455566,
620
+ "logps/chosen": -1.2645624876022339,
621
+ "logps/rejected": -2.146049976348877,
622
+ "loss": 1.3588,
623
+ "rewards/accuracies": 0.75,
624
+ "rewards/chosen": -0.09442068636417389,
625
+ "rewards/margins": 0.2256404459476471,
626
+ "rewards/rejected": -0.3200611174106598,
627
+ "step": 380
628
+ },
629
+ {
630
+ "epoch": 0.25515210991167814,
631
+ "grad_norm": 0.765625,
632
+ "learning_rate": 4.642346006013925e-06,
633
+ "logits/chosen": -2.540147066116333,
634
+ "logits/rejected": -2.479360580444336,
635
+ "logps/chosen": -1.2912054061889648,
636
+ "logps/rejected": -1.8986918926239014,
637
+ "loss": 1.3605,
638
+ "rewards/accuracies": 0.699999988079071,
639
+ "rewards/chosen": -0.10545520484447479,
640
+ "rewards/margins": 0.16333678364753723,
641
+ "rewards/rejected": -0.26879197359085083,
642
+ "step": 390
643
+ },
644
+ {
645
+ "epoch": 0.2616944717042852,
646
+ "grad_norm": 1.296875,
647
+ "learning_rate": 4.612348753784682e-06,
648
+ "logits/chosen": -2.5153968334198,
649
+ "logits/rejected": -2.3895232677459717,
650
+ "logps/chosen": -1.239985466003418,
651
+ "logps/rejected": -1.7698838710784912,
652
+ "loss": 1.3607,
653
+ "rewards/accuracies": 0.6875,
654
+ "rewards/chosen": -0.0712096244096756,
655
+ "rewards/margins": 0.15390561521053314,
656
+ "rewards/rejected": -0.22511525452136993,
657
+ "step": 400
658
+ },
659
+ {
660
+ "epoch": 0.2616944717042852,
661
+ "eval_logits/chosen": -2.3277456760406494,
662
+ "eval_logits/rejected": -2.260209560394287,
663
+ "eval_logps/chosen": -1.140083909034729,
664
+ "eval_logps/rejected": -1.9083786010742188,
665
+ "eval_loss": 1.363683819770813,
666
+ "eval_rewards/accuracies": 0.6974999904632568,
667
+ "eval_rewards/chosen": -0.06436584144830704,
668
+ "eval_rewards/margins": 0.19077691435813904,
669
+ "eval_rewards/rejected": -0.2551427483558655,
670
+ "eval_runtime": 192.5656,
671
+ "eval_samples_per_second": 10.386,
672
+ "eval_steps_per_second": 0.519,
673
+ "step": 400
674
+ },
675
+ {
676
+ "epoch": 0.26823683349689237,
677
+ "grad_norm": 0.57421875,
678
+ "learning_rate": 4.5812488429302245e-06,
679
+ "logits/chosen": -2.4453935623168945,
680
+ "logits/rejected": -2.369070053100586,
681
+ "logps/chosen": -1.1491725444793701,
682
+ "logps/rejected": -1.6909382343292236,
683
+ "loss": 1.3609,
684
+ "rewards/accuracies": 0.6875,
685
+ "rewards/chosen": -0.06435368955135345,
686
+ "rewards/margins": 0.14920566976070404,
687
+ "rewards/rejected": -0.2135593444108963,
688
+ "step": 410
689
+ },
690
+ {
691
+ "epoch": 0.2747791952894995,
692
+ "grad_norm": 1.0,
693
+ "learning_rate": 4.54906250778917e-06,
694
+ "logits/chosen": -2.410181760787964,
695
+ "logits/rejected": -2.414539337158203,
696
+ "logps/chosen": -1.1408861875534058,
697
+ "logps/rejected": -2.0139617919921875,
698
+ "loss": 1.352,
699
+ "rewards/accuracies": 0.6875,
700
+ "rewards/chosen": -0.06627814471721649,
701
+ "rewards/margins": 0.22981509566307068,
702
+ "rewards/rejected": -0.296093225479126,
703
+ "step": 420
704
+ },
705
+ {
706
+ "epoch": 0.28132155708210665,
707
+ "grad_norm": 0.55859375,
708
+ "learning_rate": 4.515806549820084e-06,
709
+ "logits/chosen": -2.3839383125305176,
710
+ "logits/rejected": -2.2792294025421143,
711
+ "logps/chosen": -1.236488938331604,
712
+ "logps/rejected": -1.9642620086669922,
713
+ "loss": 1.3659,
714
+ "rewards/accuracies": 0.6625000238418579,
715
+ "rewards/chosen": -0.09444905072450638,
716
+ "rewards/margins": 0.17058061063289642,
717
+ "rewards/rejected": -0.2650296688079834,
718
+ "step": 430
719
+ },
720
+ {
721
+ "epoch": 0.2878639188747138,
722
+ "grad_norm": 1.4453125,
723
+ "learning_rate": 4.48149832883101e-06,
724
+ "logits/chosen": -2.4349424839019775,
725
+ "logits/rejected": -2.303603410720825,
726
+ "logps/chosen": -1.3925915956497192,
727
+ "logps/rejected": -2.356269359588623,
728
+ "loss": 1.3655,
729
+ "rewards/accuracies": 0.699999988079071,
730
+ "rewards/chosen": -0.12983675301074982,
731
+ "rewards/margins": 0.2615430951118469,
732
+ "rewards/rejected": -0.39137983322143555,
733
+ "step": 440
734
+ },
735
+ {
736
+ "epoch": 0.2944062806673209,
737
+ "grad_norm": 0.37890625,
738
+ "learning_rate": 4.446155753917559e-06,
739
+ "logits/chosen": -2.427264928817749,
740
+ "logits/rejected": -2.3634347915649414,
741
+ "logps/chosen": -1.017136573791504,
742
+ "logps/rejected": -1.411401629447937,
743
+ "loss": 1.3677,
744
+ "rewards/accuracies": 0.612500011920929,
745
+ "rewards/chosen": -0.04918839782476425,
746
+ "rewards/margins": 0.07388140261173248,
747
+ "rewards/rejected": -0.12306980043649673,
748
+ "step": 450
749
+ },
750
+ {
751
+ "epoch": 0.30094864245992803,
752
+ "grad_norm": 0.70703125,
753
+ "learning_rate": 4.409797274114245e-06,
754
+ "logits/chosen": -2.504045009613037,
755
+ "logits/rejected": -2.3919787406921387,
756
+ "logps/chosen": -1.056522250175476,
757
+ "logps/rejected": -1.699597716331482,
758
+ "loss": 1.3575,
759
+ "rewards/accuracies": 0.800000011920929,
760
+ "rewards/chosen": -0.033085618168115616,
761
+ "rewards/margins": 0.1550474911928177,
762
+ "rewards/rejected": -0.1881330907344818,
763
+ "step": 460
764
+ },
765
+ {
766
+ "epoch": 0.30749100425253517,
767
+ "grad_norm": 1.21875,
768
+ "learning_rate": 4.372441868763981e-06,
769
+ "logits/chosen": -2.3571760654449463,
770
+ "logits/rejected": -2.317436933517456,
771
+ "logps/chosen": -1.1195136308670044,
772
+ "logps/rejected": -2.229264497756958,
773
+ "loss": 1.3748,
774
+ "rewards/accuracies": 0.8125,
775
+ "rewards/chosen": -0.07059238851070404,
776
+ "rewards/margins": 0.27098971605300903,
777
+ "rewards/rejected": -0.34158211946487427,
778
+ "step": 470
779
+ },
780
+ {
781
+ "epoch": 0.3140333660451423,
782
+ "grad_norm": 1.0859375,
783
+ "learning_rate": 4.334109037610757e-06,
784
+ "logits/chosen": -2.4400782585144043,
785
+ "logits/rejected": -2.4532837867736816,
786
+ "logps/chosen": -1.236473798751831,
787
+ "logps/rejected": -1.9734086990356445,
788
+ "loss": 1.3618,
789
+ "rewards/accuracies": 0.6875,
790
+ "rewards/chosen": -0.08498811721801758,
791
+ "rewards/margins": 0.18319377303123474,
792
+ "rewards/rejected": -0.2681818902492523,
793
+ "step": 480
794
+ },
795
+ {
796
+ "epoch": 0.3205757278377494,
797
+ "grad_norm": 0.51953125,
798
+ "learning_rate": 4.294818790620644e-06,
799
+ "logits/chosen": -2.3443775177001953,
800
+ "logits/rejected": -2.243708372116089,
801
+ "logps/chosen": -0.9792125821113586,
802
+ "logps/rejected": -2.1587882041931152,
803
+ "loss": 1.3573,
804
+ "rewards/accuracies": 0.762499988079071,
805
+ "rewards/chosen": -0.030305195599794388,
806
+ "rewards/margins": 0.2875385582447052,
807
+ "rewards/rejected": -0.3178437352180481,
808
+ "step": 490
809
+ },
810
+ {
811
+ "epoch": 0.32711808963035655,
812
+ "grad_norm": 0.8125,
813
+ "learning_rate": 4.2545916375364835e-06,
814
+ "logits/chosen": -2.3785207271575928,
815
+ "logits/rejected": -2.3217720985412598,
816
+ "logps/chosen": -1.1460835933685303,
817
+ "logps/rejected": -2.0593409538269043,
818
+ "loss": 1.3642,
819
+ "rewards/accuracies": 0.7124999761581421,
820
+ "rewards/chosen": -0.06508897244930267,
821
+ "rewards/margins": 0.23457971215248108,
822
+ "rewards/rejected": -0.29966866970062256,
823
+ "step": 500
824
+ },
825
+ {
826
+ "epoch": 0.32711808963035655,
827
+ "eval_logits/chosen": -2.2533767223358154,
828
+ "eval_logits/rejected": -2.184062957763672,
829
+ "eval_logps/chosen": -1.1733927726745605,
830
+ "eval_logps/rejected": -2.094301223754883,
831
+ "eval_loss": 1.3624868392944336,
832
+ "eval_rewards/accuracies": 0.6875,
833
+ "eval_rewards/chosen": -0.07435852289199829,
834
+ "eval_rewards/margins": 0.23656095564365387,
835
+ "eval_rewards/rejected": -0.31091946363449097,
836
+ "eval_runtime": 192.5008,
837
+ "eval_samples_per_second": 10.39,
838
+ "eval_steps_per_second": 0.519,
839
+ "step": 500
840
+ },
841
+ {
842
+ "epoch": 0.3336604514229637,
843
+ "grad_norm": 1.0390625,
844
+ "learning_rate": 4.213448577171676e-06,
845
+ "logits/chosen": -2.344510316848755,
846
+ "logits/rejected": -2.1984267234802246,
847
+ "logps/chosen": -1.0696611404418945,
848
+ "logps/rejected": -1.9805517196655273,
849
+ "loss": 1.3504,
850
+ "rewards/accuracies": 0.699999988079071,
851
+ "rewards/chosen": -0.06186025217175484,
852
+ "rewards/margins": 0.2206924855709076,
853
+ "rewards/rejected": -0.2825527489185333,
854
+ "step": 510
855
+ },
856
+ {
857
+ "epoch": 0.34020281321557083,
858
+ "grad_norm": 1.609375,
859
+ "learning_rate": 4.171411086448674e-06,
860
+ "logits/chosen": -2.3313636779785156,
861
+ "logits/rejected": -2.265958786010742,
862
+ "logps/chosen": -1.5542428493499756,
863
+ "logps/rejected": -2.0602355003356934,
864
+ "loss": 1.369,
865
+ "rewards/accuracies": 0.737500011920929,
866
+ "rewards/chosen": -0.15848460793495178,
867
+ "rewards/margins": 0.12949112057685852,
868
+ "rewards/rejected": -0.2879757285118103,
869
+ "step": 520
870
+ },
871
+ {
872
+ "epoch": 0.346745175008178,
873
+ "grad_norm": 1.5703125,
874
+ "learning_rate": 4.128501109187903e-06,
875
+ "logits/chosen": -2.389492988586426,
876
+ "logits/rejected": -2.257643222808838,
877
+ "logps/chosen": -1.029513955116272,
878
+ "logps/rejected": -2.194140911102295,
879
+ "loss": 1.3562,
880
+ "rewards/accuracies": 0.75,
881
+ "rewards/chosen": -0.043850500136613846,
882
+ "rewards/margins": 0.27712133526802063,
883
+ "rewards/rejected": -0.3209717869758606,
884
+ "step": 530
885
+ },
886
+ {
887
+ "epoch": 0.35328753680078506,
888
+ "grad_norm": 0.59765625,
889
+ "learning_rate": 4.084741044652956e-06,
890
+ "logits/chosen": -2.425668239593506,
891
+ "logits/rejected": -2.2258124351501465,
892
+ "logps/chosen": -1.0867255926132202,
893
+ "logps/rejected": -2.128763437271118,
894
+ "loss": 1.3618,
895
+ "rewards/accuracies": 0.7124999761581421,
896
+ "rewards/chosen": -0.06123261898756027,
897
+ "rewards/margins": 0.2616799473762512,
898
+ "rewards/rejected": -0.3229126036167145,
899
+ "step": 540
900
+ },
901
+ {
902
+ "epoch": 0.3598298985933922,
903
+ "grad_norm": 1.21875,
904
+ "learning_rate": 4.040153735858041e-06,
905
+ "logits/chosen": -2.3242132663726807,
906
+ "logits/rejected": -2.2058465480804443,
907
+ "logps/chosen": -1.2005125284194946,
908
+ "logps/rejected": -1.8793357610702515,
909
+ "loss": 1.3602,
910
+ "rewards/accuracies": 0.7124999761581421,
911
+ "rewards/chosen": -0.08682914078235626,
912
+ "rewards/margins": 0.17635366320610046,
913
+ "rewards/rejected": -0.26318278908729553,
914
+ "step": 550
915
+ },
916
+ {
917
+ "epoch": 0.36637226038599935,
918
+ "grad_norm": 1.015625,
919
+ "learning_rate": 3.9947624576437975e-06,
920
+ "logits/chosen": -2.3776307106018066,
921
+ "logits/rejected": -2.244612455368042,
922
+ "logps/chosen": -1.2981702089309692,
923
+ "logps/rejected": -2.359877824783325,
924
+ "loss": 1.3495,
925
+ "rewards/accuracies": 0.6875,
926
+ "rewards/chosen": -0.12080104649066925,
927
+ "rewards/margins": 0.25152236223220825,
928
+ "rewards/rejected": -0.3723233938217163,
929
+ "step": 560
930
+ },
931
+ {
932
+ "epoch": 0.3729146221786065,
933
+ "grad_norm": 2.03125,
934
+ "learning_rate": 3.948590904527689e-06,
935
+ "logits/chosen": -2.256986141204834,
936
+ "logits/rejected": -2.216099977493286,
937
+ "logps/chosen": -1.3303107023239136,
938
+ "logps/rejected": -2.433795213699341,
939
+ "loss": 1.3479,
940
+ "rewards/accuracies": 0.675000011920929,
941
+ "rewards/chosen": -0.11694888770580292,
942
+ "rewards/margins": 0.29369375109672546,
943
+ "rewards/rejected": -0.4106426239013672,
944
+ "step": 570
945
+ },
946
+ {
947
+ "epoch": 0.37945698397121363,
948
+ "grad_norm": 0.70703125,
949
+ "learning_rate": 3.901663178335318e-06,
950
+ "logits/chosen": -2.440453052520752,
951
+ "logits/rejected": -2.309227228164673,
952
+ "logps/chosen": -1.2955634593963623,
953
+ "logps/rejected": -2.1659696102142334,
954
+ "loss": 1.3523,
955
+ "rewards/accuracies": 0.762499988079071,
956
+ "rewards/chosen": -0.09721510112285614,
957
+ "rewards/margins": 0.2235115021467209,
958
+ "rewards/rejected": -0.32072657346725464,
959
+ "step": 580
960
+ },
961
+ {
962
+ "epoch": 0.3859993457638207,
963
+ "grad_norm": 2.109375,
964
+ "learning_rate": 3.854003775619142e-06,
965
+ "logits/chosen": -2.2439610958099365,
966
+ "logits/rejected": -2.22778582572937,
967
+ "logps/chosen": -1.3622268438339233,
968
+ "logps/rejected": -2.140564441680908,
969
+ "loss": 1.357,
970
+ "rewards/accuracies": 0.699999988079071,
971
+ "rewards/chosen": -0.10588552057743073,
972
+ "rewards/margins": 0.22728273272514343,
973
+ "rewards/rejected": -0.33316826820373535,
974
+ "step": 590
975
+ },
976
+ {
977
+ "epoch": 0.39254170755642787,
978
+ "grad_norm": 1.3828125,
979
+ "learning_rate": 3.805637574871115e-06,
980
+ "logits/chosen": -2.2500593662261963,
981
+ "logits/rejected": -2.1690380573272705,
982
+ "logps/chosen": -1.1624057292938232,
983
+ "logps/rejected": -2.2276828289031982,
984
+ "loss": 1.3489,
985
+ "rewards/accuracies": 0.75,
986
+ "rewards/chosen": -0.07329526543617249,
987
+ "rewards/margins": 0.307359516620636,
988
+ "rewards/rejected": -0.38065481185913086,
989
+ "step": 600
990
+ },
991
+ {
992
+ "epoch": 0.39254170755642787,
993
+ "eval_logits/chosen": -2.103938341140747,
994
+ "eval_logits/rejected": -2.0262529850006104,
995
+ "eval_logps/chosen": -1.2905856370925903,
996
+ "eval_logps/rejected": -2.456772565841675,
997
+ "eval_loss": 1.3648583889007568,
998
+ "eval_rewards/accuracies": 0.6850000023841858,
999
+ "eval_rewards/chosen": -0.10951638966798782,
1000
+ "eval_rewards/margins": 0.3101446032524109,
1001
+ "eval_rewards/rejected": -0.4196610152721405,
1002
+ "eval_runtime": 192.5629,
1003
+ "eval_samples_per_second": 10.386,
1004
+ "eval_steps_per_second": 0.519,
1005
+ "step": 600
1006
+ },
1007
+ {
1008
+ "epoch": 0.399084069349035,
1009
+ "grad_norm": 1.046875,
1010
+ "learning_rate": 3.7565898235359717e-06,
1011
+ "logits/chosen": -2.273394823074341,
1012
+ "logits/rejected": -2.1735503673553467,
1013
+ "logps/chosen": -1.2579439878463745,
1014
+ "logps/rejected": -2.1365649700164795,
1015
+ "loss": 1.3734,
1016
+ "rewards/accuracies": 0.675000011920929,
1017
+ "rewards/chosen": -0.10619225353002548,
1018
+ "rewards/margins": 0.23582622408866882,
1019
+ "rewards/rejected": -0.3420184850692749,
1020
+ "step": 610
1021
+ },
1022
+ {
1023
+ "epoch": 0.40562643114164215,
1024
+ "grad_norm": 0.7578125,
1025
+ "learning_rate": 3.7068861248319127e-06,
1026
+ "logits/chosen": -2.228017807006836,
1027
+ "logits/rejected": -2.165989398956299,
1028
+ "logps/chosen": -1.0932655334472656,
1029
+ "logps/rejected": -2.0412039756774902,
1030
+ "loss": 1.364,
1031
+ "rewards/accuracies": 0.6875,
1032
+ "rewards/chosen": -0.05144476890563965,
1033
+ "rewards/margins": 0.2414330542087555,
1034
+ "rewards/rejected": -0.29287782311439514,
1035
+ "step": 620
1036
+ },
1037
+ {
1038
+ "epoch": 0.41216879293424924,
1039
+ "grad_norm": 0.671875,
1040
+ "learning_rate": 3.6565524243855695e-06,
1041
+ "logits/chosen": -2.2673258781433105,
1042
+ "logits/rejected": -2.1727194786071777,
1043
+ "logps/chosen": -1.0849422216415405,
1044
+ "logps/rejected": -2.769263744354248,
1045
+ "loss": 1.354,
1046
+ "rewards/accuracies": 0.824999988079071,
1047
+ "rewards/chosen": -0.04491913691163063,
1048
+ "rewards/margins": 0.44512462615966797,
1049
+ "rewards/rejected": -0.4900437295436859,
1050
+ "step": 630
1051
+ },
1052
+ {
1053
+ "epoch": 0.4187111547268564,
1054
+ "grad_norm": 0.92578125,
1055
+ "learning_rate": 3.6056149966882325e-06,
1056
+ "logits/chosen": -2.2358603477478027,
1057
+ "logits/rejected": -2.202164888381958,
1058
+ "logps/chosen": -1.0952130556106567,
1059
+ "logps/rejected": -2.2472681999206543,
1060
+ "loss": 1.3624,
1061
+ "rewards/accuracies": 0.6875,
1062
+ "rewards/chosen": -0.06672855466604233,
1063
+ "rewards/margins": 0.2949059009552002,
1064
+ "rewards/rejected": -0.3616344630718231,
1065
+ "step": 640
1066
+ },
1067
+ {
1068
+ "epoch": 0.4252535165194635,
1069
+ "grad_norm": 0.6953125,
1070
+ "learning_rate": 3.554100431380414e-06,
1071
+ "logits/chosen": -2.2916007041931152,
1072
+ "logits/rejected": -2.186967134475708,
1073
+ "logps/chosen": -1.1993496417999268,
1074
+ "logps/rejected": -2.332792043685913,
1075
+ "loss": 1.3578,
1076
+ "rewards/accuracies": 0.7875000238418579,
1077
+ "rewards/chosen": -0.07057426869869232,
1078
+ "rewards/margins": 0.2939244210720062,
1079
+ "rewards/rejected": -0.36449870467185974,
1080
+ "step": 650
1081
+ },
1082
+ {
1083
+ "epoch": 0.43179587831207067,
1084
+ "grad_norm": 0.875,
1085
+ "learning_rate": 3.5020356193718934e-06,
1086
+ "logits/chosen": -2.22035813331604,
1087
+ "logits/rejected": -2.168062686920166,
1088
+ "logps/chosen": -1.1497726440429688,
1089
+ "logps/rejected": -2.155555248260498,
1090
+ "loss": 1.3671,
1091
+ "rewards/accuracies": 0.7124999761581421,
1092
+ "rewards/chosen": -0.07245108485221863,
1093
+ "rewards/margins": 0.2514120042324066,
1094
+ "rewards/rejected": -0.32386311888694763,
1095
+ "step": 660
1096
+ },
1097
+ {
1098
+ "epoch": 0.4383382401046778,
1099
+ "grad_norm": 1.0390625,
1100
+ "learning_rate": 3.4494477388045035e-06,
1101
+ "logits/chosen": -2.310704469680786,
1102
+ "logits/rejected": -2.0709216594696045,
1103
+ "logps/chosen": -1.172662377357483,
1104
+ "logps/rejected": -1.904979944229126,
1105
+ "loss": 1.3603,
1106
+ "rewards/accuracies": 0.75,
1107
+ "rewards/chosen": -0.05323634669184685,
1108
+ "rewards/margins": 0.19738993048667908,
1109
+ "rewards/rejected": -0.25062626600265503,
1110
+ "step": 670
1111
+ },
1112
+ {
1113
+ "epoch": 0.4448806018972849,
1114
+ "grad_norm": 2.875,
1115
+ "learning_rate": 3.3963642408649783e-06,
1116
+ "logits/chosen": -2.2430570125579834,
1117
+ "logits/rejected": -2.1614744663238525,
1118
+ "logps/chosen": -1.2116143703460693,
1119
+ "logps/rejected": -2.23248028755188,
1120
+ "loss": 1.354,
1121
+ "rewards/accuracies": 0.7250000238418579,
1122
+ "rewards/chosen": -0.07912047952413559,
1123
+ "rewards/margins": 0.2703726291656494,
1124
+ "rewards/rejected": -0.3494931161403656,
1125
+ "step": 680
1126
+ },
1127
+ {
1128
+ "epoch": 0.45142296368989204,
1129
+ "grad_norm": 1.3671875,
1130
+ "learning_rate": 3.3428128354552727e-06,
1131
+ "logits/chosen": -2.129528522491455,
1132
+ "logits/rejected": -2.0921757221221924,
1133
+ "logps/chosen": -1.3085339069366455,
1134
+ "logps/rejected": -2.421966314315796,
1135
+ "loss": 1.3511,
1136
+ "rewards/accuracies": 0.699999988079071,
1137
+ "rewards/chosen": -0.10610251128673553,
1138
+ "rewards/margins": 0.3096415400505066,
1139
+ "rewards/rejected": -0.4157440662384033,
1140
+ "step": 690
1141
+ },
1142
+ {
1143
+ "epoch": 0.4579653254824992,
1144
+ "grad_norm": 3.203125,
1145
+ "learning_rate": 3.2888214767278246e-06,
1146
+ "logits/chosen": -2.199361562728882,
1147
+ "logits/rejected": -2.0280404090881348,
1148
+ "logps/chosen": -1.3939224481582642,
1149
+ "logps/rejected": -2.0829360485076904,
1150
+ "loss": 1.3735,
1151
+ "rewards/accuracies": 0.7124999761581421,
1152
+ "rewards/chosen": -0.14213696122169495,
1153
+ "rewards/margins": 0.17512670159339905,
1154
+ "rewards/rejected": -0.3172636330127716,
1155
+ "step": 700
1156
+ },
1157
+ {
1158
+ "epoch": 0.4579653254824992,
1159
+ "eval_logits/chosen": -2.015493392944336,
1160
+ "eval_logits/rejected": -1.9237298965454102,
1161
+ "eval_logps/chosen": -1.274337887763977,
1162
+ "eval_logps/rejected": -2.438903570175171,
1163
+ "eval_loss": 1.365309715270996,
1164
+ "eval_rewards/accuracies": 0.699999988079071,
1165
+ "eval_rewards/chosen": -0.10464204847812653,
1166
+ "eval_rewards/margins": 0.3096581697463989,
1167
+ "eval_rewards/rejected": -0.41430023312568665,
1168
+ "eval_runtime": 192.5749,
1169
+ "eval_samples_per_second": 10.386,
1170
+ "eval_steps_per_second": 0.519,
1171
+ "step": 700
1172
+ },
1173
+ {
1174
+ "epoch": 0.46450768727510633,
1175
+ "grad_norm": 0.6640625,
1176
+ "learning_rate": 3.2344183484933247e-06,
1177
+ "logits/chosen": -2.1935503482818604,
1178
+ "logits/rejected": -2.140709400177002,
1179
+ "logps/chosen": -1.2663156986236572,
1180
+ "logps/rejected": -2.3020918369293213,
1181
+ "loss": 1.3563,
1182
+ "rewards/accuracies": 0.7875000238418579,
1183
+ "rewards/chosen": -0.09797848761081696,
1184
+ "rewards/margins": 0.29972079396247864,
1185
+ "rewards/rejected": -0.3976992964744568,
1186
+ "step": 710
1187
+ },
1188
+ {
1189
+ "epoch": 0.47105004906771347,
1190
+ "grad_norm": 0.69921875,
1191
+ "learning_rate": 3.179631849508597e-06,
1192
+ "logits/chosen": -2.1977057456970215,
1193
+ "logits/rejected": -2.2053167819976807,
1194
+ "logps/chosen": -1.2915914058685303,
1195
+ "logps/rejected": -1.9569549560546875,
1196
+ "loss": 1.3619,
1197
+ "rewards/accuracies": 0.675000011920929,
1198
+ "rewards/chosen": -0.10860061645507812,
1199
+ "rewards/margins": 0.14782245457172394,
1200
+ "rewards/rejected": -0.25642308592796326,
1201
+ "step": 720
1202
+ },
1203
+ {
1204
+ "epoch": 0.47759241086032056,
1205
+ "grad_norm": 0.67578125,
1206
+ "learning_rate": 3.1244905786522796e-06,
1207
+ "logits/chosen": -2.2671637535095215,
1208
+ "logits/rejected": -2.1876721382141113,
1209
+ "logps/chosen": -1.1692320108413696,
1210
+ "logps/rejected": -1.852294683456421,
1211
+ "loss": 1.3602,
1212
+ "rewards/accuracies": 0.800000011920929,
1213
+ "rewards/chosen": -0.07541967183351517,
1214
+ "rewards/margins": 0.19343288242816925,
1215
+ "rewards/rejected": -0.268852561712265,
1216
+ "step": 730
1217
+ },
1218
+ {
1219
+ "epoch": 0.4841347726529277,
1220
+ "grad_norm": 2.296875,
1221
+ "learning_rate": 3.0690233199960393e-06,
1222
+ "logits/chosen": -2.04817271232605,
1223
+ "logits/rejected": -1.990860939025879,
1224
+ "logps/chosen": -1.218370795249939,
1225
+ "logps/rejected": -1.9844430685043335,
1226
+ "loss": 1.3607,
1227
+ "rewards/accuracies": 0.762499988079071,
1228
+ "rewards/chosen": -0.0848841518163681,
1229
+ "rewards/margins": 0.20134691894054413,
1230
+ "rewards/rejected": -0.2862311005592346,
1231
+ "step": 740
1232
+ },
1233
+ {
1234
+ "epoch": 0.49067713444553485,
1235
+ "grad_norm": 0.92578125,
1236
+ "learning_rate": 3.0132590277791163e-06,
1237
+ "logits/chosen": -2.258011817932129,
1238
+ "logits/rejected": -2.1347296237945557,
1239
+ "logps/chosen": -1.089874029159546,
1240
+ "logps/rejected": -2.269296646118164,
1241
+ "loss": 1.3491,
1242
+ "rewards/accuracies": 0.7875000238418579,
1243
+ "rewards/chosen": -0.06120484322309494,
1244
+ "rewards/margins": 0.3166188597679138,
1245
+ "rewards/rejected": -0.37782374024391174,
1246
+ "step": 750
1247
+ },
1248
+ {
1249
+ "epoch": 0.497219496238142,
1250
+ "grad_norm": 3.25,
1251
+ "learning_rate": 2.9572268112940354e-06,
1252
+ "logits/chosen": -2.220092296600342,
1253
+ "logits/rejected": -2.2047886848449707,
1254
+ "logps/chosen": -1.5497848987579346,
1255
+ "logps/rejected": -2.784870147705078,
1256
+ "loss": 1.3614,
1257
+ "rewards/accuracies": 0.675000011920929,
1258
+ "rewards/chosen": -0.18973752856254578,
1259
+ "rewards/margins": 0.32363444566726685,
1260
+ "rewards/rejected": -0.5133720636367798,
1261
+ "step": 760
1262
+ },
1263
+ {
1264
+ "epoch": 0.5037618580307491,
1265
+ "grad_norm": 0.984375,
1266
+ "learning_rate": 2.9009559196913882e-06,
1267
+ "logits/chosen": -2.261192798614502,
1268
+ "logits/rejected": -2.1291980743408203,
1269
+ "logps/chosen": -1.2792052030563354,
1270
+ "logps/rejected": -2.2746200561523438,
1271
+ "loss": 1.3559,
1272
+ "rewards/accuracies": 0.7124999761581421,
1273
+ "rewards/chosen": -0.08147454261779785,
1274
+ "rewards/margins": 0.27496951818466187,
1275
+ "rewards/rejected": -0.3564440906047821,
1276
+ "step": 770
1277
+ },
1278
+ {
1279
+ "epoch": 0.5103042198233563,
1280
+ "grad_norm": 0.74609375,
1281
+ "learning_rate": 2.844475726711595e-06,
1282
+ "logits/chosen": -2.2254199981689453,
1283
+ "logits/rejected": -2.0712227821350098,
1284
+ "logps/chosen": -1.2435195446014404,
1285
+ "logps/rejected": -2.187425136566162,
1286
+ "loss": 1.3536,
1287
+ "rewards/accuracies": 0.6625000238418579,
1288
+ "rewards/chosen": -0.09367978572845459,
1289
+ "rewards/margins": 0.2134389877319336,
1290
+ "rewards/rejected": -0.30711880326271057,
1291
+ "step": 780
1292
+ },
1293
+ {
1294
+ "epoch": 0.5168465816159633,
1295
+ "grad_norm": 1.703125,
1296
+ "learning_rate": 2.7878157153516446e-06,
1297
+ "logits/chosen": -2.119149684906006,
1298
+ "logits/rejected": -2.021897554397583,
1299
+ "logps/chosen": -1.083522081375122,
1300
+ "logps/rejected": -2.1679904460906982,
1301
+ "loss": 1.3483,
1302
+ "rewards/accuracies": 0.762499988079071,
1303
+ "rewards/chosen": -0.054320644587278366,
1304
+ "rewards/margins": 0.27394142746925354,
1305
+ "rewards/rejected": -0.328262060880661,
1306
+ "step": 790
1307
+ },
1308
+ {
1309
+ "epoch": 0.5233889434085705,
1310
+ "grad_norm": 1.0625,
1311
+ "learning_rate": 2.731005462474787e-06,
1312
+ "logits/chosen": -2.2120959758758545,
1313
+ "logits/rejected": -2.121436595916748,
1314
+ "logps/chosen": -1.0082324743270874,
1315
+ "logps/rejected": -2.022320508956909,
1316
+ "loss": 1.3606,
1317
+ "rewards/accuracies": 0.7250000238418579,
1318
+ "rewards/chosen": -0.05436345189809799,
1319
+ "rewards/margins": 0.24595093727111816,
1320
+ "rewards/rejected": -0.30031436681747437,
1321
+ "step": 800
1322
+ },
1323
+ {
1324
+ "epoch": 0.5233889434085705,
1325
+ "eval_logits/chosen": -2.035632371902466,
1326
+ "eval_logits/rejected": -1.9493231773376465,
1327
+ "eval_logps/chosen": -1.1739249229431152,
1328
+ "eval_logps/rejected": -2.291534900665283,
1329
+ "eval_loss": 1.3592066764831543,
1330
+ "eval_rewards/accuracies": 0.6949999928474426,
1331
+ "eval_rewards/chosen": -0.074518121778965,
1332
+ "eval_rewards/margins": 0.2955714762210846,
1333
+ "eval_rewards/rejected": -0.3700896203517914,
1334
+ "eval_runtime": 192.6116,
1335
+ "eval_samples_per_second": 10.384,
1336
+ "eval_steps_per_second": 0.519,
1337
+ "step": 800
1338
+ },
1339
+ {
1340
+ "epoch": 0.5299313052011776,
1341
+ "grad_norm": 1.3671875,
1342
+ "learning_rate": 2.67407462337124e-06,
1343
+ "logits/chosen": -2.2169487476348877,
1344
+ "logits/rejected": -2.0953001976013184,
1345
+ "logps/chosen": -1.4013360738754272,
1346
+ "logps/rejected": -2.4634041786193848,
1347
+ "loss": 1.3603,
1348
+ "rewards/accuracies": 0.699999988079071,
1349
+ "rewards/chosen": -0.1341012716293335,
1350
+ "rewards/margins": 0.2964847683906555,
1351
+ "rewards/rejected": -0.430586040019989,
1352
+ "step": 810
1353
+ },
1354
+ {
1355
+ "epoch": 0.5364736669937847,
1356
+ "grad_norm": 1.9140625,
1357
+ "learning_rate": 2.617052916277952e-06,
1358
+ "logits/chosen": -2.237308979034424,
1359
+ "logits/rejected": -2.043175458908081,
1360
+ "logps/chosen": -1.1022270917892456,
1361
+ "logps/rejected": -2.4109721183776855,
1362
+ "loss": 1.3607,
1363
+ "rewards/accuracies": 0.75,
1364
+ "rewards/chosen": -0.0516006276011467,
1365
+ "rewards/margins": 0.34269508719444275,
1366
+ "rewards/rejected": -0.39429575204849243,
1367
+ "step": 820
1368
+ },
1369
+ {
1370
+ "epoch": 0.5430160287863919,
1371
+ "grad_norm": 1.3828125,
1372
+ "learning_rate": 2.5599701068654985e-06,
1373
+ "logits/chosen": -2.218151092529297,
1374
+ "logits/rejected": -2.1844322681427,
1375
+ "logps/chosen": -1.1159567832946777,
1376
+ "logps/rejected": -2.1058316230773926,
1377
+ "loss": 1.355,
1378
+ "rewards/accuracies": 0.699999988079071,
1379
+ "rewards/chosen": -0.053866881877183914,
1380
+ "rewards/margins": 0.2691417634487152,
1381
+ "rewards/rejected": -0.32300865650177,
1382
+ "step": 830
1383
+ },
1384
+ {
1385
+ "epoch": 0.549558390578999,
1386
+ "grad_norm": 0.875,
1387
+ "learning_rate": 2.5028559927002326e-06,
1388
+ "logits/chosen": -2.2796895503997803,
1389
+ "logits/rejected": -2.186133861541748,
1390
+ "logps/chosen": -0.9579225778579712,
1391
+ "logps/rejected": -1.9913718700408936,
1392
+ "loss": 1.3578,
1393
+ "rewards/accuracies": 0.6875,
1394
+ "rewards/chosen": -0.025524040684103966,
1395
+ "rewards/margins": 0.27697938680648804,
1396
+ "rewards/rejected": -0.30250340700149536,
1397
+ "step": 840
1398
+ },
1399
+ {
1400
+ "epoch": 0.5561007523716062,
1401
+ "grad_norm": 0.796875,
1402
+ "learning_rate": 2.4457403876897756e-06,
1403
+ "logits/chosen": -2.087404251098633,
1404
+ "logits/rejected": -2.0218749046325684,
1405
+ "logps/chosen": -1.2815109491348267,
1406
+ "logps/rejected": -1.9342288970947266,
1407
+ "loss": 1.3532,
1408
+ "rewards/accuracies": 0.6499999761581421,
1409
+ "rewards/chosen": -0.09889832884073257,
1410
+ "rewards/margins": 0.20040933787822723,
1411
+ "rewards/rejected": -0.2993076741695404,
1412
+ "step": 850
1413
+ },
1414
+ {
1415
+ "epoch": 0.5626431141642133,
1416
+ "grad_norm": 0.9609375,
1417
+ "learning_rate": 2.388653106519975e-06,
1418
+ "logits/chosen": -2.2493090629577637,
1419
+ "logits/rejected": -2.053457498550415,
1420
+ "logps/chosen": -1.3042584657669067,
1421
+ "logps/rejected": -2.256798028945923,
1422
+ "loss": 1.3593,
1423
+ "rewards/accuracies": 0.699999988079071,
1424
+ "rewards/chosen": -0.10398749262094498,
1425
+ "rewards/margins": 0.2570543885231018,
1426
+ "rewards/rejected": -0.36104193329811096,
1427
+ "step": 860
1428
+ },
1429
+ {
1430
+ "epoch": 0.5691854759568205,
1431
+ "grad_norm": 2.09375,
1432
+ "learning_rate": 2.331623949091467e-06,
1433
+ "logits/chosen": -2.2236335277557373,
1434
+ "logits/rejected": -2.198726177215576,
1435
+ "logps/chosen": -1.1993200778961182,
1436
+ "logps/rejected": -1.8928115367889404,
1437
+ "loss": 1.3601,
1438
+ "rewards/accuracies": 0.699999988079071,
1439
+ "rewards/chosen": -0.09070239961147308,
1440
+ "rewards/margins": 0.17054910957813263,
1441
+ "rewards/rejected": -0.2612515091896057,
1442
+ "step": 870
1443
+ },
1444
+ {
1445
+ "epoch": 0.5757278377494276,
1446
+ "grad_norm": 1.296875,
1447
+ "learning_rate": 2.2746826849639513e-06,
1448
+ "logits/chosen": -2.2221107482910156,
1449
+ "logits/rejected": -2.101364850997925,
1450
+ "logps/chosen": -1.217944860458374,
1451
+ "logps/rejected": -2.0411202907562256,
1452
+ "loss": 1.3569,
1453
+ "rewards/accuracies": 0.7250000238418579,
1454
+ "rewards/chosen": -0.08954662829637527,
1455
+ "rewards/margins": 0.22928118705749512,
1456
+ "rewards/rejected": -0.3188278079032898,
1457
+ "step": 880
1458
+ },
1459
+ {
1460
+ "epoch": 0.5822701995420346,
1461
+ "grad_norm": 1.1015625,
1462
+ "learning_rate": 2.2178590378162957e-06,
1463
+ "logits/chosen": -2.2392585277557373,
1464
+ "logits/rejected": -2.123332977294922,
1465
+ "logps/chosen": -1.3043301105499268,
1466
+ "logps/rejected": -2.3116343021392822,
1467
+ "loss": 1.361,
1468
+ "rewards/accuracies": 0.675000011920929,
1469
+ "rewards/chosen": -0.10152602195739746,
1470
+ "rewards/margins": 0.2721943259239197,
1471
+ "rewards/rejected": -0.37372034788131714,
1472
+ "step": 890
1473
+ },
1474
+ {
1475
+ "epoch": 0.5888125613346418,
1476
+ "grad_norm": 1.7109375,
1477
+ "learning_rate": 2.1611826699306104e-06,
1478
+ "logits/chosen": -2.129063844680786,
1479
+ "logits/rejected": -2.0367393493652344,
1480
+ "logps/chosen": -1.134124755859375,
1481
+ "logps/rejected": -2.7047417163848877,
1482
+ "loss": 1.3462,
1483
+ "rewards/accuracies": 0.675000011920929,
1484
+ "rewards/chosen": -0.05923975631594658,
1485
+ "rewards/margins": 0.4080350399017334,
1486
+ "rewards/rejected": -0.46727481484413147,
1487
+ "step": 900
1488
+ },
1489
+ {
1490
+ "epoch": 0.5888125613346418,
1491
+ "eval_logits/chosen": -2.060875415802002,
1492
+ "eval_logits/rejected": -1.9784667491912842,
1493
+ "eval_logps/chosen": -1.210047960281372,
1494
+ "eval_logps/rejected": -2.2806875705718994,
1495
+ "eval_loss": 1.3567863702774048,
1496
+ "eval_rewards/accuracies": 0.7049999833106995,
1497
+ "eval_rewards/chosen": -0.08535508811473846,
1498
+ "eval_rewards/margins": 0.2814803123474121,
1499
+ "eval_rewards/rejected": -0.36683544516563416,
1500
+ "eval_runtime": 192.5258,
1501
+ "eval_samples_per_second": 10.388,
1502
+ "eval_steps_per_second": 0.519,
1503
+ "step": 900
1504
+ },
1505
+ {
1506
+ "epoch": 0.5953549231272489,
1507
+ "grad_norm": 1.2109375,
1508
+ "learning_rate": 2.1046831667083483e-06,
1509
+ "logits/chosen": -2.20284104347229,
1510
+ "logits/rejected": -2.077505111694336,
1511
+ "logps/chosen": -1.4609758853912354,
1512
+ "logps/rejected": -2.2469534873962402,
1513
+ "loss": 1.3637,
1514
+ "rewards/accuracies": 0.7749999761581421,
1515
+ "rewards/chosen": -0.123102568089962,
1516
+ "rewards/margins": 0.23405058681964874,
1517
+ "rewards/rejected": -0.35715317726135254,
1518
+ "step": 910
1519
+ },
1520
+ {
1521
+ "epoch": 0.6018972849198561,
1522
+ "grad_norm": 1.25,
1523
+ "learning_rate": 2.048390021226559e-06,
1524
+ "logits/chosen": -2.1845784187316895,
1525
+ "logits/rejected": -2.092951536178589,
1526
+ "logps/chosen": -1.1394016742706299,
1527
+ "logps/rejected": -2.110645055770874,
1528
+ "loss": 1.3573,
1529
+ "rewards/accuracies": 0.7250000238418579,
1530
+ "rewards/chosen": -0.05954326316714287,
1531
+ "rewards/margins": 0.2537994980812073,
1532
+ "rewards/rejected": -0.31334277987480164,
1533
+ "step": 920
1534
+ },
1535
+ {
1536
+ "epoch": 0.6084396467124632,
1537
+ "grad_norm": 1.828125,
1538
+ "learning_rate": 1.9923326188423212e-06,
1539
+ "logits/chosen": -2.264650821685791,
1540
+ "logits/rejected": -2.1612370014190674,
1541
+ "logps/chosen": -1.2139930725097656,
1542
+ "logps/rejected": -1.9334100484848022,
1543
+ "loss": 1.3628,
1544
+ "rewards/accuracies": 0.675000011920929,
1545
+ "rewards/chosen": -0.07819408178329468,
1546
+ "rewards/margins": 0.19942335784435272,
1547
+ "rewards/rejected": -0.2776174545288086,
1548
+ "step": 930
1549
+ },
1550
+ {
1551
+ "epoch": 0.6149820085050703,
1552
+ "grad_norm": 1.25,
1553
+ "learning_rate": 1.936540221853415e-06,
1554
+ "logits/chosen": -2.1965224742889404,
1555
+ "logits/rejected": -2.0824341773986816,
1556
+ "logps/chosen": -1.2189127206802368,
1557
+ "logps/rejected": -2.891991376876831,
1558
+ "loss": 1.3409,
1559
+ "rewards/accuracies": 0.75,
1560
+ "rewards/chosen": -0.08479885011911392,
1561
+ "rewards/margins": 0.45503416657447815,
1562
+ "rewards/rejected": -0.5398330092430115,
1563
+ "step": 940
1564
+ },
1565
+ {
1566
+ "epoch": 0.6215243702976775,
1567
+ "grad_norm": 1.1796875,
1568
+ "learning_rate": 1.8810419542232245e-06,
1569
+ "logits/chosen": -2.2407994270324707,
1570
+ "logits/rejected": -2.135453939437866,
1571
+ "logps/chosen": -1.1888017654418945,
1572
+ "logps/rejected": -1.9870593547821045,
1573
+ "loss": 1.3479,
1574
+ "rewards/accuracies": 0.75,
1575
+ "rewards/chosen": -0.0872371569275856,
1576
+ "rewards/margins": 0.1756531447172165,
1577
+ "rewards/rejected": -0.2628903090953827,
1578
+ "step": 950
1579
+ },
1580
+ {
1581
+ "epoch": 0.6280667320902846,
1582
+ "grad_norm": 1.5546875,
1583
+ "learning_rate": 1.8258667863778573e-06,
1584
+ "logits/chosen": -2.12614107131958,
1585
+ "logits/rejected": -2.054868221282959,
1586
+ "logps/chosen": -1.2383328676223755,
1587
+ "logps/rejected": -2.1294407844543457,
1588
+ "loss": 1.3472,
1589
+ "rewards/accuracies": 0.6625000238418579,
1590
+ "rewards/chosen": -0.10099540650844574,
1591
+ "rewards/margins": 0.2554927170276642,
1592
+ "rewards/rejected": -0.3564881384372711,
1593
+ "step": 960
1594
+ },
1595
+ {
1596
+ "epoch": 0.6346090938828918,
1597
+ "grad_norm": 0.64453125,
1598
+ "learning_rate": 1.7710435200834126e-06,
1599
+ "logits/chosen": -2.1406524181365967,
1600
+ "logits/rejected": -2.126784086227417,
1601
+ "logps/chosen": -1.0479339361190796,
1602
+ "logps/rejected": -2.2236762046813965,
1603
+ "loss": 1.3411,
1604
+ "rewards/accuracies": 0.8125,
1605
+ "rewards/chosen": -0.047475624829530716,
1606
+ "rewards/margins": 0.30173081159591675,
1607
+ "rewards/rejected": -0.34920641779899597,
1608
+ "step": 970
1609
+ },
1610
+ {
1611
+ "epoch": 0.6411514556754988,
1612
+ "grad_norm": 3.921875,
1613
+ "learning_rate": 1.7166007734112808e-06,
1614
+ "logits/chosen": -2.194304943084717,
1615
+ "logits/rejected": -2.0795140266418457,
1616
+ "logps/chosen": -1.2003707885742188,
1617
+ "logps/rejected": -2.1898512840270996,
1618
+ "loss": 1.3494,
1619
+ "rewards/accuracies": 0.7250000238418579,
1620
+ "rewards/chosen": -0.07285211980342865,
1621
+ "rewards/margins": 0.28994783759117126,
1622
+ "rewards/rejected": -0.3627999424934387,
1623
+ "step": 980
1624
+ },
1625
+ {
1626
+ "epoch": 0.647693817468106,
1627
+ "grad_norm": 1.40625,
1628
+ "learning_rate": 1.6625669657993483e-06,
1629
+ "logits/chosen": -2.308607578277588,
1630
+ "logits/rejected": -2.228633403778076,
1631
+ "logps/chosen": -1.0984961986541748,
1632
+ "logps/rejected": -2.0189504623413086,
1633
+ "loss": 1.3514,
1634
+ "rewards/accuracies": 0.762499988079071,
1635
+ "rewards/chosen": -0.06574535369873047,
1636
+ "rewards/margins": 0.24618005752563477,
1637
+ "rewards/rejected": -0.31192541122436523,
1638
+ "step": 990
1639
+ },
1640
+ {
1641
+ "epoch": 0.6542361792607131,
1642
+ "grad_norm": 1.0859375,
1643
+ "learning_rate": 1.6089703032168736e-06,
1644
+ "logits/chosen": -2.1339077949523926,
1645
+ "logits/rejected": -2.12742018699646,
1646
+ "logps/chosen": -1.3830599784851074,
1647
+ "logps/rejected": -2.1155436038970947,
1648
+ "loss": 1.3527,
1649
+ "rewards/accuracies": 0.699999988079071,
1650
+ "rewards/chosen": -0.12206210941076279,
1651
+ "rewards/margins": 0.2249002456665039,
1652
+ "rewards/rejected": -0.3469623625278473,
1653
+ "step": 1000
1654
+ },
1655
+ {
1656
+ "epoch": 0.6542361792607131,
1657
+ "eval_logits/chosen": -2.0770561695098877,
1658
+ "eval_logits/rejected": -1.9978020191192627,
1659
+ "eval_logps/chosen": -1.134178638458252,
1660
+ "eval_logps/rejected": -2.2290825843811035,
1661
+ "eval_loss": 1.3548479080200195,
1662
+ "eval_rewards/accuracies": 0.7049999833106995,
1663
+ "eval_rewards/chosen": -0.06259430944919586,
1664
+ "eval_rewards/margins": 0.28875967860221863,
1665
+ "eval_rewards/rejected": -0.3513540029525757,
1666
+ "eval_runtime": 192.5548,
1667
+ "eval_samples_per_second": 10.387,
1668
+ "eval_steps_per_second": 0.519,
1669
+ "step": 1000
1670
+ },
1671
+ {
1672
+ "epoch": 0.6607785410533202,
1673
+ "grad_norm": 2.125,
1674
+ "learning_rate": 1.55583876344081e-06,
1675
+ "logits/chosen": -2.131939649581909,
1676
+ "logits/rejected": -1.9190185070037842,
1677
+ "logps/chosen": -1.2219233512878418,
1678
+ "logps/rejected": -2.523764133453369,
1679
+ "loss": 1.351,
1680
+ "rewards/accuracies": 0.699999988079071,
1681
+ "rewards/chosen": -0.0856153592467308,
1682
+ "rewards/margins": 0.3270259499549866,
1683
+ "rewards/rejected": -0.4126412868499756,
1684
+ "step": 1010
1685
+ },
1686
+ {
1687
+ "epoch": 0.6673209028459274,
1688
+ "grad_norm": 1.234375,
1689
+ "learning_rate": 1.5032000814512372e-06,
1690
+ "logits/chosen": -2.2159934043884277,
1691
+ "logits/rejected": -2.1700668334960938,
1692
+ "logps/chosen": -1.3070309162139893,
1693
+ "logps/rejected": -2.2616512775421143,
1694
+ "loss": 1.3547,
1695
+ "rewards/accuracies": 0.675000011920929,
1696
+ "rewards/chosen": -0.11680419743061066,
1697
+ "rewards/margins": 0.2500377595424652,
1698
+ "rewards/rejected": -0.36684197187423706,
1699
+ "step": 1020
1700
+ },
1701
+ {
1702
+ "epoch": 0.6738632646385345,
1703
+ "grad_norm": 0.8125,
1704
+ "learning_rate": 1.4510817349535323e-06,
1705
+ "logits/chosen": -2.184436082839966,
1706
+ "logits/rejected": -2.1190972328186035,
1707
+ "logps/chosen": -1.2869107723236084,
1708
+ "logps/rejected": -2.4080114364624023,
1709
+ "loss": 1.3565,
1710
+ "rewards/accuracies": 0.699999988079071,
1711
+ "rewards/chosen": -0.09363288432359695,
1712
+ "rewards/margins": 0.29534098505973816,
1713
+ "rewards/rejected": -0.3889738917350769,
1714
+ "step": 1030
1715
+ },
1716
+ {
1717
+ "epoch": 0.6804056264311417,
1718
+ "grad_norm": 1.5234375,
1719
+ "learning_rate": 1.3995109300348537e-06,
1720
+ "logits/chosen": -2.2407493591308594,
1721
+ "logits/rejected": -2.1420841217041016,
1722
+ "logps/chosen": -1.2200353145599365,
1723
+ "logps/rejected": -2.453103542327881,
1724
+ "loss": 1.3542,
1725
+ "rewards/accuracies": 0.6499999761581421,
1726
+ "rewards/chosen": -0.08661921322345734,
1727
+ "rewards/margins": 0.32051774859428406,
1728
+ "rewards/rejected": -0.4071369767189026,
1729
+ "step": 1040
1730
+ },
1731
+ {
1732
+ "epoch": 0.6869479882237488,
1733
+ "grad_norm": 0.97265625,
1734
+ "learning_rate": 1.348514586962389e-06,
1735
+ "logits/chosen": -2.22872257232666,
1736
+ "logits/rejected": -2.131127119064331,
1737
+ "logps/chosen": -1.181624174118042,
1738
+ "logps/rejected": -2.3231348991394043,
1739
+ "loss": 1.3554,
1740
+ "rewards/accuracies": 0.762499988079071,
1741
+ "rewards/chosen": -0.061110030859708786,
1742
+ "rewards/margins": 0.31904760003089905,
1743
+ "rewards/rejected": -0.38015761971473694,
1744
+ "step": 1050
1745
+ },
1746
+ {
1747
+ "epoch": 0.693490350016356,
1748
+ "grad_norm": 0.78125,
1749
+ "learning_rate": 1.2981193261308284e-06,
1750
+ "logits/chosen": -2.1978235244750977,
1751
+ "logits/rejected": -2.145724058151245,
1752
+ "logps/chosen": -1.0791642665863037,
1753
+ "logps/rejected": -2.3014321327209473,
1754
+ "loss": 1.3478,
1755
+ "rewards/accuracies": 0.675000011920929,
1756
+ "rewards/chosen": -0.05117698386311531,
1757
+ "rewards/margins": 0.3088831603527069,
1758
+ "rewards/rejected": -0.3600601851940155,
1759
+ "step": 1060
1760
+ },
1761
+ {
1762
+ "epoch": 0.700032711808963,
1763
+ "grad_norm": 2.78125,
1764
+ "learning_rate": 1.2483514541663501e-06,
1765
+ "logits/chosen": -2.2404794692993164,
1766
+ "logits/rejected": -2.0921406745910645,
1767
+ "logps/chosen": -1.1699206829071045,
1768
+ "logps/rejected": -2.083617687225342,
1769
+ "loss": 1.3563,
1770
+ "rewards/accuracies": 0.737500011920929,
1771
+ "rewards/chosen": -0.08945433795452118,
1772
+ "rewards/margins": 0.25244590640068054,
1773
+ "rewards/rejected": -0.3419002592563629,
1774
+ "step": 1070
1775
+ },
1776
+ {
1777
+ "epoch": 0.7065750736015701,
1778
+ "grad_norm": 0.98828125,
1779
+ "learning_rate": 1.1992369501944096e-06,
1780
+ "logits/chosen": -2.3222763538360596,
1781
+ "logits/rejected": -2.1950221061706543,
1782
+ "logps/chosen": -1.0542547702789307,
1783
+ "logps/rejected": -2.0735292434692383,
1784
+ "loss": 1.362,
1785
+ "rewards/accuracies": 0.7749999761581421,
1786
+ "rewards/chosen": -0.04106102138757706,
1787
+ "rewards/margins": 0.2844560146331787,
1788
+ "rewards/rejected": -0.32551708817481995,
1789
+ "step": 1080
1790
+ },
1791
+ {
1792
+ "epoch": 0.7131174353941773,
1793
+ "grad_norm": 1.6328125,
1794
+ "learning_rate": 1.1508014522784803e-06,
1795
+ "logits/chosen": -2.225466251373291,
1796
+ "logits/rejected": -2.114065408706665,
1797
+ "logps/chosen": -1.1339337825775146,
1798
+ "logps/rejected": -1.9321873188018799,
1799
+ "loss": 1.3505,
1800
+ "rewards/accuracies": 0.7124999761581421,
1801
+ "rewards/chosen": -0.07440061867237091,
1802
+ "rewards/margins": 0.20720815658569336,
1803
+ "rewards/rejected": -0.2816087603569031,
1804
+ "step": 1090
1805
+ },
1806
+ {
1807
+ "epoch": 0.7196597971867844,
1808
+ "grad_norm": 1.2109375,
1809
+ "learning_rate": 1.1030702440368319e-06,
1810
+ "logits/chosen": -2.1251111030578613,
1811
+ "logits/rejected": -2.109551429748535,
1812
+ "logps/chosen": -1.254265546798706,
1813
+ "logps/rejected": -2.5865750312805176,
1814
+ "loss": 1.3483,
1815
+ "rewards/accuracies": 0.7875000238418579,
1816
+ "rewards/chosen": -0.0920356959104538,
1817
+ "rewards/margins": 0.36417168378829956,
1818
+ "rewards/rejected": -0.4562074542045593,
1819
+ "step": 1100
1820
+ },
1821
+ {
1822
+ "epoch": 0.7196597971867844,
1823
+ "eval_logits/chosen": -2.059796094894409,
1824
+ "eval_logits/rejected": -1.9802155494689941,
1825
+ "eval_logps/chosen": -1.1471033096313477,
1826
+ "eval_logps/rejected": -2.3047683238983154,
1827
+ "eval_loss": 1.3557965755462646,
1828
+ "eval_rewards/accuracies": 0.7024999856948853,
1829
+ "eval_rewards/chosen": -0.06647168844938278,
1830
+ "eval_rewards/margins": 0.30758801102638245,
1831
+ "eval_rewards/rejected": -0.37405967712402344,
1832
+ "eval_runtime": 192.5505,
1833
+ "eval_samples_per_second": 10.387,
1834
+ "eval_steps_per_second": 0.519,
1835
+ "step": 1100
1836
+ },
1837
+ {
1838
+ "epoch": 0.7262021589793916,
1839
+ "grad_norm": 1.9609375,
1840
+ "learning_rate": 1.0560682414443315e-06,
1841
+ "logits/chosen": -2.192129135131836,
1842
+ "logits/rejected": -2.050372362136841,
1843
+ "logps/chosen": -1.2834129333496094,
1844
+ "logps/rejected": -2.1301047801971436,
1845
+ "loss": 1.3589,
1846
+ "rewards/accuracies": 0.762499988079071,
1847
+ "rewards/chosen": -0.08568480610847473,
1848
+ "rewards/margins": 0.2297004908323288,
1849
+ "rewards/rejected": -0.31538528203964233,
1850
+ "step": 1110
1851
+ },
1852
+ {
1853
+ "epoch": 0.7327445207719987,
1854
+ "grad_norm": 1.03125,
1855
+ "learning_rate": 1.009819979826156e-06,
1856
+ "logits/chosen": -2.202493190765381,
1857
+ "logits/rejected": -2.130976438522339,
1858
+ "logps/chosen": -1.107555627822876,
1859
+ "logps/rejected": -2.4273107051849365,
1860
+ "loss": 1.3471,
1861
+ "rewards/accuracies": 0.675000011920929,
1862
+ "rewards/chosen": -0.048850033432245255,
1863
+ "rewards/margins": 0.3565898835659027,
1864
+ "rewards/rejected": -0.40544000267982483,
1865
+ "step": 1120
1866
+ },
1867
+ {
1868
+ "epoch": 0.7392868825646058,
1869
+ "grad_norm": 1.5703125,
1870
+ "learning_rate": 9.643496010502054e-07,
1871
+ "logits/chosen": -2.088268756866455,
1872
+ "logits/rejected": -2.066437244415283,
1873
+ "logps/chosen": -1.4891324043273926,
1874
+ "logps/rejected": -2.3021399974823,
1875
+ "loss": 1.3614,
1876
+ "rewards/accuracies": 0.699999988079071,
1877
+ "rewards/chosen": -0.12404866516590118,
1878
+ "rewards/margins": 0.20802974700927734,
1879
+ "rewards/rejected": -0.33207839727401733,
1880
+ "step": 1130
1881
+ },
1882
+ {
1883
+ "epoch": 0.745829244357213,
1884
+ "grad_norm": 0.87109375,
1885
+ "learning_rate": 9.196808409249086e-07,
1886
+ "logits/chosen": -2.1621835231781006,
1887
+ "logits/rejected": -2.047921657562256,
1888
+ "logps/chosen": -1.2306039333343506,
1889
+ "logps/rejected": -2.0379228591918945,
1890
+ "loss": 1.3603,
1891
+ "rewards/accuracies": 0.675000011920929,
1892
+ "rewards/chosen": -0.07890699058771133,
1893
+ "rewards/margins": 0.23432865738868713,
1894
+ "rewards/rejected": -0.3132356107234955,
1895
+ "step": 1140
1896
+ },
1897
+ {
1898
+ "epoch": 0.7523716061498201,
1899
+ "grad_norm": 1.390625,
1900
+ "learning_rate": 8.758370168089797e-07,
1901
+ "logits/chosen": -2.1358230113983154,
1902
+ "logits/rejected": -2.1172406673431396,
1903
+ "logps/chosen": -1.055590271949768,
1904
+ "logps/rejected": -2.3818840980529785,
1905
+ "loss": 1.3584,
1906
+ "rewards/accuracies": 0.7250000238418579,
1907
+ "rewards/chosen": -0.07134061306715012,
1908
+ "rewards/margins": 0.3324960172176361,
1909
+ "rewards/rejected": -0.4038366377353668,
1910
+ "step": 1150
1911
+ },
1912
+ {
1913
+ "epoch": 0.7589139679424273,
1914
+ "grad_norm": 2.09375,
1915
+ "learning_rate": 8.328410154396318e-07,
1916
+ "logits/chosen": -2.1972057819366455,
1917
+ "logits/rejected": -2.175696849822998,
1918
+ "logps/chosen": -1.1727460622787476,
1919
+ "logps/rejected": -2.109738349914551,
1920
+ "loss": 1.3503,
1921
+ "rewards/accuracies": 0.6875,
1922
+ "rewards/chosen": -0.048242855817079544,
1923
+ "rewards/margins": 0.2853914201259613,
1924
+ "rewards/rejected": -0.33363425731658936,
1925
+ "step": 1160
1926
+ },
1927
+ {
1928
+ "epoch": 0.7654563297350343,
1929
+ "grad_norm": 1.3203125,
1930
+ "learning_rate": 7.907152809855529e-07,
1931
+ "logits/chosen": -2.1163055896759033,
1932
+ "logits/rejected": -2.045872926712036,
1933
+ "logps/chosen": -1.347551941871643,
1934
+ "logps/rejected": -2.065335750579834,
1935
+ "loss": 1.3635,
1936
+ "rewards/accuracies": 0.6499999761581421,
1937
+ "rewards/chosen": -0.12098999321460724,
1938
+ "rewards/margins": 0.18405140936374664,
1939
+ "rewards/rejected": -0.3050413727760315,
1940
+ "step": 1170
1941
+ },
1942
+ {
1943
+ "epoch": 0.7719986915276414,
1944
+ "grad_norm": 1.875,
1945
+ "learning_rate": 7.494818033309207e-07,
1946
+ "logits/chosen": -2.158379554748535,
1947
+ "logits/rejected": -2.000858783721924,
1948
+ "logps/chosen": -1.1294301748275757,
1949
+ "logps/rejected": -1.8507928848266602,
1950
+ "loss": 1.355,
1951
+ "rewards/accuracies": 0.637499988079071,
1952
+ "rewards/chosen": -0.06103038787841797,
1953
+ "rewards/margins": 0.1857200264930725,
1954
+ "rewards/rejected": -0.24675039947032928,
1955
+ "step": 1180
1956
+ },
1957
+ {
1958
+ "epoch": 0.7785410533202486,
1959
+ "grad_norm": 0.6953125,
1960
+ "learning_rate": 7.091621065965521e-07,
1961
+ "logits/chosen": -2.1394495964050293,
1962
+ "logits/rejected": -2.115633726119995,
1963
+ "logps/chosen": -1.1215696334838867,
1964
+ "logps/rejected": -2.1693038940429688,
1965
+ "loss": 1.3518,
1966
+ "rewards/accuracies": 0.675000011920929,
1967
+ "rewards/chosen": -0.060358185321092606,
1968
+ "rewards/margins": 0.26639682054519653,
1969
+ "rewards/rejected": -0.32675498723983765,
1970
+ "step": 1190
1971
+ },
1972
+ {
1973
+ "epoch": 0.7850834151128557,
1974
+ "grad_norm": 1.1328125,
1975
+ "learning_rate": 6.697772379041823e-07,
1976
+ "logits/chosen": -2.1604604721069336,
1977
+ "logits/rejected": -2.116100788116455,
1978
+ "logps/chosen": -1.2930102348327637,
1979
+ "logps/rejected": -2.0868096351623535,
1980
+ "loss": 1.3558,
1981
+ "rewards/accuracies": 0.7749999761581421,
1982
+ "rewards/chosen": -0.0996454581618309,
1983
+ "rewards/margins": 0.2223787009716034,
1984
+ "rewards/rejected": -0.3220241367816925,
1985
+ "step": 1200
1986
+ },
1987
+ {
1988
+ "epoch": 0.7850834151128557,
1989
+ "eval_logits/chosen": -2.052238702774048,
1990
+ "eval_logits/rejected": -1.9719146490097046,
1991
+ "eval_logps/chosen": -1.1348469257354736,
1992
+ "eval_logps/rejected": -2.2732903957366943,
1993
+ "eval_loss": 1.3541505336761475,
1994
+ "eval_rewards/accuracies": 0.7049999833106995,
1995
+ "eval_rewards/chosen": -0.0627947449684143,
1996
+ "eval_rewards/margins": 0.30182158946990967,
1997
+ "eval_rewards/rejected": -0.3646163046360016,
1998
+ "eval_runtime": 192.5468,
1999
+ "eval_samples_per_second": 10.387,
2000
+ "eval_steps_per_second": 0.519,
2001
+ "step": 1200
2002
+ },
2003
+ {
2004
+ "epoch": 0.7916257769054629,
2005
+ "grad_norm": 2.140625,
2006
+ "learning_rate": 6.313477563897466e-07,
2007
+ "logits/chosen": -2.198477268218994,
2008
+ "logits/rejected": -2.16310453414917,
2009
+ "logps/chosen": -1.0447008609771729,
2010
+ "logps/rejected": -2.3631505966186523,
2011
+ "loss": 1.3481,
2012
+ "rewards/accuracies": 0.762499988079071,
2013
+ "rewards/chosen": -0.05016545578837395,
2014
+ "rewards/margins": 0.3603706955909729,
2015
+ "rewards/rejected": -0.4105361998081207,
2016
+ "step": 1210
2017
+ },
2018
+ {
2019
+ "epoch": 0.79816813869807,
2020
+ "grad_norm": 0.7890625,
2021
+ "learning_rate": 5.9389372247138e-07,
2022
+ "logits/chosen": -2.1831748485565186,
2023
+ "logits/rejected": -2.049494504928589,
2024
+ "logps/chosen": -1.0988541841506958,
2025
+ "logps/rejected": -2.0942153930664062,
2026
+ "loss": 1.3416,
2027
+ "rewards/accuracies": 0.737500011920929,
2028
+ "rewards/chosen": -0.08485239744186401,
2029
+ "rewards/margins": 0.2300022840499878,
2030
+ "rewards/rejected": -0.3148546814918518,
2031
+ "step": 1220
2032
+ },
2033
+ {
2034
+ "epoch": 0.8047105004906772,
2035
+ "grad_norm": 1.578125,
2036
+ "learning_rate": 5.574346873777714e-07,
2037
+ "logits/chosen": -2.1483089923858643,
2038
+ "logits/rejected": -2.146624803543091,
2039
+ "logps/chosen": -1.1647026538848877,
2040
+ "logps/rejected": -2.1079747676849365,
2041
+ "loss": 1.3511,
2042
+ "rewards/accuracies": 0.762499988079071,
2043
+ "rewards/chosen": -0.05951924994587898,
2044
+ "rewards/margins": 0.2496817409992218,
2045
+ "rewards/rejected": -0.3092009723186493,
2046
+ "step": 1230
2047
+ },
2048
+ {
2049
+ "epoch": 0.8112528622832843,
2050
+ "grad_norm": 1.703125,
2051
+ "learning_rate": 5.219896829422927e-07,
2052
+ "logits/chosen": -2.1868844032287598,
2053
+ "logits/rejected": -2.071396589279175,
2054
+ "logps/chosen": -1.1010544300079346,
2055
+ "logps/rejected": -2.396719455718994,
2056
+ "loss": 1.3481,
2057
+ "rewards/accuracies": 0.7875000238418579,
2058
+ "rewards/chosen": -0.06258542090654373,
2059
+ "rewards/margins": 0.3550402522087097,
2060
+ "rewards/rejected": -0.41762566566467285,
2061
+ "step": 1240
2062
+ },
2063
+ {
2064
+ "epoch": 0.8177952240758914,
2065
+ "grad_norm": 0.8984375,
2066
+ "learning_rate": 4.875772116682817e-07,
2067
+ "logits/chosen": -2.044847249984741,
2068
+ "logits/rejected": -1.9548499584197998,
2069
+ "logps/chosen": -1.1090381145477295,
2070
+ "logps/rejected": -2.3034088611602783,
2071
+ "loss": 1.3592,
2072
+ "rewards/accuracies": 0.7124999761581421,
2073
+ "rewards/chosen": -0.07034337520599365,
2074
+ "rewards/margins": 0.2901187539100647,
2075
+ "rewards/rejected": -0.36046212911605835,
2076
+ "step": 1250
2077
+ },
2078
+ {
2079
+ "epoch": 0.8243375858684985,
2080
+ "grad_norm": 1.671875,
2081
+ "learning_rate": 4.542152370706149e-07,
2082
+ "logits/chosen": -2.072751522064209,
2083
+ "logits/rejected": -1.9732776880264282,
2084
+ "logps/chosen": -1.3094862699508667,
2085
+ "logps/rejected": -2.3592586517333984,
2086
+ "loss": 1.3592,
2087
+ "rewards/accuracies": 0.637499988079071,
2088
+ "rewards/chosen": -0.12541761994361877,
2089
+ "rewards/margins": 0.2671012580394745,
2090
+ "rewards/rejected": -0.3925188481807709,
2091
+ "step": 1260
2092
+ },
2093
+ {
2094
+ "epoch": 0.8308799476611056,
2095
+ "grad_norm": 0.98046875,
2096
+ "learning_rate": 4.2192117429865067e-07,
2097
+ "logits/chosen": -1.9732551574707031,
2098
+ "logits/rejected": -1.8982023000717163,
2099
+ "logps/chosen": -1.216827630996704,
2100
+ "logps/rejected": -2.2013864517211914,
2101
+ "loss": 1.3502,
2102
+ "rewards/accuracies": 0.675000011920929,
2103
+ "rewards/chosen": -0.07743276655673981,
2104
+ "rewards/margins": 0.25767821073532104,
2105
+ "rewards/rejected": -0.33511093258857727,
2106
+ "step": 1270
2107
+ },
2108
+ {
2109
+ "epoch": 0.8374223094537128,
2110
+ "grad_norm": 4.125,
2111
+ "learning_rate": 3.907118810454172e-07,
2112
+ "logits/chosen": -2.127220869064331,
2113
+ "logits/rejected": -2.117337703704834,
2114
+ "logps/chosen": -1.1793607473373413,
2115
+ "logps/rejected": -2.3696749210357666,
2116
+ "loss": 1.3544,
2117
+ "rewards/accuracies": 0.800000011920929,
2118
+ "rewards/chosen": -0.07967950403690338,
2119
+ "rewards/margins": 0.31056874990463257,
2120
+ "rewards/rejected": -0.39024823904037476,
2121
+ "step": 1280
2122
+ },
2123
+ {
2124
+ "epoch": 0.8439646712463199,
2125
+ "grad_norm": 1.578125,
2126
+ "learning_rate": 3.6060364874779455e-07,
2127
+ "logits/chosen": -2.1260552406311035,
2128
+ "logits/rejected": -2.1024839878082275,
2129
+ "logps/chosen": -1.387900471687317,
2130
+ "logps/rejected": -2.1651806831359863,
2131
+ "loss": 1.3659,
2132
+ "rewards/accuracies": 0.6625000238418579,
2133
+ "rewards/chosen": -0.10205866396427155,
2134
+ "rewards/margins": 0.2208395004272461,
2135
+ "rewards/rejected": -0.32289814949035645,
2136
+ "step": 1290
2137
+ },
2138
+ {
2139
+ "epoch": 0.850507033038927,
2140
+ "grad_norm": 0.76171875,
2141
+ "learning_rate": 3.3161219408229026e-07,
2142
+ "logits/chosen": -2.2469534873962402,
2143
+ "logits/rejected": -2.171238422393799,
2144
+ "logps/chosen": -1.35056471824646,
2145
+ "logps/rejected": -2.452230453491211,
2146
+ "loss": 1.3515,
2147
+ "rewards/accuracies": 0.7749999761581421,
2148
+ "rewards/chosen": -0.1100931391119957,
2149
+ "rewards/margins": 0.2768345773220062,
2150
+ "rewards/rejected": -0.3869277536869049,
2151
+ "step": 1300
2152
+ },
2153
+ {
2154
+ "epoch": 0.850507033038927,
2155
+ "eval_logits/chosen": -2.0505053997039795,
2156
+ "eval_logits/rejected": -1.9694212675094604,
2157
+ "eval_logps/chosen": -1.1402031183242798,
2158
+ "eval_logps/rejected": -2.2917935848236084,
2159
+ "eval_loss": 1.3543208837509155,
2160
+ "eval_rewards/accuracies": 0.7049999833106995,
2161
+ "eval_rewards/chosen": -0.06440159678459167,
2162
+ "eval_rewards/margins": 0.30576565861701965,
2163
+ "eval_rewards/rejected": -0.3701672852039337,
2164
+ "eval_runtime": 192.512,
2165
+ "eval_samples_per_second": 10.389,
2166
+ "eval_steps_per_second": 0.519,
2167
+ "step": 1300
2168
+ },
2169
+ {
2170
+ "epoch": 0.8570493948315342,
2171
+ "grad_norm": 1.1640625,
2172
+ "learning_rate": 3.0375265076083796e-07,
2173
+ "logits/chosen": -2.238908529281616,
2174
+ "logits/rejected": -2.219423770904541,
2175
+ "logps/chosen": -1.4968189001083374,
2176
+ "logps/rejected": -1.8981587886810303,
2177
+ "loss": 1.3654,
2178
+ "rewards/accuracies": 0.6875,
2179
+ "rewards/chosen": -0.1579573005437851,
2180
+ "rewards/margins": 0.11951179802417755,
2181
+ "rewards/rejected": -0.27746909856796265,
2182
+ "step": 1310
2183
+ },
2184
+ {
2185
+ "epoch": 0.8635917566241413,
2186
+ "grad_norm": 1.234375,
2187
+ "learning_rate": 2.7703956163091153e-07,
2188
+ "logits/chosen": -2.2014071941375732,
2189
+ "logits/rejected": -2.1226069927215576,
2190
+ "logps/chosen": -1.133750081062317,
2191
+ "logps/rejected": -2.1364989280700684,
2192
+ "loss": 1.3546,
2193
+ "rewards/accuracies": 0.675000011920929,
2194
+ "rewards/chosen": -0.06972350180149078,
2195
+ "rewards/margins": 0.25297701358795166,
2196
+ "rewards/rejected": -0.32270053029060364,
2197
+ "step": 1320
2198
+ },
2199
+ {
2200
+ "epoch": 0.8701341184167485,
2201
+ "grad_norm": 0.58203125,
2202
+ "learning_rate": 2.514868710840723e-07,
2203
+ "logits/chosen": -2.192532777786255,
2204
+ "logits/rejected": -2.1196610927581787,
2205
+ "logps/chosen": -1.2276318073272705,
2206
+ "logps/rejected": -1.9988428354263306,
2207
+ "loss": 1.3551,
2208
+ "rewards/accuracies": 0.6499999761581421,
2209
+ "rewards/chosen": -0.061108749359846115,
2210
+ "rewards/margins": 0.2282850742340088,
2211
+ "rewards/rejected": -0.2893938422203064,
2212
+ "step": 1330
2213
+ },
2214
+ {
2215
+ "epoch": 0.8766764802093556,
2216
+ "grad_norm": 1.65625,
2217
+ "learning_rate": 2.271079177769117e-07,
2218
+ "logits/chosen": -2.122554302215576,
2219
+ "logits/rejected": -2.1153998374938965,
2220
+ "logps/chosen": -1.290633201599121,
2221
+ "logps/rejected": -2.0642950534820557,
2222
+ "loss": 1.3459,
2223
+ "rewards/accuracies": 0.7124999761581421,
2224
+ "rewards/chosen": -0.09484368562698364,
2225
+ "rewards/margins": 0.19020384550094604,
2226
+ "rewards/rejected": -0.2850475609302521,
2227
+ "step": 1340
2228
+ },
2229
+ {
2230
+ "epoch": 0.8832188420019627,
2231
+ "grad_norm": 1.3125,
2232
+ "learning_rate": 2.0391542766819456e-07,
2233
+ "logits/chosen": -2.1564154624938965,
2234
+ "logits/rejected": -1.9759187698364258,
2235
+ "logps/chosen": -1.3263826370239258,
2236
+ "logps/rejected": -2.2100300788879395,
2237
+ "loss": 1.3491,
2238
+ "rewards/accuracies": 0.675000011920929,
2239
+ "rewards/chosen": -0.10235595703125,
2240
+ "rewards/margins": 0.22581568360328674,
2241
+ "rewards/rejected": -0.32817164063453674,
2242
+ "step": 1350
2243
+ },
2244
+ {
2245
+ "epoch": 0.8897612037945698,
2246
+ "grad_norm": 1.515625,
2247
+ "learning_rate": 1.8192150737583264e-07,
2248
+ "logits/chosen": -2.180866241455078,
2249
+ "logits/rejected": -2.0520682334899902,
2250
+ "logps/chosen": -1.0884932279586792,
2251
+ "logps/rejected": -2.1588473320007324,
2252
+ "loss": 1.3511,
2253
+ "rewards/accuracies": 0.7875000238418579,
2254
+ "rewards/chosen": -0.04868602380156517,
2255
+ "rewards/margins": 0.29052504897117615,
2256
+ "rewards/rejected": -0.33921104669570923,
2257
+ "step": 1360
2258
+ },
2259
+ {
2260
+ "epoch": 0.8963035655871769,
2261
+ "grad_norm": 1.7421875,
2262
+ "learning_rate": 1.61137637857158e-07,
2263
+ "logits/chosen": -2.253401517868042,
2264
+ "logits/rejected": -2.1362268924713135,
2265
+ "logps/chosen": -1.2787913084030151,
2266
+ "logps/rejected": -1.7992349863052368,
2267
+ "loss": 1.3467,
2268
+ "rewards/accuracies": 0.7124999761581421,
2269
+ "rewards/chosen": -0.0960894376039505,
2270
+ "rewards/margins": 0.15551379323005676,
2271
+ "rewards/rejected": -0.25160321593284607,
2272
+ "step": 1370
2273
+ },
2274
+ {
2275
+ "epoch": 0.9028459273797841,
2276
+ "grad_norm": 0.8671875,
2277
+ "learning_rate": 1.415746684157951e-07,
2278
+ "logits/chosen": -2.173496723175049,
2279
+ "logits/rejected": -2.1127450466156006,
2280
+ "logps/chosen": -0.9452150464057922,
2281
+ "logps/rejected": -1.9824330806732178,
2282
+ "loss": 1.34,
2283
+ "rewards/accuracies": 0.7250000238418579,
2284
+ "rewards/chosen": -0.03502793237566948,
2285
+ "rewards/margins": 0.2680448889732361,
2286
+ "rewards/rejected": -0.3030727803707123,
2287
+ "step": 1380
2288
+ },
2289
+ {
2290
+ "epoch": 0.9093882891723912,
2291
+ "grad_norm": 1.890625,
2292
+ "learning_rate": 1.232428110382586e-07,
2293
+ "logits/chosen": -2.2799248695373535,
2294
+ "logits/rejected": -2.1891300678253174,
2295
+ "logps/chosen": -1.111943006515503,
2296
+ "logps/rejected": -2.683452606201172,
2297
+ "loss": 1.3477,
2298
+ "rewards/accuracies": 0.75,
2299
+ "rewards/chosen": -0.05621108412742615,
2300
+ "rewards/margins": 0.4199764132499695,
2301
+ "rewards/rejected": -0.47618746757507324,
2302
+ "step": 1390
2303
+ },
2304
+ {
2305
+ "epoch": 0.9159306509649984,
2306
+ "grad_norm": 1.4921875,
2307
+ "learning_rate": 1.0615163506323856e-07,
2308
+ "logits/chosen": -2.187314510345459,
2309
+ "logits/rejected": -2.100705623626709,
2310
+ "logps/chosen": -1.0987522602081299,
2311
+ "logps/rejected": -2.2279646396636963,
2312
+ "loss": 1.3572,
2313
+ "rewards/accuracies": 0.6625000238418579,
2314
+ "rewards/chosen": -0.07077629119157791,
2315
+ "rewards/margins": 0.282015860080719,
2316
+ "rewards/rejected": -0.3527921736240387,
2317
+ "step": 1400
2318
+ },
2319
+ {
2320
+ "epoch": 0.9159306509649984,
2321
+ "eval_logits/chosen": -2.0522220134735107,
2322
+ "eval_logits/rejected": -1.9716111421585083,
2323
+ "eval_logps/chosen": -1.1385436058044434,
2324
+ "eval_logps/rejected": -2.282519817352295,
2325
+ "eval_loss": 1.3540371656417847,
2326
+ "eval_rewards/accuracies": 0.7074999809265137,
2327
+ "eval_rewards/chosen": -0.0639037936925888,
2328
+ "eval_rewards/margins": 0.3034813106060028,
2329
+ "eval_rewards/rejected": -0.3673850893974304,
2330
+ "eval_runtime": 192.5536,
2331
+ "eval_samples_per_second": 10.387,
2332
+ "eval_steps_per_second": 0.519,
2333
+ "step": 1400
2334
+ },
2335
+ {
2336
+ "epoch": 0.9224730127576055,
2337
+ "grad_norm": 1.34375,
2338
+ "learning_rate": 9.031006218634342e-08,
2339
+ "logits/chosen": -2.1624093055725098,
2340
+ "logits/rejected": -2.102839708328247,
2341
+ "logps/chosen": -1.2127426862716675,
2342
+ "logps/rejected": -2.093308210372925,
2343
+ "loss": 1.3509,
2344
+ "rewards/accuracies": 0.737500011920929,
2345
+ "rewards/chosen": -0.07421986013650894,
2346
+ "rewards/margins": 0.24446694552898407,
2347
+ "rewards/rejected": -0.3186867833137512,
2348
+ "step": 1410
2349
+ },
2350
+ {
2351
+ "epoch": 0.9290153745502127,
2352
+ "grad_norm": 0.734375,
2353
+ "learning_rate": 7.572636180292831e-08,
2354
+ "logits/chosen": -2.111999273300171,
2355
+ "logits/rejected": -2.043356418609619,
2356
+ "logps/chosen": -1.2025669813156128,
2357
+ "logps/rejected": -2.113124370574951,
2358
+ "loss": 1.3489,
2359
+ "rewards/accuracies": 0.7124999761581421,
2360
+ "rewards/chosen": -0.08289094269275665,
2361
+ "rewards/margins": 0.25174033641815186,
2362
+ "rewards/rejected": -0.3346312642097473,
2363
+ "step": 1420
2364
+ },
2365
+ {
2366
+ "epoch": 0.9355577363428198,
2367
+ "grad_norm": 0.8984375,
2368
+ "learning_rate": 6.240814669141559e-08,
2369
+ "logits/chosen": -2.1452536582946777,
2370
+ "logits/rejected": -2.118813991546631,
2371
+ "logps/chosen": -1.1873977184295654,
2372
+ "logps/rejected": -2.4279136657714844,
2373
+ "loss": 1.3462,
2374
+ "rewards/accuracies": 0.6875,
2375
+ "rewards/chosen": -0.07043514400720596,
2376
+ "rewards/margins": 0.3395000398159027,
2377
+ "rewards/rejected": -0.4099351763725281,
2378
+ "step": 1430
2379
+ },
2380
+ {
2381
+ "epoch": 0.9421000981354269,
2382
+ "grad_norm": 2.34375,
2383
+ "learning_rate": 5.036236903938285e-08,
2384
+ "logits/chosen": -2.224623918533325,
2385
+ "logits/rejected": -2.127098560333252,
2386
+ "logps/chosen": -1.1137665510177612,
2387
+ "logps/rejected": -1.9331905841827393,
2388
+ "loss": 1.348,
2389
+ "rewards/accuracies": 0.675000011920929,
2390
+ "rewards/chosen": -0.05770399048924446,
2391
+ "rewards/margins": 0.2274278849363327,
2392
+ "rewards/rejected": -0.28513190150260925,
2393
+ "step": 1440
2394
+ },
2395
+ {
2396
+ "epoch": 0.948642459928034,
2397
+ "grad_norm": 0.80078125,
2398
+ "learning_rate": 3.959531681447859e-08,
2399
+ "logits/chosen": -2.0762100219726562,
2400
+ "logits/rejected": -2.0221409797668457,
2401
+ "logps/chosen": -1.2568187713623047,
2402
+ "logps/rejected": -2.275439739227295,
2403
+ "loss": 1.3529,
2404
+ "rewards/accuracies": 0.6625000238418579,
2405
+ "rewards/chosen": -0.08186222612857819,
2406
+ "rewards/margins": 0.28715780377388,
2407
+ "rewards/rejected": -0.3690200448036194,
2408
+ "step": 1450
2409
+ },
2410
+ {
2411
+ "epoch": 0.9551848217206411,
2412
+ "grad_norm": 0.83203125,
2413
+ "learning_rate": 3.0112610482064544e-08,
2414
+ "logits/chosen": -2.2796406745910645,
2415
+ "logits/rejected": -2.173074722290039,
2416
+ "logps/chosen": -1.1611659526824951,
2417
+ "logps/rejected": -2.0751075744628906,
2418
+ "loss": 1.3478,
2419
+ "rewards/accuracies": 0.7250000238418579,
2420
+ "rewards/chosen": -0.07703422009944916,
2421
+ "rewards/margins": 0.24533399939537048,
2422
+ "rewards/rejected": -0.32236820459365845,
2423
+ "step": 1460
2424
+ },
2425
+ {
2426
+ "epoch": 0.9617271835132483,
2427
+ "grad_norm": 0.77734375,
2428
+ "learning_rate": 2.1919200071301715e-08,
2429
+ "logits/chosen": -2.3272135257720947,
2430
+ "logits/rejected": -2.1798338890075684,
2431
+ "logps/chosen": -1.190741777420044,
2432
+ "logps/rejected": -2.2400176525115967,
2433
+ "loss": 1.3635,
2434
+ "rewards/accuracies": 0.7124999761581421,
2435
+ "rewards/chosen": -0.07553628087043762,
2436
+ "rewards/margins": 0.28841260075569153,
2437
+ "rewards/rejected": -0.36394888162612915,
2438
+ "step": 1470
2439
+ },
2440
+ {
2441
+ "epoch": 0.9682695453058554,
2442
+ "grad_norm": 0.90625,
2443
+ "learning_rate": 1.50193625912029e-08,
2444
+ "logits/chosen": -2.059523105621338,
2445
+ "logits/rejected": -2.071715831756592,
2446
+ "logps/chosen": -1.2252368927001953,
2447
+ "logps/rejected": -2.461535692214966,
2448
+ "loss": 1.3431,
2449
+ "rewards/accuracies": 0.737500011920929,
2450
+ "rewards/chosen": -0.08799092471599579,
2451
+ "rewards/margins": 0.312566339969635,
2452
+ "rewards/rejected": -0.4005572199821472,
2453
+ "step": 1480
2454
+ },
2455
+ {
2456
+ "epoch": 0.9748119070984625,
2457
+ "grad_norm": 1.421875,
2458
+ "learning_rate": 9.416699798010521e-09,
2459
+ "logits/chosen": -2.070498466491699,
2460
+ "logits/rejected": -1.9673669338226318,
2461
+ "logps/chosen": -1.2495880126953125,
2462
+ "logps/rejected": -2.9194693565368652,
2463
+ "loss": 1.3353,
2464
+ "rewards/accuracies": 0.8125,
2465
+ "rewards/chosen": -0.07810261845588684,
2466
+ "rewards/margins": 0.4632912278175354,
2467
+ "rewards/rejected": -0.5413938164710999,
2468
+ "step": 1490
2469
+ },
2470
+ {
2471
+ "epoch": 0.9813542688910697,
2472
+ "grad_norm": 2.125,
2473
+ "learning_rate": 5.114136315058083e-09,
2474
+ "logits/chosen": -2.101090908050537,
2475
+ "logits/rejected": -2.0724215507507324,
2476
+ "logps/chosen": -1.1968806982040405,
2477
+ "logps/rejected": -2.266988754272461,
2478
+ "loss": 1.3527,
2479
+ "rewards/accuracies": 0.737500011920929,
2480
+ "rewards/chosen": -0.10394704341888428,
2481
+ "rewards/margins": 0.25496524572372437,
2482
+ "rewards/rejected": -0.35891225934028625,
2483
+ "step": 1500
2484
+ },
2485
+ {
2486
+ "epoch": 0.9813542688910697,
2487
+ "eval_logits/chosen": -2.051304817199707,
2488
+ "eval_logits/rejected": -1.9704300165176392,
2489
+ "eval_logps/chosen": -1.1379503011703491,
2490
+ "eval_logps/rejected": -2.283447027206421,
2491
+ "eval_loss": 1.3540750741958618,
2492
+ "eval_rewards/accuracies": 0.7024999856948853,
2493
+ "eval_rewards/chosen": -0.06372576951980591,
2494
+ "eval_rewards/margins": 0.30393749475479126,
2495
+ "eval_rewards/rejected": -0.36766326427459717,
2496
+ "eval_runtime": 192.5377,
2497
+ "eval_samples_per_second": 10.388,
2498
+ "eval_steps_per_second": 0.519,
2499
+ "step": 1500
2500
+ },
2501
+ {
2502
+ "epoch": 0.9878966306836768,
2503
+ "grad_norm": 1.40625,
2504
+ "learning_rate": 2.113918106098345e-09,
2505
+ "logits/chosen": -2.1591796875,
2506
+ "logits/rejected": -2.158271312713623,
2507
+ "logps/chosen": -1.1788362264633179,
2508
+ "logps/rejected": -1.7396657466888428,
2509
+ "loss": 1.3552,
2510
+ "rewards/accuracies": 0.7250000238418579,
2511
+ "rewards/chosen": -0.05985639616847038,
2512
+ "rewards/margins": 0.1703128218650818,
2513
+ "rewards/rejected": -0.23016922175884247,
2514
+ "step": 1510
2515
+ },
2516
+ {
2517
+ "epoch": 0.994438992476284,
2518
+ "grad_norm": 1.1640625,
2519
+ "learning_rate": 4.176113028983575e-10,
2520
+ "logits/chosen": -2.104264974594116,
2521
+ "logits/rejected": -2.0566983222961426,
2522
+ "logps/chosen": -1.284037709236145,
2523
+ "logps/rejected": -2.4648444652557373,
2524
+ "loss": 1.3592,
2525
+ "rewards/accuracies": 0.699999988079071,
2526
+ "rewards/chosen": -0.09095671027898788,
2527
+ "rewards/margins": 0.30293816328048706,
2528
+ "rewards/rejected": -0.39389482140541077,
2529
+ "step": 1520
2530
+ },
2531
+ {
2532
+ "epoch": 0.9996728819103696,
2533
+ "step": 1528,
2534
+ "total_flos": 0.0,
2535
+ "train_loss": 1.3600662709530735,
2536
+ "train_runtime": 13966.5548,
2537
+ "train_samples_per_second": 4.377,
2538
+ "train_steps_per_second": 0.109
2539
+ }
2540
+ ],
2541
+ "logging_steps": 10,
2542
+ "max_steps": 1528,
2543
+ "num_input_tokens_seen": 0,
2544
+ "num_train_epochs": 1,
2545
+ "save_steps": 100,
2546
+ "stateful_callbacks": {
2547
+ "TrainerControl": {
2548
+ "args": {
2549
+ "should_epoch_stop": false,
2550
+ "should_evaluate": false,
2551
+ "should_log": false,
2552
+ "should_save": true,
2553
+ "should_training_stop": true
2554
+ },
2555
+ "attributes": {}
2556
+ }
2557
+ },
2558
+ "total_flos": 0.0,
2559
+ "train_batch_size": 2,
2560
+ "trial_name": null,
2561
+ "trial_params": null
2562
+ }