scottsuk0306 commited on
Commit
a30ad28
1 Parent(s): 8d227f6

Model save

Browse files
README.md CHANGED
@@ -2,10 +2,6 @@
2
  license: gemma
3
  base_model: google/gemma-2-2b
4
  tags:
5
- - easylm
6
- - trl
7
- - sft
8
- - generated_from_trainer
9
  - trl
10
  - sft
11
  - generated_from_trainer
@@ -23,7 +19,7 @@ should probably proofread and complete it, then remove this comment. -->
23
 
24
  This model is a fine-tuned version of [google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b) on the alpaca_farm dataset.
25
  It achieves the following results on the evaluation set:
26
- - Loss: 0.6649
27
 
28
  ## Model description
29
 
@@ -52,24 +48,24 @@ The following hyperparameters were used during training:
52
  - total_eval_batch_size: 16
53
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
54
  - lr_scheduler_type: cosine
55
- - num_epochs: 1
56
 
57
  ### Training results
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
- | 0.803 | 0.08 | 50 | 0.6921 |
62
- | 0.6815 | 0.16 | 100 | 0.6862 |
63
- | 0.677 | 0.24 | 150 | 0.6829 |
64
- | 0.6501 | 0.32 | 200 | 0.6809 |
65
- | 0.6621 | 0.4 | 250 | 0.6777 |
66
- | 0.6763 | 0.48 | 300 | 0.6747 |
67
- | 0.6611 | 0.56 | 350 | 0.6715 |
68
- | 0.6639 | 0.64 | 400 | 0.6696 |
69
- | 0.6451 | 0.72 | 450 | 0.6675 |
70
- | 0.6664 | 0.8 | 500 | 0.6659 |
71
- | 0.6597 | 0.88 | 550 | 0.6653 |
72
- | 0.652 | 0.96 | 600 | 0.6650 |
73
 
74
 
75
  ### Framework versions
 
2
  license: gemma
3
  base_model: google/gemma-2-2b
4
  tags:
 
 
 
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
 
19
 
20
  This model is a fine-tuned version of [google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b) on the alpaca_farm dataset.
21
  It achieves the following results on the evaluation set:
22
+ - Loss: 0.7070
23
 
24
  ## Model description
25
 
 
48
  - total_eval_batch_size: 16
49
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
50
  - lr_scheduler_type: cosine
51
+ - num_epochs: 2
52
 
53
  ### Training results
54
 
55
  | Training Loss | Epoch | Step | Validation Loss |
56
  |:-------------:|:-----:|:----:|:---------------:|
57
+ | 0.6727 | 0.16 | 100 | 0.6869 |
58
+ | 0.6056 | 0.32 | 200 | 0.6831 |
59
+ | 0.7033 | 0.48 | 300 | 0.6797 |
60
+ | 0.6786 | 0.64 | 400 | 0.6771 |
61
+ | 0.6476 | 0.8 | 500 | 0.6736 |
62
+ | 0.6562 | 0.96 | 600 | 0.6708 |
63
+ | 0.461 | 1.12 | 700 | 0.7041 |
64
+ | 0.4578 | 1.28 | 800 | 0.7093 |
65
+ | 0.4817 | 1.44 | 900 | 0.7055 |
66
+ | 0.4324 | 1.6 | 1000 | 0.7080 |
67
+ | 0.4693 | 1.76 | 1100 | 0.7081 |
68
+ | 0.4475 | 1.92 | 1200 | 0.7070 |
69
 
70
 
71
  ### Framework versions
all_results.json CHANGED
@@ -1,14 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "eval_loss": 0.664948046207428,
4
- "eval_runtime": 7.2125,
5
- "eval_samples": 2000,
6
- "eval_samples_per_second": 277.295,
7
- "eval_steps_per_second": 17.331,
8
- "total_flos": 2.283996277689549e+16,
9
- "train_loss": 0.6787182281494141,
10
- "train_runtime": 858.483,
11
  "train_samples": 10000,
12
- "train_samples_per_second": 11.648,
13
- "train_steps_per_second": 0.728
14
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "total_flos": 4.566411071245517e+16,
4
+ "train_loss": 0.5696872653961181,
5
+ "train_runtime": 1519.873,
 
 
 
 
 
6
  "train_samples": 10000,
7
+ "train_samples_per_second": 13.159,
8
+ "train_steps_per_second": 0.822
9
  }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f81e582f1ed027e599fa58a3c598cc6afff4db5db722eb43bef5862cc94ec6c1
3
  size 4988025760
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:661307a70cfa0d3186eb20155bbd950fd35297d5138a4afdcbd550e94c9eb752
3
  size 4988025760
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5f79596aa5e67b627d084721cbc8f0cb490473d8e45dca514ce04157f3076c1
3
  size 1420339880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d28227d7bf815ed9760efe3c3a0375227a9a0d7be7e189a0ea9a0dee8214db1
3
  size 1420339880
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "epoch": 1.0,
3
- "total_flos": 2.283996277689549e+16,
4
- "train_loss": 0.6787182281494141,
5
- "train_runtime": 858.483,
6
  "train_samples": 10000,
7
- "train_samples_per_second": 11.648,
8
- "train_steps_per_second": 0.728
9
  }
 
1
  {
2
+ "epoch": 2.0,
3
+ "total_flos": 4.566411071245517e+16,
4
+ "train_loss": 0.5696872653961181,
5
+ "train_runtime": 1519.873,
6
  "train_samples": 10000,
7
+ "train_samples_per_second": 13.159,
8
+ "train_steps_per_second": 0.822
9
  }
trainer_state.json CHANGED
@@ -1,207 +1,998 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
- "eval_steps": 50,
6
- "global_step": 625,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.08,
13
- "grad_norm": 4.650625705718994,
14
- "learning_rate": 2.9528747416929465e-06,
15
- "loss": 0.803,
16
- "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  },
18
  {
19
  "epoch": 0.08,
20
- "eval_loss": 0.6921180486679077,
21
- "eval_runtime": 9.1523,
22
- "eval_samples_per_second": 218.525,
23
- "eval_steps_per_second": 13.658,
24
  "step": 50
25
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  {
27
  "epoch": 0.16,
28
- "grad_norm": 4.7580718994140625,
29
- "learning_rate": 2.814460020065795e-06,
30
- "loss": 0.6815,
31
  "step": 100
32
  },
33
  {
34
  "epoch": 0.16,
35
- "eval_loss": 0.6861706972122192,
36
- "eval_runtime": 7.5289,
37
- "eval_samples_per_second": 265.641,
38
- "eval_steps_per_second": 16.603,
39
  "step": 100
40
  },
41
  {
42
- "epoch": 0.24,
43
- "grad_norm": 5.2208075523376465,
44
- "learning_rate": 2.5934529411321173e-06,
45
- "loss": 0.677,
46
- "step": 150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  },
48
  {
49
  "epoch": 0.24,
50
- "eval_loss": 0.6829179525375366,
51
- "eval_runtime": 7.3642,
52
- "eval_samples_per_second": 271.584,
53
- "eval_steps_per_second": 16.974,
54
  "step": 150
55
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  {
57
  "epoch": 0.32,
58
- "grad_norm": 4.9710540771484375,
59
- "learning_rate": 2.303740192468495e-06,
60
- "loss": 0.6501,
61
  "step": 200
62
  },
63
  {
64
  "epoch": 0.32,
65
- "eval_loss": 0.6809178590774536,
66
- "eval_runtime": 7.2004,
67
- "eval_samples_per_second": 277.764,
68
- "eval_steps_per_second": 17.36,
69
  "step": 200
70
  },
71
  {
72
- "epoch": 0.4,
73
- "grad_norm": 4.636897563934326,
74
- "learning_rate": 1.963525491562421e-06,
75
- "loss": 0.6621,
76
- "step": 250
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  },
78
  {
79
  "epoch": 0.4,
80
- "eval_loss": 0.6777489185333252,
81
- "eval_runtime": 7.2539,
82
- "eval_samples_per_second": 275.715,
83
- "eval_steps_per_second": 17.232,
84
  "step": 250
85
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  {
87
  "epoch": 0.48,
88
- "grad_norm": 4.727797985076904,
89
- "learning_rate": 1.5941857792939703e-06,
90
- "loss": 0.6763,
91
  "step": 300
92
  },
93
  {
94
  "epoch": 0.48,
95
- "eval_loss": 0.6747137904167175,
96
- "eval_runtime": 11.709,
97
- "eval_samples_per_second": 170.809,
98
- "eval_steps_per_second": 10.676,
99
  "step": 300
100
  },
101
  {
102
- "epoch": 0.56,
103
- "grad_norm": 4.318193435668945,
104
- "learning_rate": 1.2189280281214128e-06,
105
- "loss": 0.6611,
106
- "step": 350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  },
108
  {
109
  "epoch": 0.56,
110
- "eval_loss": 0.6715142726898193,
111
- "eval_runtime": 7.8218,
112
- "eval_samples_per_second": 255.696,
113
- "eval_steps_per_second": 15.981,
114
  "step": 350
115
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  {
117
  "epoch": 0.64,
118
- "grad_norm": 4.506664276123047,
119
- "learning_rate": 8.613310626523911e-07,
120
- "loss": 0.6639,
121
  "step": 400
122
  },
123
  {
124
  "epoch": 0.64,
125
- "eval_loss": 0.6695653200149536,
126
- "eval_runtime": 10.9124,
127
- "eval_samples_per_second": 183.277,
128
- "eval_steps_per_second": 11.455,
129
  "step": 400
130
  },
131
  {
132
- "epoch": 0.72,
133
- "grad_norm": 4.423295974731445,
134
- "learning_rate": 5.438640153769653e-07,
135
- "loss": 0.6451,
136
- "step": 450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  },
138
  {
139
  "epoch": 0.72,
140
- "eval_loss": 0.6674798727035522,
141
- "eval_runtime": 7.1796,
142
- "eval_samples_per_second": 278.565,
143
- "eval_steps_per_second": 17.41,
144
  "step": 450
145
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  {
147
  "epoch": 0.8,
148
- "grad_norm": 4.426822662353516,
149
- "learning_rate": 2.86474508437579e-07,
150
- "loss": 0.6664,
151
  "step": 500
152
  },
153
  {
154
  "epoch": 0.8,
155
- "eval_loss": 0.6659039855003357,
156
- "eval_runtime": 7.1051,
157
- "eval_samples_per_second": 281.487,
158
- "eval_steps_per_second": 17.593,
159
  "step": 500
160
  },
161
  {
162
- "epoch": 0.88,
163
- "grad_norm": 4.4428253173828125,
164
- "learning_rate": 1.0533527116762298e-07,
165
- "loss": 0.6597,
166
- "step": 550
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  },
168
  {
169
  "epoch": 0.88,
170
- "eval_loss": 0.6652711033821106,
171
- "eval_runtime": 7.3371,
172
- "eval_samples_per_second": 272.586,
173
- "eval_steps_per_second": 17.037,
174
  "step": 550
175
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  {
177
  "epoch": 0.96,
178
- "grad_norm": 4.120204448699951,
179
- "learning_rate": 1.1827948028283353e-08,
180
- "loss": 0.652,
181
  "step": 600
182
  },
183
  {
184
  "epoch": 0.96,
185
- "eval_loss": 0.6650316715240479,
186
- "eval_runtime": 7.6236,
187
- "eval_samples_per_second": 262.344,
188
- "eval_steps_per_second": 16.397,
189
  "step": 600
190
  },
191
  {
192
- "epoch": 1.0,
193
- "step": 625,
194
- "total_flos": 2.283996277689549e+16,
195
- "train_loss": 0.6787182281494141,
196
- "train_runtime": 858.483,
197
- "train_samples_per_second": 11.648,
198
- "train_steps_per_second": 0.728
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  }
200
  ],
201
- "logging_steps": 50,
202
- "max_steps": 625,
203
  "num_input_tokens_seen": 0,
204
- "num_train_epochs": 1,
205
  "save_steps": 500,
206
  "stateful_callbacks": {
207
  "TrainerControl": {
@@ -215,7 +1006,7 @@
215
  "attributes": {}
216
  }
217
  },
218
- "total_flos": 2.283996277689549e+16,
219
  "train_batch_size": 2,
220
  "trial_name": null,
221
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.0,
5
+ "eval_steps": 100,
6
+ "global_step": 1250,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.016,
13
+ "grad_norm": 6.320438861846924,
14
+ "learning_rate": 2.99952628392495e-06,
15
+ "loss": 1.3054,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.032,
20
+ "grad_norm": 5.475564956665039,
21
+ "learning_rate": 2.9981054349090266e-06,
22
+ "loss": 0.6821,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.048,
27
+ "grad_norm": 5.251683235168457,
28
+ "learning_rate": 2.995738350390921e-06,
29
+ "loss": 0.6935,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.064,
34
+ "grad_norm": 5.057413578033447,
35
+ "learning_rate": 2.9924265254719506e-06,
36
+ "loss": 0.6776,
37
+ "step": 40
38
  },
39
  {
40
  "epoch": 0.08,
41
+ "grad_norm": 4.680449962615967,
42
+ "learning_rate": 2.988172051971717e-06,
43
+ "loss": 0.6582,
 
44
  "step": 50
45
  },
46
+ {
47
+ "epoch": 0.096,
48
+ "grad_norm": 5.521228313446045,
49
+ "learning_rate": 2.982977617106871e-06,
50
+ "loss": 0.6972,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.112,
55
+ "grad_norm": 4.839247226715088,
56
+ "learning_rate": 2.9768465017938084e-06,
57
+ "loss": 0.6749,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.128,
62
+ "grad_norm": 5.12637996673584,
63
+ "learning_rate": 2.9697825785763704e-06,
64
+ "loss": 0.6863,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.144,
69
+ "grad_norm": 4.7851152420043945,
70
+ "learning_rate": 2.961790309179866e-06,
71
+ "loss": 0.6785,
72
+ "step": 90
73
+ },
74
  {
75
  "epoch": 0.16,
76
+ "grad_norm": 4.761219501495361,
77
+ "learning_rate": 2.9528747416929465e-06,
78
+ "loss": 0.6727,
79
  "step": 100
80
  },
81
  {
82
  "epoch": 0.16,
83
+ "eval_loss": 0.6868906021118164,
84
+ "eval_runtime": 8.6852,
85
+ "eval_samples_per_second": 230.276,
86
+ "eval_steps_per_second": 14.392,
87
  "step": 100
88
  },
89
  {
90
+ "epoch": 0.176,
91
+ "grad_norm": 4.815207004547119,
92
+ "learning_rate": 2.943041507379129e-06,
93
+ "loss": 0.6787,
94
+ "step": 110
95
+ },
96
+ {
97
+ "epoch": 0.192,
98
+ "grad_norm": 4.663937568664551,
99
+ "learning_rate": 2.9322968171199645e-06,
100
+ "loss": 0.6772,
101
+ "step": 120
102
+ },
103
+ {
104
+ "epoch": 0.208,
105
+ "grad_norm": 5.208789348602295,
106
+ "learning_rate": 2.9206474574921165e-06,
107
+ "loss": 0.656,
108
+ "step": 130
109
+ },
110
+ {
111
+ "epoch": 0.224,
112
+ "grad_norm": 4.607890605926514,
113
+ "learning_rate": 2.9081007864808113e-06,
114
+ "loss": 0.6792,
115
+ "step": 140
116
  },
117
  {
118
  "epoch": 0.24,
119
+ "grad_norm": 5.205310821533203,
120
+ "learning_rate": 2.894664728832377e-06,
121
+ "loss": 0.6989,
 
122
  "step": 150
123
  },
124
+ {
125
+ "epoch": 0.256,
126
+ "grad_norm": 4.700341701507568,
127
+ "learning_rate": 2.8803477710488056e-06,
128
+ "loss": 0.6673,
129
+ "step": 160
130
+ },
131
+ {
132
+ "epoch": 0.272,
133
+ "grad_norm": 4.271303653717041,
134
+ "learning_rate": 2.8651589560274937e-06,
135
+ "loss": 0.6743,
136
+ "step": 170
137
+ },
138
+ {
139
+ "epoch": 0.288,
140
+ "grad_norm": 4.5935468673706055,
141
+ "learning_rate": 2.8491078773495566e-06,
142
+ "loss": 0.6634,
143
+ "step": 180
144
+ },
145
+ {
146
+ "epoch": 0.304,
147
+ "grad_norm": 4.505198955535889,
148
+ "learning_rate": 2.832204673220317e-06,
149
+ "loss": 0.6487,
150
+ "step": 190
151
+ },
152
  {
153
  "epoch": 0.32,
154
+ "grad_norm": 4.946630477905273,
155
+ "learning_rate": 2.814460020065795e-06,
156
+ "loss": 0.6056,
157
  "step": 200
158
  },
159
  {
160
  "epoch": 0.32,
161
+ "eval_loss": 0.6831114292144775,
162
+ "eval_runtime": 9.3989,
163
+ "eval_samples_per_second": 212.791,
164
+ "eval_steps_per_second": 13.299,
165
  "step": 200
166
  },
167
  {
168
+ "epoch": 0.336,
169
+ "grad_norm": 4.913090229034424,
170
+ "learning_rate": 2.795885125789253e-06,
171
+ "loss": 0.652,
172
+ "step": 210
173
+ },
174
+ {
175
+ "epoch": 0.352,
176
+ "grad_norm": 4.906637191772461,
177
+ "learning_rate": 2.776491722692038e-06,
178
+ "loss": 0.6599,
179
+ "step": 220
180
+ },
181
+ {
182
+ "epoch": 0.368,
183
+ "grad_norm": 4.214770317077637,
184
+ "learning_rate": 2.756292060063213e-06,
185
+ "loss": 0.6539,
186
+ "step": 230
187
+ },
188
+ {
189
+ "epoch": 0.384,
190
+ "grad_norm": 4.808592796325684,
191
+ "learning_rate": 2.735298896442641e-06,
192
+ "loss": 0.6666,
193
+ "step": 240
194
  },
195
  {
196
  "epoch": 0.4,
197
+ "grad_norm": 4.6029839515686035,
198
+ "learning_rate": 2.713525491562421e-06,
199
+ "loss": 0.6924,
 
200
  "step": 250
201
  },
202
+ {
203
+ "epoch": 0.416,
204
+ "grad_norm": 4.828456878662109,
205
+ "learning_rate": 2.690985597971753e-06,
206
+ "loss": 0.6937,
207
+ "step": 260
208
+ },
209
+ {
210
+ "epoch": 0.432,
211
+ "grad_norm": 4.152997970581055,
212
+ "learning_rate": 2.6676934523505355e-06,
213
+ "loss": 0.6664,
214
+ "step": 270
215
+ },
216
+ {
217
+ "epoch": 0.448,
218
+ "grad_norm": 4.614770412445068,
219
+ "learning_rate": 2.643663766517172e-06,
220
+ "loss": 0.6722,
221
+ "step": 280
222
+ },
223
+ {
224
+ "epoch": 0.464,
225
+ "grad_norm": 4.493509292602539,
226
+ "learning_rate": 2.6189117181362736e-06,
227
+ "loss": 0.6689,
228
+ "step": 290
229
+ },
230
  {
231
  "epoch": 0.48,
232
+ "grad_norm": 4.5999274253845215,
233
+ "learning_rate": 2.5934529411321173e-06,
234
+ "loss": 0.7033,
235
  "step": 300
236
  },
237
  {
238
  "epoch": 0.48,
239
+ "eval_loss": 0.6796725988388062,
240
+ "eval_runtime": 9.6974,
241
+ "eval_samples_per_second": 206.241,
242
+ "eval_steps_per_second": 12.89,
243
  "step": 300
244
  },
245
  {
246
+ "epoch": 0.496,
247
+ "grad_norm": 4.2827935218811035,
248
+ "learning_rate": 2.5673035158139285e-06,
249
+ "loss": 0.6596,
250
+ "step": 310
251
+ },
252
+ {
253
+ "epoch": 0.512,
254
+ "grad_norm": 4.7542805671691895,
255
+ "learning_rate": 2.5404799587192076e-06,
256
+ "loss": 0.6549,
257
+ "step": 320
258
+ },
259
+ {
260
+ "epoch": 0.528,
261
+ "grad_norm": 4.7150492668151855,
262
+ "learning_rate": 2.5129992121815365e-06,
263
+ "loss": 0.6757,
264
+ "step": 330
265
+ },
266
+ {
267
+ "epoch": 0.544,
268
+ "grad_norm": 4.777836322784424,
269
+ "learning_rate": 2.484878633629435e-06,
270
+ "loss": 0.6806,
271
+ "step": 340
272
  },
273
  {
274
  "epoch": 0.56,
275
+ "grad_norm": 4.323108196258545,
276
+ "learning_rate": 2.456135984623035e-06,
277
+ "loss": 0.6629,
 
278
  "step": 350
279
  },
280
+ {
281
+ "epoch": 0.576,
282
+ "grad_norm": 4.968432426452637,
283
+ "learning_rate": 2.4267894196355018e-06,
284
+ "loss": 0.6858,
285
+ "step": 360
286
+ },
287
+ {
288
+ "epoch": 0.592,
289
+ "grad_norm": 4.666640758514404,
290
+ "learning_rate": 2.3968574745862785e-06,
291
+ "loss": 0.665,
292
+ "step": 370
293
+ },
294
+ {
295
+ "epoch": 0.608,
296
+ "grad_norm": 4.655545234680176,
297
+ "learning_rate": 2.3663590551334015e-06,
298
+ "loss": 0.6661,
299
+ "step": 380
300
+ },
301
+ {
302
+ "epoch": 0.624,
303
+ "grad_norm": 4.554004669189453,
304
+ "learning_rate": 2.3353134247322823e-06,
305
+ "loss": 0.6559,
306
+ "step": 390
307
+ },
308
  {
309
  "epoch": 0.64,
310
+ "grad_norm": 4.362437725067139,
311
+ "learning_rate": 2.303740192468495e-06,
312
+ "loss": 0.6786,
313
  "step": 400
314
  },
315
  {
316
  "epoch": 0.64,
317
+ "eval_loss": 0.6770616173744202,
318
+ "eval_runtime": 7.3123,
319
+ "eval_samples_per_second": 273.512,
320
+ "eval_steps_per_second": 17.095,
321
  "step": 400
322
  },
323
  {
324
+ "epoch": 0.656,
325
+ "grad_norm": 4.237730503082275,
326
+ "learning_rate": 2.2716593006722595e-06,
327
+ "loss": 0.6456,
328
+ "step": 410
329
+ },
330
+ {
331
+ "epoch": 0.672,
332
+ "grad_norm": 4.062387943267822,
333
+ "learning_rate": 2.2390910123224374e-06,
334
+ "loss": 0.6767,
335
+ "step": 420
336
+ },
337
+ {
338
+ "epoch": 0.688,
339
+ "grad_norm": 4.175485134124756,
340
+ "learning_rate": 2.2060558982479992e-06,
341
+ "loss": 0.6389,
342
+ "step": 430
343
+ },
344
+ {
345
+ "epoch": 0.704,
346
+ "grad_norm": 4.427236080169678,
347
+ "learning_rate": 2.1725748241350487e-06,
348
+ "loss": 0.6403,
349
+ "step": 440
350
  },
351
  {
352
  "epoch": 0.72,
353
+ "grad_norm": 4.3691511154174805,
354
+ "learning_rate": 2.138668937347609e-06,
355
+ "loss": 0.6559,
 
356
  "step": 450
357
  },
358
+ {
359
+ "epoch": 0.736,
360
+ "grad_norm": 4.3593430519104,
361
+ "learning_rate": 2.1043596535704943e-06,
362
+ "loss": 0.6787,
363
+ "step": 460
364
+ },
365
+ {
366
+ "epoch": 0.752,
367
+ "grad_norm": 4.350930213928223,
368
+ "learning_rate": 2.069668643282702e-06,
369
+ "loss": 0.6975,
370
+ "step": 470
371
+ },
372
+ {
373
+ "epoch": 0.768,
374
+ "grad_norm": 4.215636730194092,
375
+ "learning_rate": 2.034617818069876e-06,
376
+ "loss": 0.684,
377
+ "step": 480
378
+ },
379
+ {
380
+ "epoch": 0.784,
381
+ "grad_norm": 3.999462127685547,
382
+ "learning_rate": 1.99922931678448e-06,
383
+ "loss": 0.6607,
384
+ "step": 490
385
+ },
386
  {
387
  "epoch": 0.8,
388
+ "grad_norm": 4.186202526092529,
389
+ "learning_rate": 1.963525491562421e-06,
390
+ "loss": 0.6476,
391
  "step": 500
392
  },
393
  {
394
  "epoch": 0.8,
395
+ "eval_loss": 0.6736403107643127,
396
+ "eval_runtime": 7.3017,
397
+ "eval_samples_per_second": 273.908,
398
+ "eval_steps_per_second": 17.119,
399
  "step": 500
400
  },
401
  {
402
+ "epoch": 0.816,
403
+ "grad_norm": 4.060866355895996,
404
+ "learning_rate": 1.927528893704964e-06,
405
+ "loss": 0.6591,
406
+ "step": 510
407
+ },
408
+ {
409
+ "epoch": 0.832,
410
+ "grad_norm": 4.125776767730713,
411
+ "learning_rate": 1.8912622594348455e-06,
412
+ "loss": 0.6922,
413
+ "step": 520
414
+ },
415
+ {
416
+ "epoch": 0.848,
417
+ "grad_norm": 3.8623437881469727,
418
+ "learning_rate": 1.8547484955355872e-06,
419
+ "loss": 0.6513,
420
+ "step": 530
421
+ },
422
+ {
423
+ "epoch": 0.864,
424
+ "grad_norm": 4.550040245056152,
425
+ "learning_rate": 1.8180106648830824e-06,
426
+ "loss": 0.663,
427
+ "step": 540
428
  },
429
  {
430
  "epoch": 0.88,
431
+ "grad_norm": 4.267393589019775,
432
+ "learning_rate": 1.7810719718785873e-06,
433
+ "loss": 0.6685,
 
434
  "step": 550
435
  },
436
+ {
437
+ "epoch": 0.896,
438
+ "grad_norm": 4.428436756134033,
439
+ "learning_rate": 1.7439557477923257e-06,
440
+ "loss": 0.7051,
441
+ "step": 560
442
+ },
443
+ {
444
+ "epoch": 0.912,
445
+ "grad_norm": 4.695955276489258,
446
+ "learning_rate": 1.706685436026957e-06,
447
+ "loss": 0.6251,
448
+ "step": 570
449
+ },
450
+ {
451
+ "epoch": 0.928,
452
+ "grad_norm": 4.578221797943115,
453
+ "learning_rate": 1.6692845773102223e-06,
454
+ "loss": 0.6569,
455
+ "step": 580
456
+ },
457
+ {
458
+ "epoch": 0.944,
459
+ "grad_norm": 4.15753173828125,
460
+ "learning_rate": 1.6317767948261151e-06,
461
+ "loss": 0.6451,
462
+ "step": 590
463
+ },
464
  {
465
  "epoch": 0.96,
466
+ "grad_norm": 3.946852922439575,
467
+ "learning_rate": 1.5941857792939703e-06,
468
+ "loss": 0.6562,
469
  "step": 600
470
  },
471
  {
472
  "epoch": 0.96,
473
+ "eval_loss": 0.6707749962806702,
474
+ "eval_runtime": 7.4417,
475
+ "eval_samples_per_second": 268.756,
476
+ "eval_steps_per_second": 16.797,
477
  "step": 600
478
  },
479
  {
480
+ "epoch": 0.976,
481
+ "grad_norm": 4.214157581329346,
482
+ "learning_rate": 1.556535274004902e-06,
483
+ "loss": 0.6556,
484
+ "step": 610
485
+ },
486
+ {
487
+ "epoch": 0.992,
488
+ "grad_norm": 4.3779616355896,
489
+ "learning_rate": 1.518849059825029e-06,
490
+ "loss": 0.6764,
491
+ "step": 620
492
+ },
493
+ {
494
+ "epoch": 1.008,
495
+ "grad_norm": 3.7831478118896484,
496
+ "learning_rate": 1.481150940174971e-06,
497
+ "loss": 0.5361,
498
+ "step": 630
499
+ },
500
+ {
501
+ "epoch": 1.024,
502
+ "grad_norm": 4.746358394622803,
503
+ "learning_rate": 1.4434647259950982e-06,
504
+ "loss": 0.4773,
505
+ "step": 640
506
+ },
507
+ {
508
+ "epoch": 1.04,
509
+ "grad_norm": 4.30097770690918,
510
+ "learning_rate": 1.40581422070603e-06,
511
+ "loss": 0.4829,
512
+ "step": 650
513
+ },
514
+ {
515
+ "epoch": 1.056,
516
+ "grad_norm": 4.781858444213867,
517
+ "learning_rate": 1.3682232051738854e-06,
518
+ "loss": 0.4635,
519
+ "step": 660
520
+ },
521
+ {
522
+ "epoch": 1.072,
523
+ "grad_norm": 4.712700366973877,
524
+ "learning_rate": 1.3307154226897775e-06,
525
+ "loss": 0.4775,
526
+ "step": 670
527
+ },
528
+ {
529
+ "epoch": 1.088,
530
+ "grad_norm": 4.640942573547363,
531
+ "learning_rate": 1.293314563973043e-06,
532
+ "loss": 0.4421,
533
+ "step": 680
534
+ },
535
+ {
536
+ "epoch": 1.104,
537
+ "grad_norm": 5.104272365570068,
538
+ "learning_rate": 1.2560442522076746e-06,
539
+ "loss": 0.446,
540
+ "step": 690
541
+ },
542
+ {
543
+ "epoch": 1.12,
544
+ "grad_norm": 5.069291591644287,
545
+ "learning_rate": 1.2189280281214128e-06,
546
+ "loss": 0.461,
547
+ "step": 700
548
+ },
549
+ {
550
+ "epoch": 1.12,
551
+ "eval_loss": 0.7041329145431519,
552
+ "eval_runtime": 7.9231,
553
+ "eval_samples_per_second": 252.428,
554
+ "eval_steps_per_second": 15.777,
555
+ "step": 700
556
+ },
557
+ {
558
+ "epoch": 1.1360000000000001,
559
+ "grad_norm": 5.07060432434082,
560
+ "learning_rate": 1.1819893351169183e-06,
561
+ "loss": 0.4614,
562
+ "step": 710
563
+ },
564
+ {
565
+ "epoch": 1.152,
566
+ "grad_norm": 6.156511306762695,
567
+ "learning_rate": 1.1452515044644133e-06,
568
+ "loss": 0.4785,
569
+ "step": 720
570
+ },
571
+ {
572
+ "epoch": 1.168,
573
+ "grad_norm": 5.145558834075928,
574
+ "learning_rate": 1.108737740565155e-06,
575
+ "loss": 0.4463,
576
+ "step": 730
577
+ },
578
+ {
579
+ "epoch": 1.184,
580
+ "grad_norm": 4.276706218719482,
581
+ "learning_rate": 1.0724711062950359e-06,
582
+ "loss": 0.4595,
583
+ "step": 740
584
+ },
585
+ {
586
+ "epoch": 1.2,
587
+ "grad_norm": 5.321038722991943,
588
+ "learning_rate": 1.036474508437579e-06,
589
+ "loss": 0.4687,
590
+ "step": 750
591
+ },
592
+ {
593
+ "epoch": 1.216,
594
+ "grad_norm": 4.610326766967773,
595
+ "learning_rate": 1.0007706832155202e-06,
596
+ "loss": 0.4293,
597
+ "step": 760
598
+ },
599
+ {
600
+ "epoch": 1.232,
601
+ "grad_norm": 5.0618462562561035,
602
+ "learning_rate": 9.65382181930124e-07,
603
+ "loss": 0.4636,
604
+ "step": 770
605
+ },
606
+ {
607
+ "epoch": 1.248,
608
+ "grad_norm": 4.6133036613464355,
609
+ "learning_rate": 9.303313567172986e-07,
610
+ "loss": 0.4578,
611
+ "step": 780
612
+ },
613
+ {
614
+ "epoch": 1.264,
615
+ "grad_norm": 4.833690166473389,
616
+ "learning_rate": 8.956403464295061e-07,
617
+ "loss": 0.4511,
618
+ "step": 790
619
+ },
620
+ {
621
+ "epoch": 1.28,
622
+ "grad_norm": 4.997283935546875,
623
+ "learning_rate": 8.613310626523911e-07,
624
+ "loss": 0.4578,
625
+ "step": 800
626
+ },
627
+ {
628
+ "epoch": 1.28,
629
+ "eval_loss": 0.7093445062637329,
630
+ "eval_runtime": 9.5227,
631
+ "eval_samples_per_second": 210.025,
632
+ "eval_steps_per_second": 13.127,
633
+ "step": 800
634
+ },
635
+ {
636
+ "epoch": 1.296,
637
+ "grad_norm": 5.087992191314697,
638
+ "learning_rate": 8.274251758649519e-07,
639
+ "loss": 0.4858,
640
+ "step": 810
641
+ },
642
+ {
643
+ "epoch": 1.312,
644
+ "grad_norm": 5.305168628692627,
645
+ "learning_rate": 7.939441017520012e-07,
646
+ "loss": 0.4756,
647
+ "step": 820
648
+ },
649
+ {
650
+ "epoch": 1.328,
651
+ "grad_norm": 4.691494464874268,
652
+ "learning_rate": 7.609089876775628e-07,
653
+ "loss": 0.4593,
654
+ "step": 830
655
+ },
656
+ {
657
+ "epoch": 1.3439999999999999,
658
+ "grad_norm": 6.369288921356201,
659
+ "learning_rate": 7.283406993277403e-07,
660
+ "loss": 0.4461,
661
+ "step": 840
662
+ },
663
+ {
664
+ "epoch": 1.3599999999999999,
665
+ "grad_norm": 4.588751792907715,
666
+ "learning_rate": 6.962598075315047e-07,
667
+ "loss": 0.4569,
668
+ "step": 850
669
+ },
670
+ {
671
+ "epoch": 1.376,
672
+ "grad_norm": 4.815459728240967,
673
+ "learning_rate": 6.646865752677186e-07,
674
+ "loss": 0.4209,
675
+ "step": 860
676
+ },
677
+ {
678
+ "epoch": 1.392,
679
+ "grad_norm": 4.491628170013428,
680
+ "learning_rate": 6.336409448665989e-07,
681
+ "loss": 0.4768,
682
+ "step": 870
683
+ },
684
+ {
685
+ "epoch": 1.408,
686
+ "grad_norm": 5.085056304931641,
687
+ "learning_rate": 6.031425254137223e-07,
688
+ "loss": 0.4539,
689
+ "step": 880
690
+ },
691
+ {
692
+ "epoch": 1.424,
693
+ "grad_norm": 4.508426189422607,
694
+ "learning_rate": 5.732105803644987e-07,
695
+ "loss": 0.4588,
696
+ "step": 890
697
+ },
698
+ {
699
+ "epoch": 1.44,
700
+ "grad_norm": 4.900942325592041,
701
+ "learning_rate": 5.438640153769653e-07,
702
+ "loss": 0.4817,
703
+ "step": 900
704
+ },
705
+ {
706
+ "epoch": 1.44,
707
+ "eval_loss": 0.7055138945579529,
708
+ "eval_runtime": 9.5816,
709
+ "eval_samples_per_second": 208.733,
710
+ "eval_steps_per_second": 13.046,
711
+ "step": 900
712
+ },
713
+ {
714
+ "epoch": 1.456,
715
+ "grad_norm": 5.169991970062256,
716
+ "learning_rate": 5.151213663705655e-07,
717
+ "loss": 0.4608,
718
+ "step": 910
719
+ },
720
+ {
721
+ "epoch": 1.472,
722
+ "grad_norm": 5.345458030700684,
723
+ "learning_rate": 4.870007878184633e-07,
724
+ "loss": 0.4687,
725
+ "step": 920
726
+ },
727
+ {
728
+ "epoch": 1.488,
729
+ "grad_norm": 5.129513263702393,
730
+ "learning_rate": 4.5952004128079276e-07,
731
+ "loss": 0.4677,
732
+ "step": 930
733
+ },
734
+ {
735
+ "epoch": 1.504,
736
+ "grad_norm": 4.392848491668701,
737
+ "learning_rate": 4.3269648418607197e-07,
738
+ "loss": 0.4612,
739
+ "step": 940
740
+ },
741
+ {
742
+ "epoch": 1.52,
743
+ "grad_norm": 4.760442733764648,
744
+ "learning_rate": 4.06547058867883e-07,
745
+ "loss": 0.4864,
746
+ "step": 950
747
+ },
748
+ {
749
+ "epoch": 1.536,
750
+ "grad_norm": 5.239096164703369,
751
+ "learning_rate": 3.8108828186372685e-07,
752
+ "loss": 0.4576,
753
+ "step": 960
754
+ },
755
+ {
756
+ "epoch": 1.552,
757
+ "grad_norm": 4.897254467010498,
758
+ "learning_rate": 3.56336233482828e-07,
759
+ "loss": 0.4518,
760
+ "step": 970
761
+ },
762
+ {
763
+ "epoch": 1.568,
764
+ "grad_norm": 5.0241804122924805,
765
+ "learning_rate": 3.32306547649465e-07,
766
+ "loss": 0.454,
767
+ "step": 980
768
+ },
769
+ {
770
+ "epoch": 1.584,
771
+ "grad_norm": 5.055388450622559,
772
+ "learning_rate": 3.0901440202824693e-07,
773
+ "loss": 0.4808,
774
+ "step": 990
775
+ },
776
+ {
777
+ "epoch": 1.6,
778
+ "grad_norm": 4.961503982543945,
779
+ "learning_rate": 2.86474508437579e-07,
780
+ "loss": 0.4324,
781
+ "step": 1000
782
+ },
783
+ {
784
+ "epoch": 1.6,
785
+ "eval_loss": 0.7080456614494324,
786
+ "eval_runtime": 7.9696,
787
+ "eval_samples_per_second": 250.955,
788
+ "eval_steps_per_second": 15.685,
789
+ "step": 1000
790
+ },
791
+ {
792
+ "epoch": 1.616,
793
+ "grad_norm": 4.625386714935303,
794
+ "learning_rate": 2.647011035573588e-07,
795
+ "loss": 0.4707,
796
+ "step": 1010
797
+ },
798
+ {
799
+ "epoch": 1.6320000000000001,
800
+ "grad_norm": 5.237055778503418,
801
+ "learning_rate": 2.437079399367875e-07,
802
+ "loss": 0.4575,
803
+ "step": 1020
804
+ },
805
+ {
806
+ "epoch": 1.6480000000000001,
807
+ "grad_norm": 5.412177562713623,
808
+ "learning_rate": 2.235082773079624e-07,
809
+ "loss": 0.4786,
810
+ "step": 1030
811
+ },
812
+ {
813
+ "epoch": 1.6640000000000001,
814
+ "grad_norm": 5.122972011566162,
815
+ "learning_rate": 2.0411487421074708e-07,
816
+ "loss": 0.4961,
817
+ "step": 1040
818
+ },
819
+ {
820
+ "epoch": 1.6800000000000002,
821
+ "grad_norm": 4.920769214630127,
822
+ "learning_rate": 1.8553997993420495e-07,
823
+ "loss": 0.4494,
824
+ "step": 1050
825
+ },
826
+ {
827
+ "epoch": 1.696,
828
+ "grad_norm": 4.990310192108154,
829
+ "learning_rate": 1.6779532677968329e-07,
830
+ "loss": 0.4713,
831
+ "step": 1060
832
+ },
833
+ {
834
+ "epoch": 1.712,
835
+ "grad_norm": 4.39508581161499,
836
+ "learning_rate": 1.508921226504434e-07,
837
+ "loss": 0.4633,
838
+ "step": 1070
839
+ },
840
+ {
841
+ "epoch": 1.728,
842
+ "grad_norm": 5.249621391296387,
843
+ "learning_rate": 1.348410439725065e-07,
844
+ "loss": 0.4764,
845
+ "step": 1080
846
+ },
847
+ {
848
+ "epoch": 1.744,
849
+ "grad_norm": 5.508373260498047,
850
+ "learning_rate": 1.1965222895119444e-07,
851
+ "loss": 0.4793,
852
+ "step": 1090
853
+ },
854
+ {
855
+ "epoch": 1.76,
856
+ "grad_norm": 5.088216781616211,
857
+ "learning_rate": 1.0533527116762298e-07,
858
+ "loss": 0.4693,
859
+ "step": 1100
860
+ },
861
+ {
862
+ "epoch": 1.76,
863
+ "eval_loss": 0.7081010937690735,
864
+ "eval_runtime": 7.6853,
865
+ "eval_samples_per_second": 260.237,
866
+ "eval_steps_per_second": 16.265,
867
+ "step": 1100
868
+ },
869
+ {
870
+ "epoch": 1.776,
871
+ "grad_norm": 5.710309028625488,
872
+ "learning_rate": 9.18992135191889e-08,
873
+ "loss": 0.4405,
874
+ "step": 1110
875
+ },
876
+ {
877
+ "epoch": 1.792,
878
+ "grad_norm": 4.6689982414245605,
879
+ "learning_rate": 7.935254250788366e-08,
880
+ "loss": 0.4484,
881
+ "step": 1120
882
+ },
883
+ {
884
+ "epoch": 1.808,
885
+ "grad_norm": 4.682315349578857,
886
+ "learning_rate": 6.770318288003558e-08,
887
+ "loss": 0.4458,
888
+ "step": 1130
889
+ },
890
+ {
891
+ "epoch": 1.8239999999999998,
892
+ "grad_norm": 5.43618106842041,
893
+ "learning_rate": 5.6958492620871105e-08,
894
+ "loss": 0.4914,
895
+ "step": 1140
896
+ },
897
+ {
898
+ "epoch": 1.8399999999999999,
899
+ "grad_norm": 4.98464822769165,
900
+ "learning_rate": 4.712525830705339e-08,
901
+ "loss": 0.4567,
902
+ "step": 1150
903
+ },
904
+ {
905
+ "epoch": 1.8559999999999999,
906
+ "grad_norm": 4.657787799835205,
907
+ "learning_rate": 3.820969082013415e-08,
908
+ "loss": 0.452,
909
+ "step": 1160
910
+ },
911
+ {
912
+ "epoch": 1.8719999999999999,
913
+ "grad_norm": 4.8955559730529785,
914
+ "learning_rate": 3.021742142362971e-08,
915
+ "loss": 0.463,
916
+ "step": 1170
917
+ },
918
+ {
919
+ "epoch": 1.888,
920
+ "grad_norm": 5.363377094268799,
921
+ "learning_rate": 2.3153498206192002e-08,
922
+ "loss": 0.4489,
923
+ "step": 1180
924
+ },
925
+ {
926
+ "epoch": 1.904,
927
+ "grad_norm": 4.830941677093506,
928
+ "learning_rate": 1.7022382893129074e-08,
929
+ "loss": 0.4594,
930
+ "step": 1190
931
+ },
932
+ {
933
+ "epoch": 1.92,
934
+ "grad_norm": 5.2024827003479,
935
+ "learning_rate": 1.1827948028283353e-08,
936
+ "loss": 0.4475,
937
+ "step": 1200
938
+ },
939
+ {
940
+ "epoch": 1.92,
941
+ "eval_loss": 0.707037091255188,
942
+ "eval_runtime": 7.654,
943
+ "eval_samples_per_second": 261.3,
944
+ "eval_steps_per_second": 16.331,
945
+ "step": 1200
946
+ },
947
+ {
948
+ "epoch": 1.936,
949
+ "grad_norm": 4.656322956085205,
950
+ "learning_rate": 7.57347452804974e-09,
951
+ "loss": 0.4583,
952
+ "step": 1210
953
+ },
954
+ {
955
+ "epoch": 1.952,
956
+ "grad_norm": 5.396835803985596,
957
+ "learning_rate": 4.261649609079099e-09,
958
+ "loss": 0.4136,
959
+ "step": 1220
960
+ },
961
+ {
962
+ "epoch": 1.968,
963
+ "grad_norm": 4.812420845031738,
964
+ "learning_rate": 1.8945650909737986e-09,
965
+ "loss": 0.4706,
966
+ "step": 1230
967
+ },
968
+ {
969
+ "epoch": 1.984,
970
+ "grad_norm": 4.871044635772705,
971
+ "learning_rate": 4.737160750500902e-10,
972
+ "loss": 0.4637,
973
+ "step": 1240
974
+ },
975
+ {
976
+ "epoch": 2.0,
977
+ "grad_norm": 4.772403240203857,
978
+ "learning_rate": 0.0,
979
+ "loss": 0.4707,
980
+ "step": 1250
981
+ },
982
+ {
983
+ "epoch": 2.0,
984
+ "step": 1250,
985
+ "total_flos": 4.566411071245517e+16,
986
+ "train_loss": 0.5696872653961181,
987
+ "train_runtime": 1519.873,
988
+ "train_samples_per_second": 13.159,
989
+ "train_steps_per_second": 0.822
990
  }
991
  ],
992
+ "logging_steps": 10,
993
+ "max_steps": 1250,
994
  "num_input_tokens_seen": 0,
995
+ "num_train_epochs": 2,
996
  "save_steps": 500,
997
  "stateful_callbacks": {
998
  "TrainerControl": {
 
1006
  "attributes": {}
1007
  }
1008
  },
1009
+ "total_flos": 4.566411071245517e+16,
1010
  "train_batch_size": 2,
1011
  "trial_name": null,
1012
  "trial_params": null