mtzig commited on
Commit
e5389fd
·
verified ·
1 Parent(s): 65a2282

Training in progress, step 100, checkpoint

Browse files
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ last-checkpoint/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
37
+ last-checkpoint/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ last-checkpoint/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ last-checkpoint/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
42
+ last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
last-checkpoint/optimizer_0/.metadata ADDED
Binary file (369 kB). View file
 
last-checkpoint/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b25a1e2c53b75599076cb6d6fd1857506ef4b6ac7784425822df2ef48781558
3
+ size 13934748
last-checkpoint/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:506afc9b93ebc7f63cc1b3b5708b8defde0806cf9607b26885c46ce2009d72f8
3
+ size 13999412
last-checkpoint/optimizer_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b345a56b70a32d18af648554ef1104bd9d0a34f1d1e4e1faa790b9e0e647fc5f
3
+ size 13990904
last-checkpoint/optimizer_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:338d5c145f9b668e0efb195681abd213c08975bf33dafe4116ec17bf2dbb4db5
3
+ size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata ADDED
Binary file (135 kB). View file
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb1a3f0e62cda38d84605dcd5372725de3379507ae887967f2443005c3792748
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58c8c061e035966d524a8ae26d80b7b01b0719017ad2832d13f060b90a01dd3b
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5b842bf48600db62730c2f959a7922d76f8299355557886bb65da3ef624fbb7
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e570b00b72533f7c7d7931f91793dad7441b0b77e61a0e6d15f86d1448f0f5c1
3
+ size 6966784
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e4ead7a9090a756b7fd44dcd84a0128fc3e073a0556a840016ee79c554e0b80
3
+ size 14960
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45746def86d7a8510cabcc16531091de91eaf8f9bdd39d725096005db8ee2a1a
3
+ size 14960
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6f6f333666bf8e00dd613a01077ed8920391e394339b3ae8687718cf5f788c2
3
+ size 14960
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:111993f0c7702ea7b86533de3410a44aff0126390ec01a74930984eb2b182a72
3
+ size 14960
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a104c065133d085f18edd3e5b4057dbd861eb3e31968053f10edac0d68e4236e
3
+ size 1064
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,805 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.28735632183908044,
5
+ "eval_steps": 20,
6
+ "global_step": 100,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0,
13
+ "eval_accuracy": 0.7376237623762376,
14
+ "eval_f1": 0.11666666666666667,
15
+ "eval_loss": 0.595770537853241,
16
+ "eval_precision": 0.5,
17
+ "eval_recall": 0.0660377358490566,
18
+ "eval_runtime": 16.3774,
19
+ "eval_samples_per_second": 6.472,
20
+ "eval_steps_per_second": 0.244,
21
+ "step": 0
22
+ },
23
+ {
24
+ "epoch": 0.0028735632183908046,
25
+ "grad_norm": 2.3969345092773438,
26
+ "learning_rate": 5.714285714285715e-07,
27
+ "loss": 0.7755,
28
+ "step": 1
29
+ },
30
+ {
31
+ "epoch": 0.005747126436781609,
32
+ "grad_norm": 2.4285762310028076,
33
+ "learning_rate": 1.142857142857143e-06,
34
+ "loss": 0.7738,
35
+ "step": 2
36
+ },
37
+ {
38
+ "epoch": 0.008620689655172414,
39
+ "grad_norm": 2.5854382514953613,
40
+ "learning_rate": 1.7142857142857145e-06,
41
+ "loss": 0.7642,
42
+ "step": 3
43
+ },
44
+ {
45
+ "epoch": 0.011494252873563218,
46
+ "grad_norm": 2.7728841304779053,
47
+ "learning_rate": 2.285714285714286e-06,
48
+ "loss": 0.7878,
49
+ "step": 4
50
+ },
51
+ {
52
+ "epoch": 0.014367816091954023,
53
+ "grad_norm": 2.6096584796905518,
54
+ "learning_rate": 2.8571428571428573e-06,
55
+ "loss": 0.7831,
56
+ "step": 5
57
+ },
58
+ {
59
+ "epoch": 0.017241379310344827,
60
+ "grad_norm": 2.421935558319092,
61
+ "learning_rate": 3.428571428571429e-06,
62
+ "loss": 0.7508,
63
+ "step": 6
64
+ },
65
+ {
66
+ "epoch": 0.020114942528735632,
67
+ "grad_norm": 2.579468250274658,
68
+ "learning_rate": 4.000000000000001e-06,
69
+ "loss": 0.774,
70
+ "step": 7
71
+ },
72
+ {
73
+ "epoch": 0.022988505747126436,
74
+ "grad_norm": 2.328829288482666,
75
+ "learning_rate": 4.571428571428572e-06,
76
+ "loss": 0.7371,
77
+ "step": 8
78
+ },
79
+ {
80
+ "epoch": 0.02586206896551724,
81
+ "grad_norm": 2.5165493488311768,
82
+ "learning_rate": 5.142857142857142e-06,
83
+ "loss": 0.7402,
84
+ "step": 9
85
+ },
86
+ {
87
+ "epoch": 0.028735632183908046,
88
+ "grad_norm": 2.5865883827209473,
89
+ "learning_rate": 5.7142857142857145e-06,
90
+ "loss": 0.7545,
91
+ "step": 10
92
+ },
93
+ {
94
+ "epoch": 0.031609195402298854,
95
+ "grad_norm": 2.6125497817993164,
96
+ "learning_rate": 6.285714285714286e-06,
97
+ "loss": 0.7422,
98
+ "step": 11
99
+ },
100
+ {
101
+ "epoch": 0.034482758620689655,
102
+ "grad_norm": 2.3310539722442627,
103
+ "learning_rate": 6.857142857142858e-06,
104
+ "loss": 0.7349,
105
+ "step": 12
106
+ },
107
+ {
108
+ "epoch": 0.03735632183908046,
109
+ "grad_norm": 2.4077725410461426,
110
+ "learning_rate": 7.428571428571429e-06,
111
+ "loss": 0.7256,
112
+ "step": 13
113
+ },
114
+ {
115
+ "epoch": 0.040229885057471264,
116
+ "grad_norm": 2.2570087909698486,
117
+ "learning_rate": 8.000000000000001e-06,
118
+ "loss": 0.7078,
119
+ "step": 14
120
+ },
121
+ {
122
+ "epoch": 0.04310344827586207,
123
+ "grad_norm": 2.242189407348633,
124
+ "learning_rate": 8.571428571428571e-06,
125
+ "loss": 0.686,
126
+ "step": 15
127
+ },
128
+ {
129
+ "epoch": 0.04597701149425287,
130
+ "grad_norm": 2.1012094020843506,
131
+ "learning_rate": 9.142857142857144e-06,
132
+ "loss": 0.7336,
133
+ "step": 16
134
+ },
135
+ {
136
+ "epoch": 0.04885057471264368,
137
+ "grad_norm": 1.9637709856033325,
138
+ "learning_rate": 9.714285714285715e-06,
139
+ "loss": 0.7025,
140
+ "step": 17
141
+ },
142
+ {
143
+ "epoch": 0.05172413793103448,
144
+ "grad_norm": 2.24760365486145,
145
+ "learning_rate": 1.0285714285714285e-05,
146
+ "loss": 0.6973,
147
+ "step": 18
148
+ },
149
+ {
150
+ "epoch": 0.05459770114942529,
151
+ "grad_norm": 1.9541856050491333,
152
+ "learning_rate": 1.0857142857142858e-05,
153
+ "loss": 0.6621,
154
+ "step": 19
155
+ },
156
+ {
157
+ "epoch": 0.05747126436781609,
158
+ "grad_norm": 1.9117401838302612,
159
+ "learning_rate": 1.1428571428571429e-05,
160
+ "loss": 0.6808,
161
+ "step": 20
162
+ },
163
+ {
164
+ "epoch": 0.05747126436781609,
165
+ "eval_accuracy": 0.7400990099009901,
166
+ "eval_f1": 0.11764705882352941,
167
+ "eval_loss": 0.57041996717453,
168
+ "eval_precision": 0.5384615384615384,
169
+ "eval_recall": 0.0660377358490566,
170
+ "eval_runtime": 16.3739,
171
+ "eval_samples_per_second": 6.474,
172
+ "eval_steps_per_second": 0.244,
173
+ "step": 20
174
+ },
175
+ {
176
+ "epoch": 0.0603448275862069,
177
+ "grad_norm": 2.0838022232055664,
178
+ "learning_rate": 1.2e-05,
179
+ "loss": 0.6775,
180
+ "step": 21
181
+ },
182
+ {
183
+ "epoch": 0.06321839080459771,
184
+ "grad_norm": 1.8101274967193604,
185
+ "learning_rate": 1.2571428571428572e-05,
186
+ "loss": 0.6722,
187
+ "step": 22
188
+ },
189
+ {
190
+ "epoch": 0.06609195402298851,
191
+ "grad_norm": 1.6239138841629028,
192
+ "learning_rate": 1.3142857142857145e-05,
193
+ "loss": 0.6205,
194
+ "step": 23
195
+ },
196
+ {
197
+ "epoch": 0.06896551724137931,
198
+ "grad_norm": 1.581398606300354,
199
+ "learning_rate": 1.3714285714285716e-05,
200
+ "loss": 0.6114,
201
+ "step": 24
202
+ },
203
+ {
204
+ "epoch": 0.07183908045977011,
205
+ "grad_norm": 1.7053773403167725,
206
+ "learning_rate": 1.4285714285714287e-05,
207
+ "loss": 0.6215,
208
+ "step": 25
209
+ },
210
+ {
211
+ "epoch": 0.07471264367816093,
212
+ "grad_norm": 1.6182948350906372,
213
+ "learning_rate": 1.4857142857142858e-05,
214
+ "loss": 0.6062,
215
+ "step": 26
216
+ },
217
+ {
218
+ "epoch": 0.07758620689655173,
219
+ "grad_norm": 1.4925391674041748,
220
+ "learning_rate": 1.542857142857143e-05,
221
+ "loss": 0.5855,
222
+ "step": 27
223
+ },
224
+ {
225
+ "epoch": 0.08045977011494253,
226
+ "grad_norm": 1.4214599132537842,
227
+ "learning_rate": 1.6000000000000003e-05,
228
+ "loss": 0.5605,
229
+ "step": 28
230
+ },
231
+ {
232
+ "epoch": 0.08333333333333333,
233
+ "grad_norm": 1.5156348943710327,
234
+ "learning_rate": 1.6571428571428574e-05,
235
+ "loss": 0.5704,
236
+ "step": 29
237
+ },
238
+ {
239
+ "epoch": 0.08620689655172414,
240
+ "grad_norm": 1.218806266784668,
241
+ "learning_rate": 1.7142857142857142e-05,
242
+ "loss": 0.544,
243
+ "step": 30
244
+ },
245
+ {
246
+ "epoch": 0.08908045977011494,
247
+ "grad_norm": 1.1243335008621216,
248
+ "learning_rate": 1.7714285714285717e-05,
249
+ "loss": 0.5393,
250
+ "step": 31
251
+ },
252
+ {
253
+ "epoch": 0.09195402298850575,
254
+ "grad_norm": 1.1667400598526,
255
+ "learning_rate": 1.8285714285714288e-05,
256
+ "loss": 0.5428,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 0.09482758620689655,
261
+ "grad_norm": 1.4552701711654663,
262
+ "learning_rate": 1.885714285714286e-05,
263
+ "loss": 0.5218,
264
+ "step": 33
265
+ },
266
+ {
267
+ "epoch": 0.09770114942528736,
268
+ "grad_norm": 1.2645764350891113,
269
+ "learning_rate": 1.942857142857143e-05,
270
+ "loss": 0.5398,
271
+ "step": 34
272
+ },
273
+ {
274
+ "epoch": 0.10057471264367816,
275
+ "grad_norm": 1.2053171396255493,
276
+ "learning_rate": 2e-05,
277
+ "loss": 0.5286,
278
+ "step": 35
279
+ },
280
+ {
281
+ "epoch": 0.10344827586206896,
282
+ "grad_norm": 1.3919466733932495,
283
+ "learning_rate": 1.9999496293646753e-05,
284
+ "loss": 0.5388,
285
+ "step": 36
286
+ },
287
+ {
288
+ "epoch": 0.10632183908045977,
289
+ "grad_norm": 1.0895131826400757,
290
+ "learning_rate": 1.999798522533102e-05,
291
+ "loss": 0.5138,
292
+ "step": 37
293
+ },
294
+ {
295
+ "epoch": 0.10919540229885058,
296
+ "grad_norm": 1.1765999794006348,
297
+ "learning_rate": 1.9995466947279753e-05,
298
+ "loss": 0.4781,
299
+ "step": 38
300
+ },
301
+ {
302
+ "epoch": 0.11206896551724138,
303
+ "grad_norm": 1.329185128211975,
304
+ "learning_rate": 1.9991941713187477e-05,
305
+ "loss": 0.4821,
306
+ "step": 39
307
+ },
308
+ {
309
+ "epoch": 0.11494252873563218,
310
+ "grad_norm": 1.8006553649902344,
311
+ "learning_rate": 1.9987409878190752e-05,
312
+ "loss": 0.4764,
313
+ "step": 40
314
+ },
315
+ {
316
+ "epoch": 0.11494252873563218,
317
+ "eval_accuracy": 0.7574257425742574,
318
+ "eval_f1": 0.5882352941176471,
319
+ "eval_loss": 0.4768294095993042,
320
+ "eval_precision": 0.5303030303030303,
321
+ "eval_recall": 0.660377358490566,
322
+ "eval_runtime": 17.5901,
323
+ "eval_samples_per_second": 6.026,
324
+ "eval_steps_per_second": 0.227,
325
+ "step": 40
326
+ },
327
+ {
328
+ "epoch": 0.11781609195402298,
329
+ "grad_norm": 1.338883876800537,
330
+ "learning_rate": 1.99818718988324e-05,
331
+ "loss": 0.474,
332
+ "step": 41
333
+ },
334
+ {
335
+ "epoch": 0.1206896551724138,
336
+ "grad_norm": 1.211243987083435,
337
+ "learning_rate": 1.9975328333015497e-05,
338
+ "loss": 0.4644,
339
+ "step": 42
340
+ },
341
+ {
342
+ "epoch": 0.1235632183908046,
343
+ "grad_norm": 1.1209361553192139,
344
+ "learning_rate": 1.9967779839947172e-05,
345
+ "loss": 0.4748,
346
+ "step": 43
347
+ },
348
+ {
349
+ "epoch": 0.12643678160919541,
350
+ "grad_norm": 1.2608532905578613,
351
+ "learning_rate": 1.9959227180072216e-05,
352
+ "loss": 0.4162,
353
+ "step": 44
354
+ },
355
+ {
356
+ "epoch": 0.12931034482758622,
357
+ "grad_norm": 1.2170352935791016,
358
+ "learning_rate": 1.9949671214996448e-05,
359
+ "loss": 0.4661,
360
+ "step": 45
361
+ },
362
+ {
363
+ "epoch": 0.13218390804597702,
364
+ "grad_norm": 1.0206319093704224,
365
+ "learning_rate": 1.993911290739993e-05,
366
+ "loss": 0.4412,
367
+ "step": 46
368
+ },
369
+ {
370
+ "epoch": 0.13505747126436782,
371
+ "grad_norm": 1.0381529331207275,
372
+ "learning_rate": 1.992755332093999e-05,
373
+ "loss": 0.3974,
374
+ "step": 47
375
+ },
376
+ {
377
+ "epoch": 0.13793103448275862,
378
+ "grad_norm": 1.6168910264968872,
379
+ "learning_rate": 1.9914993620144055e-05,
380
+ "loss": 0.4304,
381
+ "step": 48
382
+ },
383
+ {
384
+ "epoch": 0.14080459770114942,
385
+ "grad_norm": 1.1238210201263428,
386
+ "learning_rate": 1.990143507029234e-05,
387
+ "loss": 0.4392,
388
+ "step": 49
389
+ },
390
+ {
391
+ "epoch": 0.14367816091954022,
392
+ "grad_norm": 1.129978895187378,
393
+ "learning_rate": 1.9886879037290385e-05,
394
+ "loss": 0.4034,
395
+ "step": 50
396
+ },
397
+ {
398
+ "epoch": 0.14655172413793102,
399
+ "grad_norm": 1.1116799116134644,
400
+ "learning_rate": 1.9871326987531453e-05,
401
+ "loss": 0.3649,
402
+ "step": 51
403
+ },
404
+ {
405
+ "epoch": 0.14942528735632185,
406
+ "grad_norm": 1.2544704675674438,
407
+ "learning_rate": 1.98547804877488e-05,
408
+ "loss": 0.3968,
409
+ "step": 52
410
+ },
411
+ {
412
+ "epoch": 0.15229885057471265,
413
+ "grad_norm": 1.2192368507385254,
414
+ "learning_rate": 1.983724120485783e-05,
415
+ "loss": 0.4372,
416
+ "step": 53
417
+ },
418
+ {
419
+ "epoch": 0.15517241379310345,
420
+ "grad_norm": 1.0894333124160767,
421
+ "learning_rate": 1.9818710905788195e-05,
422
+ "loss": 0.399,
423
+ "step": 54
424
+ },
425
+ {
426
+ "epoch": 0.15804597701149425,
427
+ "grad_norm": 1.7048630714416504,
428
+ "learning_rate": 1.9799191457305767e-05,
429
+ "loss": 0.4092,
430
+ "step": 55
431
+ },
432
+ {
433
+ "epoch": 0.16091954022988506,
434
+ "grad_norm": 1.6605204343795776,
435
+ "learning_rate": 1.977868482582459e-05,
436
+ "loss": 0.3609,
437
+ "step": 56
438
+ },
439
+ {
440
+ "epoch": 0.16379310344827586,
441
+ "grad_norm": 1.3857808113098145,
442
+ "learning_rate": 1.9757193077208776e-05,
443
+ "loss": 0.4289,
444
+ "step": 57
445
+ },
446
+ {
447
+ "epoch": 0.16666666666666666,
448
+ "grad_norm": 1.4144093990325928,
449
+ "learning_rate": 1.9734718376564386e-05,
450
+ "loss": 0.3706,
451
+ "step": 58
452
+ },
453
+ {
454
+ "epoch": 0.16954022988505746,
455
+ "grad_norm": 1.634936809539795,
456
+ "learning_rate": 1.9711262988021322e-05,
457
+ "loss": 0.4022,
458
+ "step": 59
459
+ },
460
+ {
461
+ "epoch": 0.1724137931034483,
462
+ "grad_norm": 1.4971150159835815,
463
+ "learning_rate": 1.968682927450523e-05,
464
+ "loss": 0.4099,
465
+ "step": 60
466
+ },
467
+ {
468
+ "epoch": 0.1724137931034483,
469
+ "eval_accuracy": 0.801980198019802,
470
+ "eval_f1": 0.6153846153846154,
471
+ "eval_loss": 0.4052402079105377,
472
+ "eval_precision": 0.6274509803921569,
473
+ "eval_recall": 0.6037735849056604,
474
+ "eval_runtime": 16.8779,
475
+ "eval_samples_per_second": 6.28,
476
+ "eval_steps_per_second": 0.237,
477
+ "step": 60
478
+ },
479
+ {
480
+ "epoch": 0.1752873563218391,
481
+ "grad_norm": 1.365148901939392,
482
+ "learning_rate": 1.9661419697499455e-05,
483
+ "loss": 0.3956,
484
+ "step": 61
485
+ },
486
+ {
487
+ "epoch": 0.1781609195402299,
488
+ "grad_norm": 1.4328665733337402,
489
+ "learning_rate": 1.9635036816797072e-05,
490
+ "loss": 0.4113,
491
+ "step": 62
492
+ },
493
+ {
494
+ "epoch": 0.1810344827586207,
495
+ "grad_norm": 1.5210320949554443,
496
+ "learning_rate": 1.960768329024301e-05,
497
+ "loss": 0.3848,
498
+ "step": 63
499
+ },
500
+ {
501
+ "epoch": 0.1839080459770115,
502
+ "grad_norm": 1.6699368953704834,
503
+ "learning_rate": 1.957936187346628e-05,
504
+ "loss": 0.3515,
505
+ "step": 64
506
+ },
507
+ {
508
+ "epoch": 0.1867816091954023,
509
+ "grad_norm": 1.5539981126785278,
510
+ "learning_rate": 1.955007541960241e-05,
511
+ "loss": 0.416,
512
+ "step": 65
513
+ },
514
+ {
515
+ "epoch": 0.1896551724137931,
516
+ "grad_norm": 1.75135338306427,
517
+ "learning_rate": 1.9519826879005964e-05,
518
+ "loss": 0.4134,
519
+ "step": 66
520
+ },
521
+ {
522
+ "epoch": 0.1925287356321839,
523
+ "grad_norm": 2.172255516052246,
524
+ "learning_rate": 1.948861929895336e-05,
525
+ "loss": 0.4107,
526
+ "step": 67
527
+ },
528
+ {
529
+ "epoch": 0.19540229885057472,
530
+ "grad_norm": 1.7174351215362549,
531
+ "learning_rate": 1.945645582333587e-05,
532
+ "loss": 0.3827,
533
+ "step": 68
534
+ },
535
+ {
536
+ "epoch": 0.19827586206896552,
537
+ "grad_norm": 2.1698808670043945,
538
+ "learning_rate": 1.9423339692342885e-05,
539
+ "loss": 0.3816,
540
+ "step": 69
541
+ },
542
+ {
543
+ "epoch": 0.20114942528735633,
544
+ "grad_norm": 1.6693840026855469,
545
+ "learning_rate": 1.9389274242135528e-05,
546
+ "loss": 0.3552,
547
+ "step": 70
548
+ },
549
+ {
550
+ "epoch": 0.20402298850574713,
551
+ "grad_norm": 2.666456937789917,
552
+ "learning_rate": 1.9354262904510544e-05,
553
+ "loss": 0.4101,
554
+ "step": 71
555
+ },
556
+ {
557
+ "epoch": 0.20689655172413793,
558
+ "grad_norm": 1.946183681488037,
559
+ "learning_rate": 1.9318309206554567e-05,
560
+ "loss": 0.3497,
561
+ "step": 72
562
+ },
563
+ {
564
+ "epoch": 0.20977011494252873,
565
+ "grad_norm": 1.5909761190414429,
566
+ "learning_rate": 1.9281416770288806e-05,
567
+ "loss": 0.3599,
568
+ "step": 73
569
+ },
570
+ {
571
+ "epoch": 0.21264367816091953,
572
+ "grad_norm": 1.677031397819519,
573
+ "learning_rate": 1.924358931230418e-05,
574
+ "loss": 0.3452,
575
+ "step": 74
576
+ },
577
+ {
578
+ "epoch": 0.21551724137931033,
579
+ "grad_norm": 2.067196846008301,
580
+ "learning_rate": 1.920483064338687e-05,
581
+ "loss": 0.3568,
582
+ "step": 75
583
+ },
584
+ {
585
+ "epoch": 0.21839080459770116,
586
+ "grad_norm": 2.4643149375915527,
587
+ "learning_rate": 1.9165144668134426e-05,
588
+ "loss": 0.3731,
589
+ "step": 76
590
+ },
591
+ {
592
+ "epoch": 0.22126436781609196,
593
+ "grad_norm": 3.8383142948150635,
594
+ "learning_rate": 1.9124535384562423e-05,
595
+ "loss": 0.3795,
596
+ "step": 77
597
+ },
598
+ {
599
+ "epoch": 0.22413793103448276,
600
+ "grad_norm": 3.8934013843536377,
601
+ "learning_rate": 1.9083006883701688e-05,
602
+ "loss": 0.3726,
603
+ "step": 78
604
+ },
605
+ {
606
+ "epoch": 0.22701149425287356,
607
+ "grad_norm": 2.358133554458618,
608
+ "learning_rate": 1.904056334918617e-05,
609
+ "loss": 0.3108,
610
+ "step": 79
611
+ },
612
+ {
613
+ "epoch": 0.22988505747126436,
614
+ "grad_norm": 1.4917255640029907,
615
+ "learning_rate": 1.8997209056831462e-05,
616
+ "loss": 0.346,
617
+ "step": 80
618
+ },
619
+ {
620
+ "epoch": 0.22988505747126436,
621
+ "eval_accuracy": 0.8366336633663366,
622
+ "eval_f1": 0.6826923076923077,
623
+ "eval_loss": 0.3761025071144104,
624
+ "eval_precision": 0.696078431372549,
625
+ "eval_recall": 0.6698113207547169,
626
+ "eval_runtime": 16.4137,
627
+ "eval_samples_per_second": 6.458,
628
+ "eval_steps_per_second": 0.244,
629
+ "step": 80
630
+ },
631
+ {
632
+ "epoch": 0.23275862068965517,
633
+ "grad_norm": 2.120394706726074,
634
+ "learning_rate": 1.8952948374204066e-05,
635
+ "loss": 0.4094,
636
+ "step": 81
637
+ },
638
+ {
639
+ "epoch": 0.23563218390804597,
640
+ "grad_norm": 1.4860031604766846,
641
+ "learning_rate": 1.8907785760181392e-05,
642
+ "loss": 0.3428,
643
+ "step": 82
644
+ },
645
+ {
646
+ "epoch": 0.23850574712643677,
647
+ "grad_norm": 2.5330207347869873,
648
+ "learning_rate": 1.8861725764502557e-05,
649
+ "loss": 0.3856,
650
+ "step": 83
651
+ },
652
+ {
653
+ "epoch": 0.2413793103448276,
654
+ "grad_norm": 1.8291746377944946,
655
+ "learning_rate": 1.881477302731006e-05,
656
+ "loss": 0.3613,
657
+ "step": 84
658
+ },
659
+ {
660
+ "epoch": 0.2442528735632184,
661
+ "grad_norm": 2.459777355194092,
662
+ "learning_rate": 1.87669322786823e-05,
663
+ "loss": 0.3607,
664
+ "step": 85
665
+ },
666
+ {
667
+ "epoch": 0.2471264367816092,
668
+ "grad_norm": 1.6540309190750122,
669
+ "learning_rate": 1.8718208338157082e-05,
670
+ "loss": 0.3602,
671
+ "step": 86
672
+ },
673
+ {
674
+ "epoch": 0.25,
675
+ "grad_norm": 1.5972404479980469,
676
+ "learning_rate": 1.866860611424609e-05,
677
+ "loss": 0.3323,
678
+ "step": 87
679
+ },
680
+ {
681
+ "epoch": 0.25287356321839083,
682
+ "grad_norm": 3.115286111831665,
683
+ "learning_rate": 1.8618130603940386e-05,
684
+ "loss": 0.3163,
685
+ "step": 88
686
+ },
687
+ {
688
+ "epoch": 0.2557471264367816,
689
+ "grad_norm": 2.2507760524749756,
690
+ "learning_rate": 1.856678689220701e-05,
691
+ "loss": 0.3266,
692
+ "step": 89
693
+ },
694
+ {
695
+ "epoch": 0.25862068965517243,
696
+ "grad_norm": 3.702547073364258,
697
+ "learning_rate": 1.851458015147673e-05,
698
+ "loss": 0.3787,
699
+ "step": 90
700
+ },
701
+ {
702
+ "epoch": 0.2614942528735632,
703
+ "grad_norm": 2.5684680938720703,
704
+ "learning_rate": 1.846151564112294e-05,
705
+ "loss": 0.3113,
706
+ "step": 91
707
+ },
708
+ {
709
+ "epoch": 0.26436781609195403,
710
+ "grad_norm": 1.9638134241104126,
711
+ "learning_rate": 1.840759870693184e-05,
712
+ "loss": 0.3725,
713
+ "step": 92
714
+ },
715
+ {
716
+ "epoch": 0.2672413793103448,
717
+ "grad_norm": 2.0520501136779785,
718
+ "learning_rate": 1.8352834780563888e-05,
719
+ "loss": 0.372,
720
+ "step": 93
721
+ },
722
+ {
723
+ "epoch": 0.27011494252873564,
724
+ "grad_norm": 2.0618882179260254,
725
+ "learning_rate": 1.8297229379006614e-05,
726
+ "loss": 0.2768,
727
+ "step": 94
728
+ },
729
+ {
730
+ "epoch": 0.27298850574712646,
731
+ "grad_norm": 2.1019253730773926,
732
+ "learning_rate": 1.8240788104018824e-05,
733
+ "loss": 0.3435,
734
+ "step": 95
735
+ },
736
+ {
737
+ "epoch": 0.27586206896551724,
738
+ "grad_norm": 4.472621440887451,
739
+ "learning_rate": 1.8183516641566278e-05,
740
+ "loss": 0.3146,
741
+ "step": 96
742
+ },
743
+ {
744
+ "epoch": 0.27873563218390807,
745
+ "grad_norm": 2.1167545318603516,
746
+ "learning_rate": 1.8125420761248878e-05,
747
+ "loss": 0.3804,
748
+ "step": 97
749
+ },
750
+ {
751
+ "epoch": 0.28160919540229884,
752
+ "grad_norm": 1.9784914255142212,
753
+ "learning_rate": 1.806650631571943e-05,
754
+ "loss": 0.3514,
755
+ "step": 98
756
+ },
757
+ {
758
+ "epoch": 0.28448275862068967,
759
+ "grad_norm": 2.407283306121826,
760
+ "learning_rate": 1.8006779240094024e-05,
761
+ "loss": 0.301,
762
+ "step": 99
763
+ },
764
+ {
765
+ "epoch": 0.28735632183908044,
766
+ "grad_norm": 1.7405933141708374,
767
+ "learning_rate": 1.7946245551354156e-05,
768
+ "loss": 0.2929,
769
+ "step": 100
770
+ },
771
+ {
772
+ "epoch": 0.28735632183908044,
773
+ "eval_accuracy": 0.8366336633663366,
774
+ "eval_f1": 0.6886792452830188,
775
+ "eval_loss": 0.3663554787635803,
776
+ "eval_precision": 0.6886792452830188,
777
+ "eval_recall": 0.6886792452830188,
778
+ "eval_runtime": 16.6875,
779
+ "eval_samples_per_second": 6.352,
780
+ "eval_steps_per_second": 0.24,
781
+ "step": 100
782
+ }
783
+ ],
784
+ "logging_steps": 1,
785
+ "max_steps": 348,
786
+ "num_input_tokens_seen": 0,
787
+ "num_train_epochs": 1,
788
+ "save_steps": 100,
789
+ "stateful_callbacks": {
790
+ "TrainerControl": {
791
+ "args": {
792
+ "should_epoch_stop": false,
793
+ "should_evaluate": false,
794
+ "should_log": false,
795
+ "should_save": true,
796
+ "should_training_stop": false
797
+ },
798
+ "attributes": {}
799
+ }
800
+ },
801
+ "total_flos": 3.1415830310813696e+16,
802
+ "train_batch_size": 8,
803
+ "trial_name": null,
804
+ "trial_params": null
805
+ }