mtzig commited on
Commit
aafb541
·
verified ·
1 Parent(s): f6de213

Training in progress, step 200, checkpoint

Browse files
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ last-checkpoint/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
37
+ last-checkpoint/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ last-checkpoint/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ last-checkpoint/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
42
+ last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
last-checkpoint/optimizer_0/.metadata ADDED
Binary file (369 kB). View file
 
last-checkpoint/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4eee0bc20bfe612a2406db1927bad535b871029a1459cdfff99c1d8c6c7f3b63
3
+ size 13934748
last-checkpoint/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8480d8b71bc4ba12fadce2b7092485478b8c309ecce318c15ffc6f83a418ea33
3
+ size 13999412
last-checkpoint/optimizer_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:865ffb2bdf7738b5a7a48e25068e631a1f4cfd3495ea1df1c76166542115412a
3
+ size 13990904
last-checkpoint/optimizer_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bc6404ab67370a58b70ca5d2e8919c5e01e34f1cb289a4a6bd798d70aee2dbd
3
+ size 13990904
last-checkpoint/pytorch_model_fsdp_0/.metadata ADDED
Binary file (135 kB). View file
 
last-checkpoint/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3498e0b6a4e7ed2241f24f000b2120ffa644d285a44cfde97745c9efb6ed358b
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d06a365662a6d32a03d081ca66ae94093585c255a49fe32e4fc6101155e341c
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44818d96fc5cb3fb73cb12c5017e94708a24961757ad115fff879a4c54351a1b
3
+ size 6966784
last-checkpoint/pytorch_model_fsdp_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7503aeea618e7970daff2e762d6b9cc3c0b593f25c7e566d92c8b37634b729e0
3
+ size 6966784
last-checkpoint/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7be0f10bff4b59eb4d3472c8dc5f6f8b12c709dd561a83d4586f3461ec1745a5
3
+ size 14960
last-checkpoint/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e61888020fafc126b7e547b5961b63a5561eea0a9665cf9acb78e192fc0856bc
3
+ size 14960
last-checkpoint/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:065119fcdbace59dd30c03371fc097ed8d58b83537d1b5e3a1f5c321afd26dfd
3
+ size 14960
last-checkpoint/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:061f461111f5cd0052d853db52e46aef61f148d9da594c2cc07a97c23921266c
3
+ size 14960
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2f6d7e0e198940381bc01669f2b59ed3c54273b38889812ff9b29559c995120
3
+ size 1064
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,1565 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.24752475247524752,
5
+ "eval_steps": 20,
6
+ "global_step": 200,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0,
13
+ "eval_accuracy": 0.7339246119733924,
14
+ "eval_f1": 0.24528301886792453,
15
+ "eval_loss": 0.6025775074958801,
16
+ "eval_precision": 0.6,
17
+ "eval_recall": 0.1541501976284585,
18
+ "eval_runtime": 47.5679,
19
+ "eval_samples_per_second": 5.802,
20
+ "eval_steps_per_second": 0.189,
21
+ "step": 0
22
+ },
23
+ {
24
+ "epoch": 0.0012376237623762376,
25
+ "grad_norm": 2.056412935256958,
26
+ "learning_rate": 2.469135802469136e-07,
27
+ "loss": 0.6505,
28
+ "step": 1
29
+ },
30
+ {
31
+ "epoch": 0.0024752475247524753,
32
+ "grad_norm": 2.1361210346221924,
33
+ "learning_rate": 4.938271604938272e-07,
34
+ "loss": 0.7395,
35
+ "step": 2
36
+ },
37
+ {
38
+ "epoch": 0.0037128712871287127,
39
+ "grad_norm": 2.2638471126556396,
40
+ "learning_rate": 7.407407407407407e-07,
41
+ "loss": 0.6948,
42
+ "step": 3
43
+ },
44
+ {
45
+ "epoch": 0.0049504950495049506,
46
+ "grad_norm": 1.881201148033142,
47
+ "learning_rate": 9.876543209876544e-07,
48
+ "loss": 0.6427,
49
+ "step": 4
50
+ },
51
+ {
52
+ "epoch": 0.006188118811881188,
53
+ "grad_norm": 2.1328437328338623,
54
+ "learning_rate": 1.234567901234568e-06,
55
+ "loss": 0.6554,
56
+ "step": 5
57
+ },
58
+ {
59
+ "epoch": 0.007425742574257425,
60
+ "grad_norm": 2.2691922187805176,
61
+ "learning_rate": 1.4814814814814815e-06,
62
+ "loss": 0.7034,
63
+ "step": 6
64
+ },
65
+ {
66
+ "epoch": 0.008663366336633664,
67
+ "grad_norm": 2.424414873123169,
68
+ "learning_rate": 1.7283950617283952e-06,
69
+ "loss": 0.6598,
70
+ "step": 7
71
+ },
72
+ {
73
+ "epoch": 0.009900990099009901,
74
+ "grad_norm": 2.1118245124816895,
75
+ "learning_rate": 1.9753086419753087e-06,
76
+ "loss": 0.668,
77
+ "step": 8
78
+ },
79
+ {
80
+ "epoch": 0.011138613861386138,
81
+ "grad_norm": 1.8890514373779297,
82
+ "learning_rate": 2.222222222222222e-06,
83
+ "loss": 0.6658,
84
+ "step": 9
85
+ },
86
+ {
87
+ "epoch": 0.012376237623762377,
88
+ "grad_norm": 2.2101762294769287,
89
+ "learning_rate": 2.469135802469136e-06,
90
+ "loss": 0.6984,
91
+ "step": 10
92
+ },
93
+ {
94
+ "epoch": 0.013613861386138614,
95
+ "grad_norm": 2.1789631843566895,
96
+ "learning_rate": 2.7160493827160496e-06,
97
+ "loss": 0.6483,
98
+ "step": 11
99
+ },
100
+ {
101
+ "epoch": 0.01485148514851485,
102
+ "grad_norm": 2.1754183769226074,
103
+ "learning_rate": 2.962962962962963e-06,
104
+ "loss": 0.6328,
105
+ "step": 12
106
+ },
107
+ {
108
+ "epoch": 0.01608910891089109,
109
+ "grad_norm": 1.9709060192108154,
110
+ "learning_rate": 3.2098765432098767e-06,
111
+ "loss": 0.6425,
112
+ "step": 13
113
+ },
114
+ {
115
+ "epoch": 0.017326732673267328,
116
+ "grad_norm": 2.338000535964966,
117
+ "learning_rate": 3.4567901234567904e-06,
118
+ "loss": 0.7665,
119
+ "step": 14
120
+ },
121
+ {
122
+ "epoch": 0.018564356435643563,
123
+ "grad_norm": 1.9738425016403198,
124
+ "learning_rate": 3.7037037037037037e-06,
125
+ "loss": 0.6994,
126
+ "step": 15
127
+ },
128
+ {
129
+ "epoch": 0.019801980198019802,
130
+ "grad_norm": 1.9872663021087646,
131
+ "learning_rate": 3.9506172839506175e-06,
132
+ "loss": 0.6101,
133
+ "step": 16
134
+ },
135
+ {
136
+ "epoch": 0.02103960396039604,
137
+ "grad_norm": 1.9945553541183472,
138
+ "learning_rate": 4.197530864197531e-06,
139
+ "loss": 0.641,
140
+ "step": 17
141
+ },
142
+ {
143
+ "epoch": 0.022277227722772276,
144
+ "grad_norm": 2.1487791538238525,
145
+ "learning_rate": 4.444444444444444e-06,
146
+ "loss": 0.6871,
147
+ "step": 18
148
+ },
149
+ {
150
+ "epoch": 0.023514851485148515,
151
+ "grad_norm": 2.6171352863311768,
152
+ "learning_rate": 4.691358024691358e-06,
153
+ "loss": 0.6863,
154
+ "step": 19
155
+ },
156
+ {
157
+ "epoch": 0.024752475247524754,
158
+ "grad_norm": 1.7834933996200562,
159
+ "learning_rate": 4.938271604938272e-06,
160
+ "loss": 0.6391,
161
+ "step": 20
162
+ },
163
+ {
164
+ "epoch": 0.024752475247524754,
165
+ "eval_accuracy": 0.7361419068736141,
166
+ "eval_f1": 0.25625,
167
+ "eval_loss": 0.5953530669212341,
168
+ "eval_precision": 0.6119402985074627,
169
+ "eval_recall": 0.16205533596837945,
170
+ "eval_runtime": 50.5471,
171
+ "eval_samples_per_second": 5.46,
172
+ "eval_steps_per_second": 0.178,
173
+ "step": 20
174
+ },
175
+ {
176
+ "epoch": 0.02599009900990099,
177
+ "grad_norm": 2.140673875808716,
178
+ "learning_rate": 5.185185185185185e-06,
179
+ "loss": 0.6099,
180
+ "step": 21
181
+ },
182
+ {
183
+ "epoch": 0.027227722772277228,
184
+ "grad_norm": 1.9627602100372314,
185
+ "learning_rate": 5.432098765432099e-06,
186
+ "loss": 0.6677,
187
+ "step": 22
188
+ },
189
+ {
190
+ "epoch": 0.028465346534653466,
191
+ "grad_norm": 1.9993869066238403,
192
+ "learning_rate": 5.6790123456790125e-06,
193
+ "loss": 0.6015,
194
+ "step": 23
195
+ },
196
+ {
197
+ "epoch": 0.0297029702970297,
198
+ "grad_norm": 1.7692540884017944,
199
+ "learning_rate": 5.925925925925926e-06,
200
+ "loss": 0.5969,
201
+ "step": 24
202
+ },
203
+ {
204
+ "epoch": 0.03094059405940594,
205
+ "grad_norm": 2.137422561645508,
206
+ "learning_rate": 6.17283950617284e-06,
207
+ "loss": 0.6501,
208
+ "step": 25
209
+ },
210
+ {
211
+ "epoch": 0.03217821782178218,
212
+ "grad_norm": 1.9657728672027588,
213
+ "learning_rate": 6.419753086419753e-06,
214
+ "loss": 0.6085,
215
+ "step": 26
216
+ },
217
+ {
218
+ "epoch": 0.03341584158415842,
219
+ "grad_norm": 1.7881442308425903,
220
+ "learning_rate": 6.666666666666667e-06,
221
+ "loss": 0.635,
222
+ "step": 27
223
+ },
224
+ {
225
+ "epoch": 0.034653465346534656,
226
+ "grad_norm": 2.832048177719116,
227
+ "learning_rate": 6.913580246913581e-06,
228
+ "loss": 0.7251,
229
+ "step": 28
230
+ },
231
+ {
232
+ "epoch": 0.03589108910891089,
233
+ "grad_norm": 1.9947174787521362,
234
+ "learning_rate": 7.160493827160494e-06,
235
+ "loss": 0.6394,
236
+ "step": 29
237
+ },
238
+ {
239
+ "epoch": 0.03712871287128713,
240
+ "grad_norm": 2.0211126804351807,
241
+ "learning_rate": 7.4074074074074075e-06,
242
+ "loss": 0.6082,
243
+ "step": 30
244
+ },
245
+ {
246
+ "epoch": 0.038366336633663366,
247
+ "grad_norm": 1.9397317171096802,
248
+ "learning_rate": 7.654320987654322e-06,
249
+ "loss": 0.6465,
250
+ "step": 31
251
+ },
252
+ {
253
+ "epoch": 0.039603960396039604,
254
+ "grad_norm": 2.2408998012542725,
255
+ "learning_rate": 7.901234567901235e-06,
256
+ "loss": 0.643,
257
+ "step": 32
258
+ },
259
+ {
260
+ "epoch": 0.04084158415841584,
261
+ "grad_norm": 1.9772993326187134,
262
+ "learning_rate": 8.148148148148148e-06,
263
+ "loss": 0.618,
264
+ "step": 33
265
+ },
266
+ {
267
+ "epoch": 0.04207920792079208,
268
+ "grad_norm": 1.6278493404388428,
269
+ "learning_rate": 8.395061728395062e-06,
270
+ "loss": 0.6425,
271
+ "step": 34
272
+ },
273
+ {
274
+ "epoch": 0.043316831683168314,
275
+ "grad_norm": 1.9789159297943115,
276
+ "learning_rate": 8.641975308641975e-06,
277
+ "loss": 0.6046,
278
+ "step": 35
279
+ },
280
+ {
281
+ "epoch": 0.04455445544554455,
282
+ "grad_norm": 1.801087498664856,
283
+ "learning_rate": 8.888888888888888e-06,
284
+ "loss": 0.6561,
285
+ "step": 36
286
+ },
287
+ {
288
+ "epoch": 0.04579207920792079,
289
+ "grad_norm": 1.5089136362075806,
290
+ "learning_rate": 9.135802469135803e-06,
291
+ "loss": 0.5883,
292
+ "step": 37
293
+ },
294
+ {
295
+ "epoch": 0.04702970297029703,
296
+ "grad_norm": 1.676107406616211,
297
+ "learning_rate": 9.382716049382717e-06,
298
+ "loss": 0.5684,
299
+ "step": 38
300
+ },
301
+ {
302
+ "epoch": 0.04826732673267327,
303
+ "grad_norm": 1.8138374090194702,
304
+ "learning_rate": 9.62962962962963e-06,
305
+ "loss": 0.6034,
306
+ "step": 39
307
+ },
308
+ {
309
+ "epoch": 0.04950495049504951,
310
+ "grad_norm": 1.7539325952529907,
311
+ "learning_rate": 9.876543209876543e-06,
312
+ "loss": 0.5891,
313
+ "step": 40
314
+ },
315
+ {
316
+ "epoch": 0.04950495049504951,
317
+ "eval_accuracy": 0.7549889135254989,
318
+ "eval_f1": 0.4318766066838046,
319
+ "eval_loss": 0.556958794593811,
320
+ "eval_precision": 0.6176470588235294,
321
+ "eval_recall": 0.33201581027667987,
322
+ "eval_runtime": 48.6708,
323
+ "eval_samples_per_second": 5.671,
324
+ "eval_steps_per_second": 0.185,
325
+ "step": 40
326
+ },
327
+ {
328
+ "epoch": 0.050742574257425746,
329
+ "grad_norm": 1.4187287092208862,
330
+ "learning_rate": 1.0123456790123458e-05,
331
+ "loss": 0.5636,
332
+ "step": 41
333
+ },
334
+ {
335
+ "epoch": 0.05198019801980198,
336
+ "grad_norm": 1.9447287321090698,
337
+ "learning_rate": 1.037037037037037e-05,
338
+ "loss": 0.5496,
339
+ "step": 42
340
+ },
341
+ {
342
+ "epoch": 0.053217821782178217,
343
+ "grad_norm": 1.6454174518585205,
344
+ "learning_rate": 1.0617283950617285e-05,
345
+ "loss": 0.5807,
346
+ "step": 43
347
+ },
348
+ {
349
+ "epoch": 0.054455445544554455,
350
+ "grad_norm": 1.7853933572769165,
351
+ "learning_rate": 1.0864197530864198e-05,
352
+ "loss": 0.6028,
353
+ "step": 44
354
+ },
355
+ {
356
+ "epoch": 0.055693069306930694,
357
+ "grad_norm": 1.6090970039367676,
358
+ "learning_rate": 1.1111111111111113e-05,
359
+ "loss": 0.5838,
360
+ "step": 45
361
+ },
362
+ {
363
+ "epoch": 0.05693069306930693,
364
+ "grad_norm": 2.3328471183776855,
365
+ "learning_rate": 1.1358024691358025e-05,
366
+ "loss": 0.5993,
367
+ "step": 46
368
+ },
369
+ {
370
+ "epoch": 0.05816831683168317,
371
+ "grad_norm": 2.4744842052459717,
372
+ "learning_rate": 1.160493827160494e-05,
373
+ "loss": 0.6092,
374
+ "step": 47
375
+ },
376
+ {
377
+ "epoch": 0.0594059405940594,
378
+ "grad_norm": 1.7244300842285156,
379
+ "learning_rate": 1.1851851851851852e-05,
380
+ "loss": 0.5969,
381
+ "step": 48
382
+ },
383
+ {
384
+ "epoch": 0.06064356435643564,
385
+ "grad_norm": 1.6698678731918335,
386
+ "learning_rate": 1.2098765432098767e-05,
387
+ "loss": 0.5254,
388
+ "step": 49
389
+ },
390
+ {
391
+ "epoch": 0.06188118811881188,
392
+ "grad_norm": 1.591994285583496,
393
+ "learning_rate": 1.234567901234568e-05,
394
+ "loss": 0.5509,
395
+ "step": 50
396
+ },
397
+ {
398
+ "epoch": 0.06311881188118812,
399
+ "grad_norm": 1.9688084125518799,
400
+ "learning_rate": 1.2592592592592593e-05,
401
+ "loss": 0.5232,
402
+ "step": 51
403
+ },
404
+ {
405
+ "epoch": 0.06435643564356436,
406
+ "grad_norm": 2.0831687450408936,
407
+ "learning_rate": 1.2839506172839507e-05,
408
+ "loss": 0.5141,
409
+ "step": 52
410
+ },
411
+ {
412
+ "epoch": 0.0655940594059406,
413
+ "grad_norm": 2.0480973720550537,
414
+ "learning_rate": 1.3086419753086422e-05,
415
+ "loss": 0.5669,
416
+ "step": 53
417
+ },
418
+ {
419
+ "epoch": 0.06683168316831684,
420
+ "grad_norm": 1.5781453847885132,
421
+ "learning_rate": 1.3333333333333333e-05,
422
+ "loss": 0.5065,
423
+ "step": 54
424
+ },
425
+ {
426
+ "epoch": 0.06806930693069307,
427
+ "grad_norm": 2.123061180114746,
428
+ "learning_rate": 1.3580246913580248e-05,
429
+ "loss": 0.4856,
430
+ "step": 55
431
+ },
432
+ {
433
+ "epoch": 0.06930693069306931,
434
+ "grad_norm": 2.2889890670776367,
435
+ "learning_rate": 1.3827160493827162e-05,
436
+ "loss": 0.4936,
437
+ "step": 56
438
+ },
439
+ {
440
+ "epoch": 0.07054455445544554,
441
+ "grad_norm": 2.201887607574463,
442
+ "learning_rate": 1.4074074074074075e-05,
443
+ "loss": 0.538,
444
+ "step": 57
445
+ },
446
+ {
447
+ "epoch": 0.07178217821782178,
448
+ "grad_norm": 1.8556184768676758,
449
+ "learning_rate": 1.4320987654320988e-05,
450
+ "loss": 0.5091,
451
+ "step": 58
452
+ },
453
+ {
454
+ "epoch": 0.07301980198019802,
455
+ "grad_norm": 1.5986840724945068,
456
+ "learning_rate": 1.4567901234567903e-05,
457
+ "loss": 0.4939,
458
+ "step": 59
459
+ },
460
+ {
461
+ "epoch": 0.07425742574257425,
462
+ "grad_norm": 2.35420560836792,
463
+ "learning_rate": 1.4814814814814815e-05,
464
+ "loss": 0.4606,
465
+ "step": 60
466
+ },
467
+ {
468
+ "epoch": 0.07425742574257425,
469
+ "eval_accuracy": 0.779379157427938,
470
+ "eval_f1": 0.5204819277108433,
471
+ "eval_loss": 0.4962254464626312,
472
+ "eval_precision": 0.6666666666666666,
473
+ "eval_recall": 0.4268774703557312,
474
+ "eval_runtime": 47.7725,
475
+ "eval_samples_per_second": 5.777,
476
+ "eval_steps_per_second": 0.188,
477
+ "step": 60
478
+ },
479
+ {
480
+ "epoch": 0.07549504950495049,
481
+ "grad_norm": 2.571995496749878,
482
+ "learning_rate": 1.506172839506173e-05,
483
+ "loss": 0.538,
484
+ "step": 61
485
+ },
486
+ {
487
+ "epoch": 0.07673267326732673,
488
+ "grad_norm": 2.467172622680664,
489
+ "learning_rate": 1.5308641975308643e-05,
490
+ "loss": 0.5176,
491
+ "step": 62
492
+ },
493
+ {
494
+ "epoch": 0.07797029702970297,
495
+ "grad_norm": 1.9836307764053345,
496
+ "learning_rate": 1.555555555555556e-05,
497
+ "loss": 0.544,
498
+ "step": 63
499
+ },
500
+ {
501
+ "epoch": 0.07920792079207921,
502
+ "grad_norm": 1.576439380645752,
503
+ "learning_rate": 1.580246913580247e-05,
504
+ "loss": 0.4453,
505
+ "step": 64
506
+ },
507
+ {
508
+ "epoch": 0.08044554455445545,
509
+ "grad_norm": 1.6136027574539185,
510
+ "learning_rate": 1.6049382716049385e-05,
511
+ "loss": 0.46,
512
+ "step": 65
513
+ },
514
+ {
515
+ "epoch": 0.08168316831683169,
516
+ "grad_norm": 2.130403518676758,
517
+ "learning_rate": 1.6296296296296297e-05,
518
+ "loss": 0.4797,
519
+ "step": 66
520
+ },
521
+ {
522
+ "epoch": 0.08292079207920793,
523
+ "grad_norm": 2.6445112228393555,
524
+ "learning_rate": 1.654320987654321e-05,
525
+ "loss": 0.5095,
526
+ "step": 67
527
+ },
528
+ {
529
+ "epoch": 0.08415841584158416,
530
+ "grad_norm": 2.384965658187866,
531
+ "learning_rate": 1.6790123456790123e-05,
532
+ "loss": 0.478,
533
+ "step": 68
534
+ },
535
+ {
536
+ "epoch": 0.0853960396039604,
537
+ "grad_norm": 1.9021402597427368,
538
+ "learning_rate": 1.7037037037037038e-05,
539
+ "loss": 0.4508,
540
+ "step": 69
541
+ },
542
+ {
543
+ "epoch": 0.08663366336633663,
544
+ "grad_norm": 2.2608911991119385,
545
+ "learning_rate": 1.728395061728395e-05,
546
+ "loss": 0.4828,
547
+ "step": 70
548
+ },
549
+ {
550
+ "epoch": 0.08787128712871287,
551
+ "grad_norm": 2.5560309886932373,
552
+ "learning_rate": 1.7530864197530865e-05,
553
+ "loss": 0.4429,
554
+ "step": 71
555
+ },
556
+ {
557
+ "epoch": 0.0891089108910891,
558
+ "grad_norm": 3.586392879486084,
559
+ "learning_rate": 1.7777777777777777e-05,
560
+ "loss": 0.393,
561
+ "step": 72
562
+ },
563
+ {
564
+ "epoch": 0.09034653465346534,
565
+ "grad_norm": 2.5128958225250244,
566
+ "learning_rate": 1.802469135802469e-05,
567
+ "loss": 0.4795,
568
+ "step": 73
569
+ },
570
+ {
571
+ "epoch": 0.09158415841584158,
572
+ "grad_norm": 2.255323886871338,
573
+ "learning_rate": 1.8271604938271607e-05,
574
+ "loss": 0.3733,
575
+ "step": 74
576
+ },
577
+ {
578
+ "epoch": 0.09282178217821782,
579
+ "grad_norm": 1.9865373373031616,
580
+ "learning_rate": 1.851851851851852e-05,
581
+ "loss": 0.3899,
582
+ "step": 75
583
+ },
584
+ {
585
+ "epoch": 0.09405940594059406,
586
+ "grad_norm": 2.985546588897705,
587
+ "learning_rate": 1.8765432098765433e-05,
588
+ "loss": 0.3784,
589
+ "step": 76
590
+ },
591
+ {
592
+ "epoch": 0.0952970297029703,
593
+ "grad_norm": 3.0742247104644775,
594
+ "learning_rate": 1.901234567901235e-05,
595
+ "loss": 0.4457,
596
+ "step": 77
597
+ },
598
+ {
599
+ "epoch": 0.09653465346534654,
600
+ "grad_norm": 2.365544319152832,
601
+ "learning_rate": 1.925925925925926e-05,
602
+ "loss": 0.3507,
603
+ "step": 78
604
+ },
605
+ {
606
+ "epoch": 0.09777227722772278,
607
+ "grad_norm": 3.4621968269348145,
608
+ "learning_rate": 1.9506172839506175e-05,
609
+ "loss": 0.405,
610
+ "step": 79
611
+ },
612
+ {
613
+ "epoch": 0.09900990099009901,
614
+ "grad_norm": 3.251645088195801,
615
+ "learning_rate": 1.9753086419753087e-05,
616
+ "loss": 0.4229,
617
+ "step": 80
618
+ },
619
+ {
620
+ "epoch": 0.09900990099009901,
621
+ "eval_accuracy": 0.7904656319290465,
622
+ "eval_f1": 0.5771812080536913,
623
+ "eval_loss": 0.4432809352874756,
624
+ "eval_precision": 0.6649484536082474,
625
+ "eval_recall": 0.5098814229249012,
626
+ "eval_runtime": 48.2096,
627
+ "eval_samples_per_second": 5.725,
628
+ "eval_steps_per_second": 0.187,
629
+ "step": 80
630
+ },
631
+ {
632
+ "epoch": 0.10024752475247525,
633
+ "grad_norm": 3.5432498455047607,
634
+ "learning_rate": 2e-05,
635
+ "loss": 0.3498,
636
+ "step": 81
637
+ },
638
+ {
639
+ "epoch": 0.10148514851485149,
640
+ "grad_norm": 4.109142303466797,
641
+ "learning_rate": 1.9999906631527858e-05,
642
+ "loss": 0.3289,
643
+ "step": 82
644
+ },
645
+ {
646
+ "epoch": 0.10272277227722772,
647
+ "grad_norm": 3.4147417545318604,
648
+ "learning_rate": 1.9999626527854966e-05,
649
+ "loss": 0.2813,
650
+ "step": 83
651
+ },
652
+ {
653
+ "epoch": 0.10396039603960396,
654
+ "grad_norm": 5.5374436378479,
655
+ "learning_rate": 1.9999159694211894e-05,
656
+ "loss": 0.3393,
657
+ "step": 84
658
+ },
659
+ {
660
+ "epoch": 0.1051980198019802,
661
+ "grad_norm": 4.537343502044678,
662
+ "learning_rate": 1.999850613931615e-05,
663
+ "loss": 0.4392,
664
+ "step": 85
665
+ },
666
+ {
667
+ "epoch": 0.10643564356435643,
668
+ "grad_norm": 3.075702428817749,
669
+ "learning_rate": 1.999766587537202e-05,
670
+ "loss": 0.3329,
671
+ "step": 86
672
+ },
673
+ {
674
+ "epoch": 0.10767326732673267,
675
+ "grad_norm": 6.164308071136475,
676
+ "learning_rate": 1.9996638918070336e-05,
677
+ "loss": 0.3292,
678
+ "step": 87
679
+ },
680
+ {
681
+ "epoch": 0.10891089108910891,
682
+ "grad_norm": 3.1993377208709717,
683
+ "learning_rate": 1.9995425286588187e-05,
684
+ "loss": 0.318,
685
+ "step": 88
686
+ },
687
+ {
688
+ "epoch": 0.11014851485148515,
689
+ "grad_norm": 3.789552927017212,
690
+ "learning_rate": 1.9994025003588547e-05,
691
+ "loss": 0.3504,
692
+ "step": 89
693
+ },
694
+ {
695
+ "epoch": 0.11138613861386139,
696
+ "grad_norm": 4.15277624130249,
697
+ "learning_rate": 1.9992438095219886e-05,
698
+ "loss": 0.2838,
699
+ "step": 90
700
+ },
701
+ {
702
+ "epoch": 0.11262376237623763,
703
+ "grad_norm": 3.4878060817718506,
704
+ "learning_rate": 1.9990664591115637e-05,
705
+ "loss": 0.3165,
706
+ "step": 91
707
+ },
708
+ {
709
+ "epoch": 0.11386138613861387,
710
+ "grad_norm": 5.2607035636901855,
711
+ "learning_rate": 1.9988704524393678e-05,
712
+ "loss": 0.3229,
713
+ "step": 92
714
+ },
715
+ {
716
+ "epoch": 0.1150990099009901,
717
+ "grad_norm": 6.290886878967285,
718
+ "learning_rate": 1.9986557931655688e-05,
719
+ "loss": 0.3629,
720
+ "step": 93
721
+ },
722
+ {
723
+ "epoch": 0.11633663366336634,
724
+ "grad_norm": 7.600953102111816,
725
+ "learning_rate": 1.9984224852986494e-05,
726
+ "loss": 0.3405,
727
+ "step": 94
728
+ },
729
+ {
730
+ "epoch": 0.11757425742574257,
731
+ "grad_norm": 4.730844974517822,
732
+ "learning_rate": 1.9981705331953295e-05,
733
+ "loss": 0.3718,
734
+ "step": 95
735
+ },
736
+ {
737
+ "epoch": 0.1188118811881188,
738
+ "grad_norm": 5.086641788482666,
739
+ "learning_rate": 1.9978999415604847e-05,
740
+ "loss": 0.2757,
741
+ "step": 96
742
+ },
743
+ {
744
+ "epoch": 0.12004950495049505,
745
+ "grad_norm": 6.739199161529541,
746
+ "learning_rate": 1.9976107154470613e-05,
747
+ "loss": 0.2859,
748
+ "step": 97
749
+ },
750
+ {
751
+ "epoch": 0.12128712871287128,
752
+ "grad_norm": 4.352366924285889,
753
+ "learning_rate": 1.9973028602559787e-05,
754
+ "loss": 0.3398,
755
+ "step": 98
756
+ },
757
+ {
758
+ "epoch": 0.12252475247524752,
759
+ "grad_norm": 7.858609199523926,
760
+ "learning_rate": 1.9969763817360314e-05,
761
+ "loss": 0.471,
762
+ "step": 99
763
+ },
764
+ {
765
+ "epoch": 0.12376237623762376,
766
+ "grad_norm": 5.571165561676025,
767
+ "learning_rate": 1.996631285983779e-05,
768
+ "loss": 0.3836,
769
+ "step": 100
770
+ },
771
+ {
772
+ "epoch": 0.12376237623762376,
773
+ "eval_accuracy": 0.8159645232815964,
774
+ "eval_f1": 0.6047619047619047,
775
+ "eval_loss": 0.42972368001937866,
776
+ "eval_precision": 0.7604790419161677,
777
+ "eval_recall": 0.5019762845849802,
778
+ "eval_runtime": 48.4236,
779
+ "eval_samples_per_second": 5.7,
780
+ "eval_steps_per_second": 0.186,
781
+ "step": 100
782
+ },
783
+ {
784
+ "epoch": 0.125,
785
+ "grad_norm": 4.134688854217529,
786
+ "learning_rate": 1.9962675794434342e-05,
787
+ "loss": 0.2516,
788
+ "step": 101
789
+ },
790
+ {
791
+ "epoch": 0.12623762376237624,
792
+ "grad_norm": 3.988821506500244,
793
+ "learning_rate": 1.9958852689067423e-05,
794
+ "loss": 0.2509,
795
+ "step": 102
796
+ },
797
+ {
798
+ "epoch": 0.12747524752475248,
799
+ "grad_norm": 5.836869716644287,
800
+ "learning_rate": 1.9954843615128528e-05,
801
+ "loss": 0.3183,
802
+ "step": 103
803
+ },
804
+ {
805
+ "epoch": 0.12871287128712872,
806
+ "grad_norm": 9.7975492477417,
807
+ "learning_rate": 1.995064864748188e-05,
808
+ "loss": 0.3471,
809
+ "step": 104
810
+ },
811
+ {
812
+ "epoch": 0.12995049504950495,
813
+ "grad_norm": 5.1211066246032715,
814
+ "learning_rate": 1.9946267864463027e-05,
815
+ "loss": 0.3466,
816
+ "step": 105
817
+ },
818
+ {
819
+ "epoch": 0.1311881188118812,
820
+ "grad_norm": 5.172476291656494,
821
+ "learning_rate": 1.994170134787737e-05,
822
+ "loss": 0.3442,
823
+ "step": 106
824
+ },
825
+ {
826
+ "epoch": 0.13242574257425743,
827
+ "grad_norm": 4.703874111175537,
828
+ "learning_rate": 1.993694918299864e-05,
829
+ "loss": 0.3027,
830
+ "step": 107
831
+ },
832
+ {
833
+ "epoch": 0.13366336633663367,
834
+ "grad_norm": 3.981438398361206,
835
+ "learning_rate": 1.9932011458567315e-05,
836
+ "loss": 0.2803,
837
+ "step": 108
838
+ },
839
+ {
840
+ "epoch": 0.1349009900990099,
841
+ "grad_norm": 3.627497911453247,
842
+ "learning_rate": 1.9926888266788955e-05,
843
+ "loss": 0.3011,
844
+ "step": 109
845
+ },
846
+ {
847
+ "epoch": 0.13613861386138615,
848
+ "grad_norm": 5.726022720336914,
849
+ "learning_rate": 1.9921579703332475e-05,
850
+ "loss": 0.3463,
851
+ "step": 110
852
+ },
853
+ {
854
+ "epoch": 0.1373762376237624,
855
+ "grad_norm": 3.9661319255828857,
856
+ "learning_rate": 1.991608586732837e-05,
857
+ "loss": 0.3455,
858
+ "step": 111
859
+ },
860
+ {
861
+ "epoch": 0.13861386138613863,
862
+ "grad_norm": 4.330716133117676,
863
+ "learning_rate": 1.991040686136685e-05,
864
+ "loss": 0.2888,
865
+ "step": 112
866
+ },
867
+ {
868
+ "epoch": 0.13985148514851486,
869
+ "grad_norm": 2.6466479301452637,
870
+ "learning_rate": 1.9904542791495938e-05,
871
+ "loss": 0.2423,
872
+ "step": 113
873
+ },
874
+ {
875
+ "epoch": 0.14108910891089108,
876
+ "grad_norm": 3.5607573986053467,
877
+ "learning_rate": 1.9898493767219486e-05,
878
+ "loss": 0.2481,
879
+ "step": 114
880
+ },
881
+ {
882
+ "epoch": 0.14232673267326731,
883
+ "grad_norm": 3.259629011154175,
884
+ "learning_rate": 1.989225990149512e-05,
885
+ "loss": 0.2707,
886
+ "step": 115
887
+ },
888
+ {
889
+ "epoch": 0.14356435643564355,
890
+ "grad_norm": 3.952185869216919,
891
+ "learning_rate": 1.988584131073215e-05,
892
+ "loss": 0.2607,
893
+ "step": 116
894
+ },
895
+ {
896
+ "epoch": 0.1448019801980198,
897
+ "grad_norm": 2.9898970127105713,
898
+ "learning_rate": 1.9879238114789375e-05,
899
+ "loss": 0.2234,
900
+ "step": 117
901
+ },
902
+ {
903
+ "epoch": 0.14603960396039603,
904
+ "grad_norm": 3.857395648956299,
905
+ "learning_rate": 1.9872450436972856e-05,
906
+ "loss": 0.2691,
907
+ "step": 118
908
+ },
909
+ {
910
+ "epoch": 0.14727722772277227,
911
+ "grad_norm": 4.034820079803467,
912
+ "learning_rate": 1.986547840403362e-05,
913
+ "loss": 0.3632,
914
+ "step": 119
915
+ },
916
+ {
917
+ "epoch": 0.1485148514851485,
918
+ "grad_norm": 3.5433619022369385,
919
+ "learning_rate": 1.9858322146165272e-05,
920
+ "loss": 0.3363,
921
+ "step": 120
922
+ },
923
+ {
924
+ "epoch": 0.1485148514851485,
925
+ "eval_accuracy": 0.8381374722838137,
926
+ "eval_f1": 0.6666666666666666,
927
+ "eval_loss": 0.36761781573295593,
928
+ "eval_precision": 0.7891891891891892,
929
+ "eval_recall": 0.5770750988142292,
930
+ "eval_runtime": 48.4565,
931
+ "eval_samples_per_second": 5.696,
932
+ "eval_steps_per_second": 0.186,
933
+ "step": 120
934
+ },
935
+ {
936
+ "epoch": 0.14975247524752475,
937
+ "grad_norm": 4.58292818069458,
938
+ "learning_rate": 1.9850981797001593e-05,
939
+ "loss": 0.2657,
940
+ "step": 121
941
+ },
942
+ {
943
+ "epoch": 0.15099009900990099,
944
+ "grad_norm": 4.649030685424805,
945
+ "learning_rate": 1.9843457493614016e-05,
946
+ "loss": 0.2851,
947
+ "step": 122
948
+ },
949
+ {
950
+ "epoch": 0.15222772277227722,
951
+ "grad_norm": 4.370965957641602,
952
+ "learning_rate": 1.9835749376509084e-05,
953
+ "loss": 0.2917,
954
+ "step": 123
955
+ },
956
+ {
957
+ "epoch": 0.15346534653465346,
958
+ "grad_norm": 5.558561325073242,
959
+ "learning_rate": 1.9827857589625817e-05,
960
+ "loss": 0.2922,
961
+ "step": 124
962
+ },
963
+ {
964
+ "epoch": 0.1547029702970297,
965
+ "grad_norm": 3.4896552562713623,
966
+ "learning_rate": 1.981978228033304e-05,
967
+ "loss": 0.2478,
968
+ "step": 125
969
+ },
970
+ {
971
+ "epoch": 0.15594059405940594,
972
+ "grad_norm": 5.457974910736084,
973
+ "learning_rate": 1.9811523599426604e-05,
974
+ "loss": 0.3341,
975
+ "step": 126
976
+ },
977
+ {
978
+ "epoch": 0.15717821782178218,
979
+ "grad_norm": 3.6488845348358154,
980
+ "learning_rate": 1.980308170112659e-05,
981
+ "loss": 0.2577,
982
+ "step": 127
983
+ },
984
+ {
985
+ "epoch": 0.15841584158415842,
986
+ "grad_norm": 3.6894092559814453,
987
+ "learning_rate": 1.979445674307444e-05,
988
+ "loss": 0.2544,
989
+ "step": 128
990
+ },
991
+ {
992
+ "epoch": 0.15965346534653466,
993
+ "grad_norm": 5.288538455963135,
994
+ "learning_rate": 1.9785648886329974e-05,
995
+ "loss": 0.2452,
996
+ "step": 129
997
+ },
998
+ {
999
+ "epoch": 0.1608910891089109,
1000
+ "grad_norm": 6.3318305015563965,
1001
+ "learning_rate": 1.977665829536842e-05,
1002
+ "loss": 0.2628,
1003
+ "step": 130
1004
+ },
1005
+ {
1006
+ "epoch": 0.16212871287128713,
1007
+ "grad_norm": 5.06384801864624,
1008
+ "learning_rate": 1.9767485138077327e-05,
1009
+ "loss": 0.337,
1010
+ "step": 131
1011
+ },
1012
+ {
1013
+ "epoch": 0.16336633663366337,
1014
+ "grad_norm": 3.954658269882202,
1015
+ "learning_rate": 1.9758129585753433e-05,
1016
+ "loss": 0.2729,
1017
+ "step": 132
1018
+ },
1019
+ {
1020
+ "epoch": 0.1646039603960396,
1021
+ "grad_norm": 3.3781790733337402,
1022
+ "learning_rate": 1.9748591813099457e-05,
1023
+ "loss": 0.2204,
1024
+ "step": 133
1025
+ },
1026
+ {
1027
+ "epoch": 0.16584158415841585,
1028
+ "grad_norm": 5.148495674133301,
1029
+ "learning_rate": 1.9738871998220857e-05,
1030
+ "loss": 0.2585,
1031
+ "step": 134
1032
+ },
1033
+ {
1034
+ "epoch": 0.1670792079207921,
1035
+ "grad_norm": 4.203769207000732,
1036
+ "learning_rate": 1.9728970322622485e-05,
1037
+ "loss": 0.3102,
1038
+ "step": 135
1039
+ },
1040
+ {
1041
+ "epoch": 0.16831683168316833,
1042
+ "grad_norm": 3.7691049575805664,
1043
+ "learning_rate": 1.9718886971205206e-05,
1044
+ "loss": 0.2592,
1045
+ "step": 136
1046
+ },
1047
+ {
1048
+ "epoch": 0.16955445544554457,
1049
+ "grad_norm": 5.7634711265563965,
1050
+ "learning_rate": 1.970862213226244e-05,
1051
+ "loss": 0.2607,
1052
+ "step": 137
1053
+ },
1054
+ {
1055
+ "epoch": 0.1707920792079208,
1056
+ "grad_norm": 4.632352828979492,
1057
+ "learning_rate": 1.9698175997476657e-05,
1058
+ "loss": 0.2914,
1059
+ "step": 138
1060
+ },
1061
+ {
1062
+ "epoch": 0.17202970297029702,
1063
+ "grad_norm": 5.2901434898376465,
1064
+ "learning_rate": 1.968754876191578e-05,
1065
+ "loss": 0.2874,
1066
+ "step": 139
1067
+ },
1068
+ {
1069
+ "epoch": 0.17326732673267325,
1070
+ "grad_norm": 3.2094457149505615,
1071
+ "learning_rate": 1.9676740624029566e-05,
1072
+ "loss": 0.2483,
1073
+ "step": 140
1074
+ },
1075
+ {
1076
+ "epoch": 0.17326732673267325,
1077
+ "eval_accuracy": 0.8403547671840355,
1078
+ "eval_f1": 0.6587677725118484,
1079
+ "eval_loss": 0.35367104411125183,
1080
+ "eval_precision": 0.8224852071005917,
1081
+ "eval_recall": 0.549407114624506,
1082
+ "eval_runtime": 49.1165,
1083
+ "eval_samples_per_second": 5.619,
1084
+ "eval_steps_per_second": 0.183,
1085
+ "step": 140
1086
+ },
1087
+ {
1088
+ "epoch": 0.1745049504950495,
1089
+ "grad_norm": 3.4511711597442627,
1090
+ "learning_rate": 1.9665751785645874e-05,
1091
+ "loss": 0.2277,
1092
+ "step": 141
1093
+ },
1094
+ {
1095
+ "epoch": 0.17574257425742573,
1096
+ "grad_norm": 3.3621718883514404,
1097
+ "learning_rate": 1.9654582451966915e-05,
1098
+ "loss": 0.2893,
1099
+ "step": 142
1100
+ },
1101
+ {
1102
+ "epoch": 0.17698019801980197,
1103
+ "grad_norm": 4.829539775848389,
1104
+ "learning_rate": 1.9643232831565417e-05,
1105
+ "loss": 0.2127,
1106
+ "step": 143
1107
+ },
1108
+ {
1109
+ "epoch": 0.1782178217821782,
1110
+ "grad_norm": 4.233989715576172,
1111
+ "learning_rate": 1.9631703136380716e-05,
1112
+ "loss": 0.2133,
1113
+ "step": 144
1114
+ },
1115
+ {
1116
+ "epoch": 0.17945544554455445,
1117
+ "grad_norm": 9.943169593811035,
1118
+ "learning_rate": 1.961999358171482e-05,
1119
+ "loss": 0.442,
1120
+ "step": 145
1121
+ },
1122
+ {
1123
+ "epoch": 0.1806930693069307,
1124
+ "grad_norm": 4.362405300140381,
1125
+ "learning_rate": 1.960810438622838e-05,
1126
+ "loss": 0.2677,
1127
+ "step": 146
1128
+ },
1129
+ {
1130
+ "epoch": 0.18193069306930693,
1131
+ "grad_norm": 4.714008808135986,
1132
+ "learning_rate": 1.959603577193659e-05,
1133
+ "loss": 0.3213,
1134
+ "step": 147
1135
+ },
1136
+ {
1137
+ "epoch": 0.18316831683168316,
1138
+ "grad_norm": 3.655679702758789,
1139
+ "learning_rate": 1.9583787964205073e-05,
1140
+ "loss": 0.199,
1141
+ "step": 148
1142
+ },
1143
+ {
1144
+ "epoch": 0.1844059405940594,
1145
+ "grad_norm": 4.397619247436523,
1146
+ "learning_rate": 1.9571361191745647e-05,
1147
+ "loss": 0.2728,
1148
+ "step": 149
1149
+ },
1150
+ {
1151
+ "epoch": 0.18564356435643564,
1152
+ "grad_norm": 4.055555820465088,
1153
+ "learning_rate": 1.955875568661206e-05,
1154
+ "loss": 0.2461,
1155
+ "step": 150
1156
+ },
1157
+ {
1158
+ "epoch": 0.18688118811881188,
1159
+ "grad_norm": 4.366605281829834,
1160
+ "learning_rate": 1.9545971684195664e-05,
1161
+ "loss": 0.2026,
1162
+ "step": 151
1163
+ },
1164
+ {
1165
+ "epoch": 0.18811881188118812,
1166
+ "grad_norm": 3.7074687480926514,
1167
+ "learning_rate": 1.9533009423221014e-05,
1168
+ "loss": 0.2817,
1169
+ "step": 152
1170
+ },
1171
+ {
1172
+ "epoch": 0.18935643564356436,
1173
+ "grad_norm": 4.276401996612549,
1174
+ "learning_rate": 1.951986914574141e-05,
1175
+ "loss": 0.2661,
1176
+ "step": 153
1177
+ },
1178
+ {
1179
+ "epoch": 0.1905940594059406,
1180
+ "grad_norm": 3.917130708694458,
1181
+ "learning_rate": 1.9506551097134384e-05,
1182
+ "loss": 0.3005,
1183
+ "step": 154
1184
+ },
1185
+ {
1186
+ "epoch": 0.19183168316831684,
1187
+ "grad_norm": 6.731651306152344,
1188
+ "learning_rate": 1.94930555260971e-05,
1189
+ "loss": 0.2892,
1190
+ "step": 155
1191
+ },
1192
+ {
1193
+ "epoch": 0.19306930693069307,
1194
+ "grad_norm": 4.87600564956665,
1195
+ "learning_rate": 1.947938268464173e-05,
1196
+ "loss": 0.1983,
1197
+ "step": 156
1198
+ },
1199
+ {
1200
+ "epoch": 0.1943069306930693,
1201
+ "grad_norm": 4.437981605529785,
1202
+ "learning_rate": 1.9465532828090735e-05,
1203
+ "loss": 0.2479,
1204
+ "step": 157
1205
+ },
1206
+ {
1207
+ "epoch": 0.19554455445544555,
1208
+ "grad_norm": 3.6721622943878174,
1209
+ "learning_rate": 1.9451506215072106e-05,
1210
+ "loss": 0.243,
1211
+ "step": 158
1212
+ },
1213
+ {
1214
+ "epoch": 0.1967821782178218,
1215
+ "grad_norm": 3.8687756061553955,
1216
+ "learning_rate": 1.943730310751453e-05,
1217
+ "loss": 0.2619,
1218
+ "step": 159
1219
+ },
1220
+ {
1221
+ "epoch": 0.19801980198019803,
1222
+ "grad_norm": 4.864063739776611,
1223
+ "learning_rate": 1.9422923770642494e-05,
1224
+ "loss": 0.2803,
1225
+ "step": 160
1226
+ },
1227
+ {
1228
+ "epoch": 0.19801980198019803,
1229
+ "eval_accuracy": 0.8414634146341463,
1230
+ "eval_f1": 0.6520681265206812,
1231
+ "eval_loss": 0.34682103991508484,
1232
+ "eval_precision": 0.8481012658227848,
1233
+ "eval_recall": 0.5296442687747036,
1234
+ "eval_runtime": 49.8936,
1235
+ "eval_samples_per_second": 5.532,
1236
+ "eval_steps_per_second": 0.18,
1237
+ "step": 160
1238
+ },
1239
+ {
1240
+ "epoch": 0.19925742574257427,
1241
+ "grad_norm": 3.036126136779785,
1242
+ "learning_rate": 1.9408368472971344e-05,
1243
+ "loss": 0.2777,
1244
+ "step": 161
1245
+ },
1246
+ {
1247
+ "epoch": 0.2004950495049505,
1248
+ "grad_norm": 3.19771409034729,
1249
+ "learning_rate": 1.9393637486302257e-05,
1250
+ "loss": 0.2741,
1251
+ "step": 162
1252
+ },
1253
+ {
1254
+ "epoch": 0.20173267326732675,
1255
+ "grad_norm": 4.557991027832031,
1256
+ "learning_rate": 1.937873108571718e-05,
1257
+ "loss": 0.2677,
1258
+ "step": 163
1259
+ },
1260
+ {
1261
+ "epoch": 0.20297029702970298,
1262
+ "grad_norm": 4.806491374969482,
1263
+ "learning_rate": 1.936364954957368e-05,
1264
+ "loss": 0.2728,
1265
+ "step": 164
1266
+ },
1267
+ {
1268
+ "epoch": 0.2042079207920792,
1269
+ "grad_norm": 5.901110649108887,
1270
+ "learning_rate": 1.934839315949976e-05,
1271
+ "loss": 0.2406,
1272
+ "step": 165
1273
+ },
1274
+ {
1275
+ "epoch": 0.20544554455445543,
1276
+ "grad_norm": 3.7812883853912354,
1277
+ "learning_rate": 1.933296220038858e-05,
1278
+ "loss": 0.2857,
1279
+ "step": 166
1280
+ },
1281
+ {
1282
+ "epoch": 0.20668316831683167,
1283
+ "grad_norm": 4.161533832550049,
1284
+ "learning_rate": 1.9317356960393158e-05,
1285
+ "loss": 0.2132,
1286
+ "step": 167
1287
+ },
1288
+ {
1289
+ "epoch": 0.2079207920792079,
1290
+ "grad_norm": 3.8676390647888184,
1291
+ "learning_rate": 1.9301577730920975e-05,
1292
+ "loss": 0.2486,
1293
+ "step": 168
1294
+ },
1295
+ {
1296
+ "epoch": 0.20915841584158415,
1297
+ "grad_norm": 4.488946437835693,
1298
+ "learning_rate": 1.9285624806628543e-05,
1299
+ "loss": 0.2859,
1300
+ "step": 169
1301
+ },
1302
+ {
1303
+ "epoch": 0.2103960396039604,
1304
+ "grad_norm": 3.541072130203247,
1305
+ "learning_rate": 1.9269498485415897e-05,
1306
+ "loss": 0.2522,
1307
+ "step": 170
1308
+ },
1309
+ {
1310
+ "epoch": 0.21163366336633663,
1311
+ "grad_norm": 3.683732509613037,
1312
+ "learning_rate": 1.925319906842103e-05,
1313
+ "loss": 0.223,
1314
+ "step": 171
1315
+ },
1316
+ {
1317
+ "epoch": 0.21287128712871287,
1318
+ "grad_norm": 3.875123977661133,
1319
+ "learning_rate": 1.923672686001427e-05,
1320
+ "loss": 0.2906,
1321
+ "step": 172
1322
+ },
1323
+ {
1324
+ "epoch": 0.2141089108910891,
1325
+ "grad_norm": 4.992143630981445,
1326
+ "learning_rate": 1.922008216779261e-05,
1327
+ "loss": 0.2183,
1328
+ "step": 173
1329
+ },
1330
+ {
1331
+ "epoch": 0.21534653465346534,
1332
+ "grad_norm": 5.165887355804443,
1333
+ "learning_rate": 1.920326530257394e-05,
1334
+ "loss": 0.2291,
1335
+ "step": 174
1336
+ },
1337
+ {
1338
+ "epoch": 0.21658415841584158,
1339
+ "grad_norm": 3.6516168117523193,
1340
+ "learning_rate": 1.9186276578391268e-05,
1341
+ "loss": 0.2092,
1342
+ "step": 175
1343
+ },
1344
+ {
1345
+ "epoch": 0.21782178217821782,
1346
+ "grad_norm": 3.7098777294158936,
1347
+ "learning_rate": 1.9169116312486835e-05,
1348
+ "loss": 0.2635,
1349
+ "step": 176
1350
+ },
1351
+ {
1352
+ "epoch": 0.21905940594059406,
1353
+ "grad_norm": 6.8240180015563965,
1354
+ "learning_rate": 1.9151784825306205e-05,
1355
+ "loss": 0.2545,
1356
+ "step": 177
1357
+ },
1358
+ {
1359
+ "epoch": 0.2202970297029703,
1360
+ "grad_norm": 4.409351348876953,
1361
+ "learning_rate": 1.9134282440492272e-05,
1362
+ "loss": 0.2505,
1363
+ "step": 178
1364
+ },
1365
+ {
1366
+ "epoch": 0.22153465346534654,
1367
+ "grad_norm": 3.2560315132141113,
1368
+ "learning_rate": 1.911660948487922e-05,
1369
+ "loss": 0.2857,
1370
+ "step": 179
1371
+ },
1372
+ {
1373
+ "epoch": 0.22277227722772278,
1374
+ "grad_norm": 5.461050987243652,
1375
+ "learning_rate": 1.9098766288486426e-05,
1376
+ "loss": 0.2782,
1377
+ "step": 180
1378
+ },
1379
+ {
1380
+ "epoch": 0.22277227722772278,
1381
+ "eval_accuracy": 0.8237250554323725,
1382
+ "eval_f1": 0.5974683544303797,
1383
+ "eval_loss": 0.34932276606559753,
1384
+ "eval_precision": 0.8309859154929577,
1385
+ "eval_recall": 0.466403162055336,
1386
+ "eval_runtime": 49.2509,
1387
+ "eval_samples_per_second": 5.604,
1388
+ "eval_steps_per_second": 0.183,
1389
+ "step": 180
1390
+ },
1391
+ {
1392
+ "epoch": 0.22400990099009901,
1393
+ "grad_norm": 3.929197072982788,
1394
+ "learning_rate": 1.9080753184512284e-05,
1395
+ "loss": 0.2682,
1396
+ "step": 181
1397
+ },
1398
+ {
1399
+ "epoch": 0.22524752475247525,
1400
+ "grad_norm": 4.4159393310546875,
1401
+ "learning_rate": 1.9062570509327993e-05,
1402
+ "loss": 0.2503,
1403
+ "step": 182
1404
+ },
1405
+ {
1406
+ "epoch": 0.2264851485148515,
1407
+ "grad_norm": 5.622183799743652,
1408
+ "learning_rate": 1.9044218602471275e-05,
1409
+ "loss": 0.3253,
1410
+ "step": 183
1411
+ },
1412
+ {
1413
+ "epoch": 0.22772277227722773,
1414
+ "grad_norm": 3.281792402267456,
1415
+ "learning_rate": 1.9025697806640035e-05,
1416
+ "loss": 0.2018,
1417
+ "step": 184
1418
+ },
1419
+ {
1420
+ "epoch": 0.22896039603960397,
1421
+ "grad_norm": 3.431208372116089,
1422
+ "learning_rate": 1.9007008467685947e-05,
1423
+ "loss": 0.2012,
1424
+ "step": 185
1425
+ },
1426
+ {
1427
+ "epoch": 0.2301980198019802,
1428
+ "grad_norm": 5.277952671051025,
1429
+ "learning_rate": 1.8988150934608014e-05,
1430
+ "loss": 0.2031,
1431
+ "step": 186
1432
+ },
1433
+ {
1434
+ "epoch": 0.23143564356435645,
1435
+ "grad_norm": 4.322801113128662,
1436
+ "learning_rate": 1.8969125559546054e-05,
1437
+ "loss": 0.2626,
1438
+ "step": 187
1439
+ },
1440
+ {
1441
+ "epoch": 0.23267326732673269,
1442
+ "grad_norm": 4.021146297454834,
1443
+ "learning_rate": 1.894993269777411e-05,
1444
+ "loss": 0.2343,
1445
+ "step": 188
1446
+ },
1447
+ {
1448
+ "epoch": 0.23391089108910892,
1449
+ "grad_norm": 3.045038938522339,
1450
+ "learning_rate": 1.893057270769381e-05,
1451
+ "loss": 0.1718,
1452
+ "step": 189
1453
+ },
1454
+ {
1455
+ "epoch": 0.23514851485148514,
1456
+ "grad_norm": 4.587369441986084,
1457
+ "learning_rate": 1.8911045950827693e-05,
1458
+ "loss": 0.2377,
1459
+ "step": 190
1460
+ },
1461
+ {
1462
+ "epoch": 0.23638613861386137,
1463
+ "grad_norm": 5.442078590393066,
1464
+ "learning_rate": 1.8891352791812452e-05,
1465
+ "loss": 0.2796,
1466
+ "step": 191
1467
+ },
1468
+ {
1469
+ "epoch": 0.2376237623762376,
1470
+ "grad_norm": 6.258726596832275,
1471
+ "learning_rate": 1.8871493598392122e-05,
1472
+ "loss": 0.2856,
1473
+ "step": 192
1474
+ },
1475
+ {
1476
+ "epoch": 0.23886138613861385,
1477
+ "grad_norm": 6.618675231933594,
1478
+ "learning_rate": 1.885146874141121e-05,
1479
+ "loss": 0.256,
1480
+ "step": 193
1481
+ },
1482
+ {
1483
+ "epoch": 0.2400990099009901,
1484
+ "grad_norm": 4.947834491729736,
1485
+ "learning_rate": 1.8831278594807783e-05,
1486
+ "loss": 0.2452,
1487
+ "step": 194
1488
+ },
1489
+ {
1490
+ "epoch": 0.24133663366336633,
1491
+ "grad_norm": 3.6348724365234375,
1492
+ "learning_rate": 1.881092353560646e-05,
1493
+ "loss": 0.2141,
1494
+ "step": 195
1495
+ },
1496
+ {
1497
+ "epoch": 0.24257425742574257,
1498
+ "grad_norm": 7.256039619445801,
1499
+ "learning_rate": 1.8790403943911403e-05,
1500
+ "loss": 0.2617,
1501
+ "step": 196
1502
+ },
1503
+ {
1504
+ "epoch": 0.2438118811881188,
1505
+ "grad_norm": 4.058467864990234,
1506
+ "learning_rate": 1.8769720202899196e-05,
1507
+ "loss": 0.2119,
1508
+ "step": 197
1509
+ },
1510
+ {
1511
+ "epoch": 0.24504950495049505,
1512
+ "grad_norm": 8.09382438659668,
1513
+ "learning_rate": 1.8748872698811695e-05,
1514
+ "loss": 0.2156,
1515
+ "step": 198
1516
+ },
1517
+ {
1518
+ "epoch": 0.24628712871287128,
1519
+ "grad_norm": 5.703820705413818,
1520
+ "learning_rate": 1.872786182094882e-05,
1521
+ "loss": 0.1883,
1522
+ "step": 199
1523
+ },
1524
+ {
1525
+ "epoch": 0.24752475247524752,
1526
+ "grad_norm": 6.104684352874756,
1527
+ "learning_rate": 1.870668796166129e-05,
1528
+ "loss": 0.2174,
1529
+ "step": 200
1530
+ },
1531
+ {
1532
+ "epoch": 0.24752475247524752,
1533
+ "eval_accuracy": 0.8492239467849224,
1534
+ "eval_f1": 0.6866359447004609,
1535
+ "eval_loss": 0.33290114998817444,
1536
+ "eval_precision": 0.8232044198895028,
1537
+ "eval_recall": 0.5889328063241107,
1538
+ "eval_runtime": 48.1855,
1539
+ "eval_samples_per_second": 5.728,
1540
+ "eval_steps_per_second": 0.187,
1541
+ "step": 200
1542
+ }
1543
+ ],
1544
+ "logging_steps": 1,
1545
+ "max_steps": 808,
1546
+ "num_input_tokens_seen": 0,
1547
+ "num_train_epochs": 1,
1548
+ "save_steps": 100,
1549
+ "stateful_callbacks": {
1550
+ "TrainerControl": {
1551
+ "args": {
1552
+ "should_epoch_stop": false,
1553
+ "should_evaluate": false,
1554
+ "should_log": false,
1555
+ "should_save": true,
1556
+ "should_training_stop": false
1557
+ },
1558
+ "attributes": {}
1559
+ }
1560
+ },
1561
+ "total_flos": 6.099415773216768e+16,
1562
+ "train_batch_size": 8,
1563
+ "trial_name": null,
1564
+ "trial_params": null
1565
+ }