fengyao1909 committed on
Commit
9eebd67
·
verified ·
1 Parent(s): 79427b2

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -31,7 +31,7 @@
31
  "sliding_window": null,
32
  "tie_word_embeddings": false,
33
  "torch_dtype": "bfloat16",
34
- "transformers_version": "4.52.0.dev0",
35
  "use_cache": false,
36
  "use_sliding_window": false,
37
  "vocab_size": 151936
 
31
  "sliding_window": null,
32
  "tie_word_embeddings": false,
33
  "torch_dtype": "bfloat16",
34
+ "transformers_version": "4.51.3",
35
  "use_cache": false,
36
  "use_sliding_window": false,
37
  "vocab_size": 151936
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "bos_token_id": 151643,
3
  "eos_token_id": 151643,
4
  "max_new_tokens": 2048,
5
- "transformers_version": "4.52.0.dev0"
6
  }
 
2
  "bos_token_id": 151643,
3
  "eos_token_id": 151643,
4
  "max_new_tokens": 2048,
5
+ "transformers_version": "4.51.3"
6
  }
model-00001-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4ec39b8c542dec46c584428bcc41f239ddfa2d7f5f5e01282aa8fc17cfd43f7
3
  size 4997184968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2cb82b1e4227f90eb539df67a23906a0d386529bd90e4416518981a939d50df4
3
  size 4997184968
model-00002-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:888bc516d004ed68e4d137a0425fc721f903558076e2c5eb3091f98f13c2f7d7
3
  size 4997741608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:165c9b97ecf4122feed064367305d6b4ee747f28ff4fc7e8d94e393844c8fee0
3
  size 4997741608
model-00003-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9098519c5209091cdbf2ae6ad89e6f8623b2733f39415bb5e58e18a16c81182c
3
  size 4997742208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21c28227bd44c09f75a1cc23b6223b02c78147ebfe2c9d137f0f3f06257be640
3
  size 4997742208
model-00004-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:98285851600cd3b86069655c14976bdd8da7e969d489bdd46d0e0b37ba6c3ad0
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fce7c4b609b2199ba4bc2760712d703c3f2a4560ae5c3d32282c20d2abd1c67b
3
  size 4997743184
model-00005-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f05637820a3e97cbe0c767eb03e0f7d83cff36fd50d4c8bc584d936d7e068e3b
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:308943706770822f3f0e3c55998232a8c0632e9d10eda02c9106273735d1b726
3
  size 4997743184
model-00006-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9bc5f4b89fbc1cfde43a03bcbd497bc48c2e5664e8d5d0ff18b5f4f99968b881
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:adb6b63d7f05f576a681ed78e82162d3b8416dfbb288ae68e3f68b1dd6a8ea2f
3
  size 4997743184
model-00007-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0cdcf393112aee1f1b1979b9a8d317ecbd3f30ba04b45640981768ef8cc8641b
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6c55451276d19052507d488e74846ea84bd17a3ea982b7e1c8c5d4502b120fe
3
  size 4997743184
model-00008-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a0c698599c34c6ebca6bce94f7d2bf4b01ba907b12dc3687ce69cb08e302ed0
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:493720f40234c7d1c82dded0266793cc1d77bfe3d3cf196ba6d16dc197a4330a
3
  size 4997743184
model-00009-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9fc388c6012d6ddba57665bfe588a5c3791d0ed33a18117ef0e8fe13364a81bc
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24f9ebf5e271e1b068f8fbf4312a124a5855857286fd063e78a25c6c46f735d9
3
  size 4997743184
model-00010-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ab710fe0b2f24fffdb29679d2eb5d3224e762cb3615c74d37f405bb3c3a9b18b
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0fc67b140de70461152ffe0d1d4f39d610273d498a180520eb93519a9578a1f
3
  size 4997743184
model-00011-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12e18dc22e8b72e7a489b13911b19ccf42f66d360b823f5f92a005bdedcc827e
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c15508f97bd308053a4316b31a4eac5c24e60a9e1df4d762ebfaac10e9e0ed58
3
  size 4997743184
model-00012-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b94eec06606cc616588201353704c6b13b76af4a4c9631cbd86d6baa60107a32
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da7e4a9da619d34efd17daed024a177a1251ff995b46630f755b6699327d912d
3
  size 4997743184
model-00013-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:70e617bdbabf28f382f1b6c67e6031a9e42bed1fd93899be3c91232a9ebbd627
3
  size 1094220288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f27829fb905e82ca6602bfe1c29fed13cf8084e97dab950a5ccc8b35ef990aa
3
  size 1094220288
tokenizer_config.json CHANGED
@@ -227,6 +227,7 @@
227
  "<|video_pad|>"
228
  ],
229
  "bos_token": null,
 
230
  "clean_up_tokenization_spaces": false,
231
  "eos_token": "<|im_end|>",
232
  "errors": "replace",
 
227
  "<|video_pad|>"
228
  ],
229
  "bos_token": null,
230
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set content = message.content %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in message.content %}\n {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n {%- set reasoning_content = 
message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
231
  "clean_up_tokenization_spaces": false,
232
  "eos_token": "<|im_end|>",
233
  "errors": "replace",
trainer_state.json CHANGED
@@ -11,800 +11,800 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.017543859649122806,
14
- "grad_norm": 2.1737315354989266,
15
  "learning_rate": 0.0,
16
  "loss": 0.8483,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.03508771929824561,
21
- "grad_norm": 2.0657831287103607,
22
  "learning_rate": 3.3333333333333333e-06,
23
  "loss": 0.8763,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.05263157894736842,
28
- "grad_norm": 2.2705565927471225,
29
  "learning_rate": 6.666666666666667e-06,
30
- "loss": 1.019,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.07017543859649122,
35
- "grad_norm": 1.8847905520566954,
36
  "learning_rate": 1e-05,
37
- "loss": 1.0442,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.08771929824561403,
42
- "grad_norm": 1.6215868467951315,
43
  "learning_rate": 1.3333333333333333e-05,
44
- "loss": 0.8301,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.10526315789473684,
49
- "grad_norm": 1.3337654525321527,
50
  "learning_rate": 1.6666666666666667e-05,
51
- "loss": 0.967,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.12280701754385964,
56
- "grad_norm": 2.3177821608360234,
57
  "learning_rate": 2e-05,
58
- "loss": 0.9657,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.14035087719298245,
63
- "grad_norm": 1.9407807377668538,
64
  "learning_rate": 2.3333333333333336e-05,
65
- "loss": 0.9063,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.15789473684210525,
70
- "grad_norm": 2.333688544412328,
71
  "learning_rate": 2.6666666666666667e-05,
72
- "loss": 0.9327,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.17543859649122806,
77
- "grad_norm": 1.8911158364164133,
78
  "learning_rate": 3e-05,
79
- "loss": 0.9615,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.19298245614035087,
84
- "grad_norm": 1.559026746068459,
85
  "learning_rate": 3.3333333333333335e-05,
86
- "loss": 0.9983,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.21052631578947367,
91
- "grad_norm": 1.0012950206512936,
92
  "learning_rate": 3.6666666666666666e-05,
93
- "loss": 0.8713,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.22807017543859648,
98
- "grad_norm": 1.0896746007190306,
99
  "learning_rate": 4e-05,
100
- "loss": 0.9018,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.24561403508771928,
105
- "grad_norm": 1.1472331615890097,
106
  "learning_rate": 4.3333333333333334e-05,
107
- "loss": 0.8905,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.2631578947368421,
112
- "grad_norm": 1.0827434164812821,
113
  "learning_rate": 4.666666666666667e-05,
114
- "loss": 0.9487,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.2807017543859649,
119
- "grad_norm": 0.8407939026078357,
120
  "learning_rate": 5e-05,
121
- "loss": 0.9016,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.2982456140350877,
126
- "grad_norm": 1.256351670462119,
127
  "learning_rate": 4.999830770009406e-05,
128
- "loss": 1.0016,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.3157894736842105,
133
- "grad_norm": 1.010708781438488,
134
  "learning_rate": 4.9993231029486544e-05,
135
- "loss": 0.8827,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 0.3333333333333333,
140
- "grad_norm": 0.7503718658890081,
141
  "learning_rate": 4.99847706754774e-05,
142
- "loss": 0.9054,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 0.3508771929824561,
147
- "grad_norm": 0.8427918773123176,
148
  "learning_rate": 4.997292778346312e-05,
149
- "loss": 0.9468,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 0.3684210526315789,
154
- "grad_norm": 0.9103198542357233,
155
  "learning_rate": 4.995770395678171e-05,
156
- "loss": 0.8832,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 0.38596491228070173,
161
- "grad_norm": 0.743975757226857,
162
  "learning_rate": 4.993910125649561e-05,
163
- "loss": 0.7529,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 0.40350877192982454,
168
- "grad_norm": 0.8055578797506145,
169
  "learning_rate": 4.9917122201112656e-05,
170
- "loss": 0.889,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 0.42105263157894735,
175
- "grad_norm": 0.8546590892160185,
176
  "learning_rate": 4.989176976624511e-05,
177
- "loss": 0.7896,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 0.43859649122807015,
182
- "grad_norm": 0.6051439965679499,
183
  "learning_rate": 4.9863047384206835e-05,
184
- "loss": 0.9219,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 0.45614035087719296,
189
- "grad_norm": 0.7522440995111975,
190
  "learning_rate": 4.983095894354858e-05,
191
- "loss": 0.8917,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 0.47368421052631576,
196
- "grad_norm": 0.7072008972448397,
197
  "learning_rate": 4.979550878853154e-05,
198
- "loss": 0.8044,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 0.49122807017543857,
203
- "grad_norm": 0.5952822120665937,
204
  "learning_rate": 4.975670171853926e-05,
205
- "loss": 0.8763,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 0.5087719298245614,
210
- "grad_norm": 0.5803355862918257,
211
  "learning_rate": 4.971454298742779e-05,
212
- "loss": 0.8983,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 0.5263157894736842,
217
- "grad_norm": 0.666437418274057,
218
  "learning_rate": 4.966903830281449e-05,
219
- "loss": 0.7866,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 0.543859649122807,
224
- "grad_norm": 0.6919189164588267,
225
  "learning_rate": 4.962019382530521e-05,
226
- "loss": 0.8412,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 0.5614035087719298,
231
- "grad_norm": 0.5772289061045422,
232
  "learning_rate": 4.9568016167660334e-05,
233
- "loss": 0.7779,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 0.5789473684210527,
238
- "grad_norm": 0.6384456727598309,
239
  "learning_rate": 4.951251239389948e-05,
240
- "loss": 0.8722,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 0.5964912280701754,
245
- "grad_norm": 0.7003155529960633,
246
  "learning_rate": 4.9453690018345144e-05,
247
- "loss": 0.9772,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 0.6140350877192983,
252
- "grad_norm": 0.5892482774139678,
253
  "learning_rate": 4.939155700460536e-05,
254
- "loss": 0.8181,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 0.631578947368421,
259
- "grad_norm": 0.7141350627505272,
260
  "learning_rate": 4.9326121764495596e-05,
261
- "loss": 0.8379,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 0.6491228070175439,
266
- "grad_norm": 0.6685379755417092,
267
  "learning_rate": 4.925739315689991e-05,
268
- "loss": 0.876,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 0.6666666666666666,
273
- "grad_norm": 0.5887972405354546,
274
  "learning_rate": 4.9185380486571595e-05,
275
- "loss": 0.8834,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 0.6842105263157895,
280
- "grad_norm": 0.6547850047970274,
281
  "learning_rate": 4.9110093502873476e-05,
282
- "loss": 0.9068,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 0.7017543859649122,
287
- "grad_norm": 0.5095372843368391,
288
  "learning_rate": 4.9031542398457974e-05,
289
- "loss": 0.871,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 0.7192982456140351,
294
- "grad_norm": 0.5803092931334194,
295
  "learning_rate": 4.894973780788722e-05,
296
- "loss": 0.8804,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 0.7368421052631579,
301
- "grad_norm": 0.5157094405208372,
302
  "learning_rate": 4.88646908061933e-05,
303
- "loss": 0.8112,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 0.7543859649122807,
308
- "grad_norm": 0.5763799658576816,
309
  "learning_rate": 4.877641290737884e-05,
310
- "loss": 0.8485,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 0.7719298245614035,
315
- "grad_norm": 0.5206239155896253,
316
  "learning_rate": 4.868491606285823e-05,
317
- "loss": 0.7658,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 0.7894736842105263,
322
- "grad_norm": 0.5163119284367372,
323
  "learning_rate": 4.859021265983959e-05,
324
- "loss": 0.8806,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 0.8070175438596491,
329
- "grad_norm": 0.5555872744357858,
330
  "learning_rate": 4.849231551964771e-05,
331
- "loss": 0.8559,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 0.8245614035087719,
336
- "grad_norm": 0.670329020537153,
337
  "learning_rate": 4.839123789598829e-05,
338
- "loss": 0.8667,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 0.8421052631578947,
343
- "grad_norm": 0.5517318833150326,
344
  "learning_rate": 4.828699347315356e-05,
345
- "loss": 0.8424,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 0.8596491228070176,
350
- "grad_norm": 0.6399166130889407,
351
  "learning_rate": 4.817959636416969e-05,
352
- "loss": 0.9417,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 0.8771929824561403,
357
- "grad_norm": 0.460883037522045,
358
  "learning_rate": 4.806906110888606e-05,
359
- "loss": 0.6751,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 0.8947368421052632,
364
- "grad_norm": 0.6278603703151171,
365
  "learning_rate": 4.7955402672006854e-05,
366
- "loss": 0.9086,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 0.9122807017543859,
371
- "grad_norm": 0.5482062880943421,
372
  "learning_rate": 4.783863644106502e-05,
373
- "loss": 0.8832,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 0.9298245614035088,
378
- "grad_norm": 0.5538960900799985,
379
  "learning_rate": 4.771877822433911e-05,
380
- "loss": 0.8332,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 0.9473684210526315,
385
- "grad_norm": 0.5020357523197391,
386
  "learning_rate": 4.759584424871302e-05,
387
- "loss": 0.7833,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 0.9649122807017544,
392
- "grad_norm": 0.5299810912852558,
393
  "learning_rate": 4.7469851157479177e-05,
394
- "loss": 0.8219,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 0.9824561403508771,
399
- "grad_norm": 0.48572413576489937,
400
  "learning_rate": 4.734081600808531e-05,
401
- "loss": 0.8327,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 1.0,
406
- "grad_norm": 0.7694567676131596,
407
  "learning_rate": 4.7208756269825104e-05,
408
- "loss": 0.6932,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 1.0175438596491229,
413
- "grad_norm": 0.8079734171770722,
414
  "learning_rate": 4.707368982147318e-05,
415
- "loss": 0.7363,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 1.0350877192982457,
420
- "grad_norm": 0.647606443516366,
421
  "learning_rate": 4.693563494886455e-05,
422
- "loss": 0.628,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 1.0526315789473684,
427
- "grad_norm": 1.0392055903524327,
428
  "learning_rate": 4.679461034241906e-05,
429
- "loss": 0.7284,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 1.0701754385964912,
434
- "grad_norm": 0.8936107628035881,
435
  "learning_rate": 4.665063509461097e-05,
436
- "loss": 0.6199,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 1.087719298245614,
441
- "grad_norm": 0.8579847990954221,
442
  "learning_rate": 4.650372869738414e-05,
443
- "loss": 0.6542,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 1.1052631578947367,
448
- "grad_norm": 1.2135497198079113,
449
  "learning_rate": 4.6353911039513145e-05,
450
- "loss": 0.7282,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 1.1228070175438596,
455
- "grad_norm": 0.8104073129740915,
456
  "learning_rate": 4.620120240391065e-05,
457
- "loss": 0.6731,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 1.1403508771929824,
462
- "grad_norm": 0.6601313754612921,
463
  "learning_rate": 4.604562346488144e-05,
464
- "loss": 0.6216,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 1.1578947368421053,
469
- "grad_norm": 0.7521560385174539,
470
  "learning_rate": 4.588719528532342e-05,
471
- "loss": 0.6368,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 1.1754385964912282,
476
- "grad_norm": 0.6696118869453084,
477
  "learning_rate": 4.572593931387604e-05,
478
- "loss": 0.6243,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 1.1929824561403508,
483
- "grad_norm": 0.7323191679088321,
484
  "learning_rate": 4.556187738201656e-05,
485
- "loss": 0.6631,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 1.2105263157894737,
490
- "grad_norm": 0.7881911923428006,
491
  "learning_rate": 4.539503170110431e-05,
492
- "loss": 0.702,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 1.2280701754385965,
497
- "grad_norm": 0.5443171520898024,
498
  "learning_rate": 4.522542485937369e-05,
499
- "loss": 0.6556,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 1.2456140350877192,
504
- "grad_norm": 0.7465685103291306,
505
  "learning_rate": 4.50530798188761e-05,
506
- "loss": 0.7226,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 1.263157894736842,
511
- "grad_norm": 1.531802448485134,
512
  "learning_rate": 4.48780199123712e-05,
513
- "loss": 0.6821,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 1.280701754385965,
518
- "grad_norm": 0.624551106382425,
519
  "learning_rate": 4.4700268840168045e-05,
520
- "loss": 0.6704,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 1.2982456140350878,
525
- "grad_norm": 0.5646709447787752,
526
  "learning_rate": 4.4519850666916484e-05,
527
- "loss": 0.6031,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 1.3157894736842106,
532
- "grad_norm": 0.6162637626547116,
533
  "learning_rate": 4.43367898183491e-05,
534
- "loss": 0.7309,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 1.3333333333333333,
539
- "grad_norm": 0.484728084443174,
540
  "learning_rate": 4.415111107797445e-05,
541
- "loss": 0.6089,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 1.3508771929824561,
546
- "grad_norm": 0.5348661093893772,
547
  "learning_rate": 4.396283958372173e-05,
548
- "loss": 0.6013,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 1.368421052631579,
553
- "grad_norm": 0.6972365743462106,
554
  "learning_rate": 4.377200082453749e-05,
555
- "loss": 0.7103,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 1.3859649122807016,
560
- "grad_norm": 0.6386714887467159,
561
  "learning_rate": 4.357862063693486e-05,
562
- "loss": 0.6876,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 1.4035087719298245,
567
- "grad_norm": 0.6068938883006068,
568
  "learning_rate": 4.3382725201495723e-05,
569
- "loss": 0.6655,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 1.4210526315789473,
574
- "grad_norm": 0.5578279802500624,
575
  "learning_rate": 4.318434103932622e-05,
576
- "loss": 0.6688,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 1.4385964912280702,
581
- "grad_norm": 0.5450559810011418,
582
  "learning_rate": 4.2983495008466276e-05,
583
- "loss": 0.6545,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 1.456140350877193,
588
- "grad_norm": 0.549555813592061,
589
  "learning_rate": 4.278021430025343e-05,
590
- "loss": 0.5803,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 1.4736842105263157,
595
- "grad_norm": 0.44599243421537427,
596
  "learning_rate": 4.257452643564155e-05,
597
- "loss": 0.5056,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 1.4912280701754386,
602
- "grad_norm": 0.6512205627641299,
603
  "learning_rate": 4.2366459261474933e-05,
604
- "loss": 0.6704,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 1.5087719298245614,
609
- "grad_norm": 0.5031784031963774,
610
  "learning_rate": 4.215604094671835e-05,
611
- "loss": 0.6411,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 1.526315789473684,
616
- "grad_norm": 0.5371693274835859,
617
  "learning_rate": 4.194329997864331e-05,
618
- "loss": 0.7647,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 1.543859649122807,
623
- "grad_norm": 0.5952220323734707,
624
  "learning_rate": 4.172826515897146e-05,
625
- "loss": 0.6663,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 1.5614035087719298,
630
- "grad_norm": 0.5194062067573968,
631
  "learning_rate": 4.1510965599975196e-05,
632
- "loss": 0.7005,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 1.5789473684210527,
637
- "grad_norm": 0.5666524807032483,
638
  "learning_rate": 4.129143072053638e-05,
639
- "loss": 0.604,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 1.5964912280701755,
644
- "grad_norm": 0.5389713669315125,
645
  "learning_rate": 4.1069690242163484e-05,
646
- "loss": 0.6349,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 1.6140350877192984,
651
- "grad_norm": 0.5311149694642213,
652
  "learning_rate": 4.0845774184967754e-05,
653
- "loss": 0.652,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 1.631578947368421,
658
- "grad_norm": 0.5012325622032431,
659
  "learning_rate": 4.0619712863599e-05,
660
- "loss": 0.5946,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 1.6491228070175439,
665
- "grad_norm": 0.4814283230741368,
666
  "learning_rate": 4.039153688314145e-05,
667
- "loss": 0.6008,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 1.6666666666666665,
672
- "grad_norm": 0.5273571145503247,
673
  "learning_rate": 4.0161277134970345e-05,
674
- "loss": 0.6998,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 1.6842105263157894,
679
- "grad_norm": 0.6037954834513913,
680
  "learning_rate": 3.9928964792569655e-05,
681
- "loss": 0.6152,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 1.7017543859649122,
686
- "grad_norm": 0.5375890783535183,
687
  "learning_rate": 3.969463130731183e-05,
688
- "loss": 0.7677,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 1.719298245614035,
693
- "grad_norm": 0.6267427489443681,
694
  "learning_rate": 3.945830840419966e-05,
695
- "loss": 0.6621,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 1.736842105263158,
700
- "grad_norm": 0.5070654701641488,
701
  "learning_rate": 3.9220028077571295e-05,
702
- "loss": 0.679,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 1.7543859649122808,
707
- "grad_norm": 0.49591448977957925,
708
  "learning_rate": 3.897982258676867e-05,
709
- "loss": 0.6153,
710
  "step": 100
711
  },
712
  {
713
  "epoch": 1.7719298245614035,
714
- "grad_norm": 0.5563015062976071,
715
  "learning_rate": 3.873772445177015e-05,
716
- "loss": 0.7054,
717
  "step": 101
718
  },
719
  {
720
  "epoch": 1.7894736842105263,
721
- "grad_norm": 0.4874896431926999,
722
  "learning_rate": 3.8493766448787825e-05,
723
- "loss": 0.6942,
724
  "step": 102
725
  },
726
  {
727
  "epoch": 1.807017543859649,
728
- "grad_norm": 0.6557911386705954,
729
  "learning_rate": 3.824798160583012e-05,
730
- "loss": 0.695,
731
  "step": 103
732
  },
733
  {
734
  "epoch": 1.8245614035087718,
735
- "grad_norm": 0.5154404737450541,
736
  "learning_rate": 3.8000403198230387e-05,
737
- "loss": 0.5788,
738
  "step": 104
739
  },
740
  {
741
  "epoch": 1.8421052631578947,
742
- "grad_norm": 0.4665270737354506,
743
  "learning_rate": 3.775106474414188e-05,
744
- "loss": 0.5437,
745
  "step": 105
746
  },
747
  {
748
  "epoch": 1.8596491228070176,
749
- "grad_norm": 0.7148660834996573,
750
  "learning_rate": 3.7500000000000003e-05,
751
- "loss": 0.758,
752
  "step": 106
753
  },
754
  {
755
  "epoch": 1.8771929824561404,
756
- "grad_norm": 0.7335976271467938,
757
  "learning_rate": 3.7247242955952175e-05,
758
- "loss": 0.6691,
759
  "step": 107
760
  },
761
  {
762
  "epoch": 1.8947368421052633,
763
- "grad_norm": 0.5963174425512796,
764
  "learning_rate": 3.699282783125616e-05,
765
- "loss": 0.668,
766
  "step": 108
767
  },
768
  {
769
  "epoch": 1.912280701754386,
770
- "grad_norm": 0.5791219305747107,
771
  "learning_rate": 3.673678906964727e-05,
772
- "loss": 0.5826,
773
  "step": 109
774
  },
775
  {
776
  "epoch": 1.9298245614035088,
777
- "grad_norm": 0.5583303920187029,
778
  "learning_rate": 3.6479161334675296e-05,
779
- "loss": 0.7001,
780
  "step": 110
781
  },
782
  {
783
  "epoch": 1.9473684210526314,
784
- "grad_norm": 0.6110054211158291,
785
  "learning_rate": 3.621997950501156e-05,
786
- "loss": 0.7133,
787
  "step": 111
788
  },
789
  {
790
  "epoch": 1.9649122807017543,
791
- "grad_norm": 0.617058874054503,
792
  "learning_rate": 3.5959278669726935e-05,
793
- "loss": 0.6776,
794
  "step": 112
795
  },
796
  {
797
  "epoch": 1.9824561403508771,
798
- "grad_norm": 0.5818691639875067,
799
  "learning_rate": 3.569709412354136e-05,
800
- "loss": 0.7395,
801
  "step": 113
802
  },
803
  {
804
  "epoch": 2.0,
805
- "grad_norm": 0.7383826511530396,
806
  "learning_rate": 3.543346136204545e-05,
807
- "loss": 0.5666,
808
  "step": 114
809
  }
810
  ],
 
11
  "log_history": [
12
  {
13
  "epoch": 0.017543859649122806,
14
+ "grad_norm": 1.8317057887019643,
15
  "learning_rate": 0.0,
16
  "loss": 0.8483,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.03508771929824561,
21
+ "grad_norm": 1.7353440418305668,
22
  "learning_rate": 3.3333333333333333e-06,
23
  "loss": 0.8763,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.05263157894736842,
28
+ "grad_norm": 1.9110385217084043,
29
  "learning_rate": 6.666666666666667e-06,
30
+ "loss": 1.0195,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.07017543859649122,
35
+ "grad_norm": 1.650455460756132,
36
  "learning_rate": 1e-05,
37
+ "loss": 1.049,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.08771929824561403,
42
+ "grad_norm": 0.9846115915202341,
43
  "learning_rate": 1.3333333333333333e-05,
44
+ "loss": 0.8378,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.10526315789473684,
49
+ "grad_norm": 1.2896673412488742,
50
  "learning_rate": 1.6666666666666667e-05,
51
+ "loss": 0.9736,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.12280701754385964,
56
+ "grad_norm": 2.011543412600694,
57
  "learning_rate": 2e-05,
58
+ "loss": 0.9678,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.14035087719298245,
63
+ "grad_norm": 1.6250329547144573,
64
  "learning_rate": 2.3333333333333336e-05,
65
+ "loss": 0.9089,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.15789473684210525,
70
+ "grad_norm": 1.6751272065587126,
71
  "learning_rate": 2.6666666666666667e-05,
72
+ "loss": 0.9359,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.17543859649122806,
77
+ "grad_norm": 1.4234656524715992,
78
  "learning_rate": 3e-05,
79
+ "loss": 0.9585,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.19298245614035087,
84
+ "grad_norm": 1.4638280218342359,
85
  "learning_rate": 3.3333333333333335e-05,
86
+ "loss": 1.0041,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.21052631578947367,
91
+ "grad_norm": 1.0419301809036907,
92
  "learning_rate": 3.6666666666666666e-05,
93
+ "loss": 0.8763,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.22807017543859648,
98
+ "grad_norm": 1.198346639614849,
99
  "learning_rate": 4e-05,
100
+ "loss": 0.9129,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.24561403508771928,
105
+ "grad_norm": 1.0034373969603534,
106
  "learning_rate": 4.3333333333333334e-05,
107
+ "loss": 0.8965,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.2631578947368421,
112
+ "grad_norm": 1.0380657919294447,
113
  "learning_rate": 4.666666666666667e-05,
114
+ "loss": 0.957,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.2807017543859649,
119
+ "grad_norm": 0.9533939175115029,
120
  "learning_rate": 5e-05,
121
+ "loss": 0.91,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.2982456140350877,
126
+ "grad_norm": 1.2098882485474858,
127
  "learning_rate": 4.999830770009406e-05,
128
+ "loss": 1.0061,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.3157894736842105,
133
+ "grad_norm": 0.9892702554456709,
134
  "learning_rate": 4.9993231029486544e-05,
135
+ "loss": 0.8912,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 0.3333333333333333,
140
+ "grad_norm": 0.715562209343003,
141
  "learning_rate": 4.99847706754774e-05,
142
+ "loss": 0.9125,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 0.3508771929824561,
147
+ "grad_norm": 0.8839740963896582,
148
  "learning_rate": 4.997292778346312e-05,
149
+ "loss": 0.9544,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 0.3684210526315789,
154
+ "grad_norm": 0.8672090781221823,
155
  "learning_rate": 4.995770395678171e-05,
156
+ "loss": 0.8931,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 0.38596491228070173,
161
+ "grad_norm": 0.5657183889732781,
162
  "learning_rate": 4.993910125649561e-05,
163
+ "loss": 0.7591,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 0.40350877192982454,
168
+ "grad_norm": 0.8395194599838961,
169
  "learning_rate": 4.9917122201112656e-05,
170
+ "loss": 0.8958,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 0.42105263157894735,
175
+ "grad_norm": 0.7949156783532444,
176
  "learning_rate": 4.989176976624511e-05,
177
+ "loss": 0.7957,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 0.43859649122807015,
182
+ "grad_norm": 0.6233078811754251,
183
  "learning_rate": 4.9863047384206835e-05,
184
+ "loss": 0.9287,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 0.45614035087719296,
189
+ "grad_norm": 0.6592512357205284,
190
  "learning_rate": 4.983095894354858e-05,
191
+ "loss": 0.8961,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 0.47368421052631576,
196
+ "grad_norm": 0.7540428801849153,
197
  "learning_rate": 4.979550878853154e-05,
198
+ "loss": 0.8113,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 0.49122807017543857,
203
+ "grad_norm": 0.6894407810210612,
204
  "learning_rate": 4.975670171853926e-05,
205
+ "loss": 0.8837,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 0.5087719298245614,
210
+ "grad_norm": 0.5962438335142802,
211
  "learning_rate": 4.971454298742779e-05,
212
+ "loss": 0.9039,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 0.5263157894736842,
217
+ "grad_norm": 0.6159211013371156,
218
  "learning_rate": 4.966903830281449e-05,
219
+ "loss": 0.7894,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 0.543859649122807,
224
+ "grad_norm": 0.7929356664184409,
225
  "learning_rate": 4.962019382530521e-05,
226
+ "loss": 0.8463,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 0.5614035087719298,
231
+ "grad_norm": 0.5952896775612254,
232
  "learning_rate": 4.9568016167660334e-05,
233
+ "loss": 0.7816,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 0.5789473684210527,
238
+ "grad_norm": 0.6758915434113398,
239
  "learning_rate": 4.951251239389948e-05,
240
+ "loss": 0.8776,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 0.5964912280701754,
245
+ "grad_norm": 0.6318291971598253,
246
  "learning_rate": 4.9453690018345144e-05,
247
+ "loss": 0.9796,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 0.6140350877192983,
252
+ "grad_norm": 0.7115801773839961,
253
  "learning_rate": 4.939155700460536e-05,
254
+ "loss": 0.8228,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 0.631578947368421,
259
+ "grad_norm": 0.5864293536264706,
260
  "learning_rate": 4.9326121764495596e-05,
261
+ "loss": 0.839,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 0.6491228070175439,
266
+ "grad_norm": 0.6287778513114669,
267
  "learning_rate": 4.925739315689991e-05,
268
+ "loss": 0.8778,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 0.6666666666666666,
273
+ "grad_norm": 0.6735952098345708,
274
  "learning_rate": 4.9185380486571595e-05,
275
+ "loss": 0.885,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 0.6842105263157895,
280
+ "grad_norm": 0.6067535888888641,
281
  "learning_rate": 4.9110093502873476e-05,
282
+ "loss": 0.9082,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 0.7017543859649122,
287
+ "grad_norm": 0.5247508841910119,
288
  "learning_rate": 4.9031542398457974e-05,
289
+ "loss": 0.8739,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 0.7192982456140351,
294
+ "grad_norm": 0.5918453184670143,
295
  "learning_rate": 4.894973780788722e-05,
296
+ "loss": 0.8829,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 0.7368421052631579,
301
+ "grad_norm": 0.6372269077212606,
302
  "learning_rate": 4.88646908061933e-05,
303
+ "loss": 0.8152,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 0.7543859649122807,
308
+ "grad_norm": 0.5430663295315261,
309
  "learning_rate": 4.877641290737884e-05,
310
+ "loss": 0.8514,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 0.7719298245614035,
315
+ "grad_norm": 0.511597332007924,
316
  "learning_rate": 4.868491606285823e-05,
317
+ "loss": 0.7675,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 0.7894736842105263,
322
+ "grad_norm": 0.48967659833861754,
323
  "learning_rate": 4.859021265983959e-05,
324
+ "loss": 0.8826,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 0.8070175438596491,
329
+ "grad_norm": 0.5856811530030098,
330
  "learning_rate": 4.849231551964771e-05,
331
+ "loss": 0.8585,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 0.8245614035087719,
336
+ "grad_norm": 0.5845644414839125,
337
  "learning_rate": 4.839123789598829e-05,
338
+ "loss": 0.8677,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 0.8421052631578947,
343
+ "grad_norm": 0.4974332648938187,
344
  "learning_rate": 4.828699347315356e-05,
345
+ "loss": 0.845,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 0.8596491228070176,
350
+ "grad_norm": 0.5566174918152419,
351
  "learning_rate": 4.817959636416969e-05,
352
+ "loss": 0.9432,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 0.8771929824561403,
357
+ "grad_norm": 0.4921351014911049,
358
  "learning_rate": 4.806906110888606e-05,
359
+ "loss": 0.6756,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 0.8947368421052632,
364
+ "grad_norm": 0.6352002986003276,
365
  "learning_rate": 4.7955402672006854e-05,
366
+ "loss": 0.9077,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 0.9122807017543859,
371
+ "grad_norm": 0.5228777570085404,
372
  "learning_rate": 4.783863644106502e-05,
373
+ "loss": 0.8815,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 0.9298245614035088,
378
+ "grad_norm": 0.5753314530576694,
379
  "learning_rate": 4.771877822433911e-05,
380
+ "loss": 0.8342,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 0.9473684210526315,
385
+ "grad_norm": 0.5127814989448164,
386
  "learning_rate": 4.759584424871302e-05,
387
+ "loss": 0.7837,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 0.9649122807017544,
392
+ "grad_norm": 0.545931046230859,
393
  "learning_rate": 4.7469851157479177e-05,
394
+ "loss": 0.8229,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 0.9824561403508771,
399
+ "grad_norm": 0.4669374807712089,
400
  "learning_rate": 4.734081600808531e-05,
401
+ "loss": 0.8341,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 1.0,
406
+ "grad_norm": 0.5386650743226552,
407
  "learning_rate": 4.7208756269825104e-05,
408
+ "loss": 0.7062,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 1.0175438596491229,
413
+ "grad_norm": 0.6793013357179898,
414
  "learning_rate": 4.707368982147318e-05,
415
+ "loss": 0.7762,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 1.0350877192982457,
420
+ "grad_norm": 0.627446069759414,
421
  "learning_rate": 4.693563494886455e-05,
422
+ "loss": 0.6663,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 1.0526315789473684,
427
+ "grad_norm": 0.6603925054843989,
428
  "learning_rate": 4.679461034241906e-05,
429
+ "loss": 0.7633,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 1.0701754385964912,
434
+ "grad_norm": 0.8788262913431679,
435
  "learning_rate": 4.665063509461097e-05,
436
+ "loss": 0.6608,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 1.087719298245614,
441
+ "grad_norm": 0.6735739896088276,
442
  "learning_rate": 4.650372869738414e-05,
443
+ "loss": 0.7057,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 1.1052631578947367,
448
+ "grad_norm": 1.1748331139699308,
449
  "learning_rate": 4.6353911039513145e-05,
450
+ "loss": 0.7607,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 1.1228070175438596,
455
+ "grad_norm": 0.8665469540278155,
456
  "learning_rate": 4.620120240391065e-05,
457
+ "loss": 0.7229,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 1.1403508771929824,
462
+ "grad_norm": 0.6395824700230515,
463
  "learning_rate": 4.604562346488144e-05,
464
+ "loss": 0.6613,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 1.1578947368421053,
469
+ "grad_norm": 0.7075740149066272,
470
  "learning_rate": 4.588719528532342e-05,
471
+ "loss": 0.6818,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 1.1754385964912282,
476
+ "grad_norm": 0.6512740767179199,
477
  "learning_rate": 4.572593931387604e-05,
478
+ "loss": 0.6719,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 1.1929824561403508,
483
+ "grad_norm": 0.7391191118171585,
484
  "learning_rate": 4.556187738201656e-05,
485
+ "loss": 0.716,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 1.2105263157894737,
490
+ "grad_norm": 0.6544655450388867,
491
  "learning_rate": 4.539503170110431e-05,
492
+ "loss": 0.7671,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 1.2280701754385965,
497
+ "grad_norm": 0.4890450142415007,
498
  "learning_rate": 4.522542485937369e-05,
499
+ "loss": 0.7089,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 1.2456140350877192,
504
+ "grad_norm": 0.6629174444368172,
505
  "learning_rate": 4.50530798188761e-05,
506
+ "loss": 0.7644,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 1.263157894736842,
511
+ "grad_norm": 1.313611758541734,
512
  "learning_rate": 4.48780199123712e-05,
513
+ "loss": 0.7247,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 1.280701754385965,
518
+ "grad_norm": 0.621594494051855,
519
  "learning_rate": 4.4700268840168045e-05,
520
+ "loss": 0.7177,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 1.2982456140350878,
525
+ "grad_norm": 0.543283643604564,
526
  "learning_rate": 4.4519850666916484e-05,
527
+ "loss": 0.6484,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 1.3157894736842106,
532
+ "grad_norm": 0.5692124563338323,
533
  "learning_rate": 4.43367898183491e-05,
534
+ "loss": 0.7805,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 1.3333333333333333,
539
+ "grad_norm": 0.5140372497115965,
540
  "learning_rate": 4.415111107797445e-05,
541
+ "loss": 0.6551,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 1.3508771929824561,
546
+ "grad_norm": 0.5556993211984754,
547
  "learning_rate": 4.396283958372173e-05,
548
+ "loss": 0.6435,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 1.368421052631579,
553
+ "grad_norm": 0.6297070634295856,
554
  "learning_rate": 4.377200082453749e-05,
555
+ "loss": 0.7551,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 1.3859649122807016,
560
+ "grad_norm": 0.5343607416783988,
561
  "learning_rate": 4.357862063693486e-05,
562
+ "loss": 0.7401,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 1.4035087719298245,
567
+ "grad_norm": 0.6933983776921214,
568
  "learning_rate": 4.3382725201495723e-05,
569
+ "loss": 0.7092,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 1.4210526315789473,
574
+ "grad_norm": 0.48867547087530555,
575
  "learning_rate": 4.318434103932622e-05,
576
+ "loss": 0.7157,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 1.4385964912280702,
581
+ "grad_norm": 0.5074429307644182,
582
  "learning_rate": 4.2983495008466276e-05,
583
+ "loss": 0.6988,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 1.456140350877193,
588
+ "grad_norm": 0.546003726314663,
589
  "learning_rate": 4.278021430025343e-05,
590
+ "loss": 0.6233,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 1.4736842105263157,
595
+ "grad_norm": 0.46092178005444556,
596
  "learning_rate": 4.257452643564155e-05,
597
+ "loss": 0.5562,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 1.4912280701754386,
602
+ "grad_norm": 0.5753767938443312,
603
  "learning_rate": 4.2366459261474933e-05,
604
+ "loss": 0.7155,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 1.5087719298245614,
609
+ "grad_norm": 0.5118274305991928,
610
  "learning_rate": 4.215604094671835e-05,
611
+ "loss": 0.6921,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 1.526315789473684,
616
+ "grad_norm": 0.6380979260118252,
617
  "learning_rate": 4.194329997864331e-05,
618
+ "loss": 0.8256,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 1.543859649122807,
623
+ "grad_norm": 0.5435984291436712,
624
  "learning_rate": 4.172826515897146e-05,
625
+ "loss": 0.7167,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 1.5614035087719298,
630
+ "grad_norm": 0.4654905831182726,
631
  "learning_rate": 4.1510965599975196e-05,
632
+ "loss": 0.7498,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 1.5789473684210527,
637
+ "grad_norm": 0.5655561181667187,
638
  "learning_rate": 4.129143072053638e-05,
639
+ "loss": 0.6566,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 1.5964912280701755,
644
+ "grad_norm": 0.46666240125611763,
645
  "learning_rate": 4.1069690242163484e-05,
646
+ "loss": 0.6798,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 1.6140350877192984,
651
+ "grad_norm": 0.45866681537946846,
652
  "learning_rate": 4.0845774184967754e-05,
653
+ "loss": 0.702,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 1.631578947368421,
658
+ "grad_norm": 0.41578111266812384,
659
  "learning_rate": 4.0619712863599e-05,
660
+ "loss": 0.6448,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 1.6491228070175439,
665
+ "grad_norm": 0.37343214189013274,
666
  "learning_rate": 4.039153688314145e-05,
667
+ "loss": 0.6486,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 1.6666666666666665,
672
+ "grad_norm": 0.4534534413589986,
673
  "learning_rate": 4.0161277134970345e-05,
674
+ "loss": 0.7514,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 1.6842105263157894,
679
+ "grad_norm": 0.5086874484641599,
680
  "learning_rate": 3.9928964792569655e-05,
681
+ "loss": 0.6561,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 1.7017543859649122,
686
+ "grad_norm": 0.5099982692153668,
687
  "learning_rate": 3.969463130731183e-05,
688
+ "loss": 0.818,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 1.719298245614035,
693
+ "grad_norm": 0.5239173636107562,
694
  "learning_rate": 3.945830840419966e-05,
695
+ "loss": 0.707,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 1.736842105263158,
700
+ "grad_norm": 0.5057288153759746,
701
  "learning_rate": 3.9220028077571295e-05,
702
+ "loss": 0.7297,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 1.7543859649122808,
707
+ "grad_norm": 0.4578235248160848,
708
  "learning_rate": 3.897982258676867e-05,
709
+ "loss": 0.6636,
710
  "step": 100
711
  },
712
  {
713
  "epoch": 1.7719298245614035,
714
+ "grad_norm": 0.4741202765186654,
715
  "learning_rate": 3.873772445177015e-05,
716
+ "loss": 0.7528,
717
  "step": 101
718
  },
719
  {
720
  "epoch": 1.7894736842105263,
721
+ "grad_norm": 0.4039174802249478,
722
  "learning_rate": 3.8493766448787825e-05,
723
+ "loss": 0.7435,
724
  "step": 102
725
  },
726
  {
727
  "epoch": 1.807017543859649,
728
+ "grad_norm": 0.589597565267169,
729
  "learning_rate": 3.824798160583012e-05,
730
+ "loss": 0.7506,
731
  "step": 103
732
  },
733
  {
734
  "epoch": 1.8245614035087718,
735
+ "grad_norm": 0.44486881957400587,
736
  "learning_rate": 3.8000403198230387e-05,
737
+ "loss": 0.6197,
738
  "step": 104
739
  },
740
  {
741
  "epoch": 1.8421052631578947,
742
+ "grad_norm": 0.4171720478200468,
743
  "learning_rate": 3.775106474414188e-05,
744
+ "loss": 0.5865,
745
  "step": 105
746
  },
747
  {
748
  "epoch": 1.8596491228070176,
749
+ "grad_norm": 0.6688671369020073,
750
  "learning_rate": 3.7500000000000003e-05,
751
+ "loss": 0.8046,
752
  "step": 106
753
  },
754
  {
755
  "epoch": 1.8771929824561404,
756
+ "grad_norm": 0.594947999839499,
757
  "learning_rate": 3.7247242955952175e-05,
758
+ "loss": 0.7218,
759
  "step": 107
760
  },
761
  {
762
  "epoch": 1.8947368421052633,
763
+ "grad_norm": 0.4541655734242718,
764
  "learning_rate": 3.699282783125616e-05,
765
+ "loss": 0.7191,
766
  "step": 108
767
  },
768
  {
769
  "epoch": 1.912280701754386,
770
+ "grad_norm": 0.44254870382471806,
771
  "learning_rate": 3.673678906964727e-05,
772
+ "loss": 0.6318,
773
  "step": 109
774
  },
775
  {
776
  "epoch": 1.9298245614035088,
777
+ "grad_norm": 0.5177824211881119,
778
  "learning_rate": 3.6479161334675296e-05,
779
+ "loss": 0.7521,
780
  "step": 110
781
  },
782
  {
783
  "epoch": 1.9473684210526314,
784
+ "grad_norm": 0.5109935026690099,
785
  "learning_rate": 3.621997950501156e-05,
786
+ "loss": 0.7555,
787
  "step": 111
788
  },
789
  {
790
  "epoch": 1.9649122807017543,
791
+ "grad_norm": 0.6025137993464594,
792
  "learning_rate": 3.5959278669726935e-05,
793
+ "loss": 0.7303,
794
  "step": 112
795
  },
796
  {
797
  "epoch": 1.9824561403508771,
798
+ "grad_norm": 0.45225891946430735,
799
  "learning_rate": 3.569709412354136e-05,
800
+ "loss": 0.7882,
801
  "step": 113
802
  },
803
  {
804
  "epoch": 2.0,
805
+ "grad_norm": 0.613838203258988,
806
  "learning_rate": 3.543346136204545e-05,
807
+ "loss": 0.6483,
808
  "step": 114
809
  }
810
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d83b4df5d706e4c8bc431f088d2170998a3a8eafe51554ccb621ba77c95afe02
3
- size 8017
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:14ce14cb5b4d1f6b7378f3a0d59f90353c316de79d943a6fb969a0b7df2b441c
3
+ size 8081