sravanthib committed on
Commit
07d95cb
·
verified ·
1 Parent(s): 70d9f51

Training completed

Browse files
Files changed (4) hide show
  1. README.md +7 -9
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +345 -30
README.md CHANGED
@@ -1,20 +1,20 @@
1
  ---
2
  library_name: peft
3
- license: apache-2.0
4
- base_model: Qwen/Qwen2.5-7B-Instruct
5
  tags:
6
  - generated_from_trainer
7
  model-index:
8
- - name: refactored-code-llama-3-2-3b
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
- # refactored-code-llama-3-2-3b
16
 
17
- This model is a fine-tuned version of [Qwen/Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) on an unknown dataset.
18
 
19
  ## Model description
20
 
@@ -38,14 +38,12 @@ The following hyperparameters were used during training:
38
  - eval_batch_size: 8
39
  - seed: 42
40
  - distributed_type: multi-GPU
41
- - num_devices: 8
42
  - gradient_accumulation_steps: 10
43
- - total_train_batch_size: 160
44
- - total_eval_batch_size: 64
45
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
46
  - lr_scheduler_type: cosine
47
  - lr_scheduler_warmup_ratio: 0.05
48
- - training_steps: 50
49
 
50
  ### Training results
51
 
 
1
  ---
2
  library_name: peft
3
+ license: other
4
+ base_model: Qwen/Qwen2.5-3B
5
  tags:
6
  - generated_from_trainer
7
  model-index:
8
+ - name: single-node-single-gpu-qwen-custom-sft
9
  results: []
10
  ---
11
 
12
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
13
  should probably proofread and complete it, then remove this comment. -->
14
 
15
+ # single-node-single-gpu-qwen-custom-sft
16
 
17
+ This model is a fine-tuned version of [Qwen/Qwen2.5-3B](https://huggingface.co/Qwen/Qwen2.5-3B) on an unknown dataset.
18
 
19
  ## Model description
20
 
 
38
  - eval_batch_size: 8
39
  - seed: 42
40
  - distributed_type: multi-GPU
 
41
  - gradient_accumulation_steps: 10
42
+ - total_train_batch_size: 20
 
43
  - optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
44
  - lr_scheduler_type: cosine
45
  - lr_scheduler_warmup_ratio: 0.05
46
+ - training_steps: 500
47
 
48
  ### Training results
49
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 7.158730158730159,
3
- "total_flos": 6.287430713700516e+17,
4
- "train_loss": 0.912325621843338,
5
- "train_runtime": 619.2374,
6
- "train_samples_per_second": 12.919,
7
- "train_steps_per_second": 0.081
8
  }
 
1
  {
2
+ "epoch": 0.5,
3
+ "total_flos": 6.856066495152128e+17,
4
+ "train_loss": 0.07727908698283135,
5
+ "train_runtime": 9003.3191,
6
+ "train_samples_per_second": 1.111,
7
+ "train_steps_per_second": 0.056
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 7.158730158730159,
3
- "total_flos": 6.287430713700516e+17,
4
- "train_loss": 0.912325621843338,
5
- "train_runtime": 619.2374,
6
- "train_samples_per_second": 12.919,
7
- "train_steps_per_second": 0.081
8
  }
 
1
  {
2
+ "epoch": 0.5,
3
+ "total_flos": 6.856066495152128e+17,
4
+ "train_loss": 0.07727908698283135,
5
+ "train_runtime": 9003.3191,
6
+ "train_samples_per_second": 1.111,
7
+ "train_steps_per_second": 0.056
8
  }
trainer_state.json CHANGED
@@ -2,63 +2,378 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 7.158730158730159,
6
  "eval_steps": 0,
7
- "global_step": 50,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 1.4761904761904763,
14
- "grad_norm": 0.34064996242523193,
15
- "learning_rate": 0.0001,
16
- "loss": 4.3632,
17
  "step": 10
18
  },
19
  {
20
- "epoch": 2.9523809523809526,
21
- "grad_norm": 0.1575096845626831,
22
- "learning_rate": 0.0001,
23
- "loss": 0.0554,
24
  "step": 20
25
  },
26
  {
27
- "epoch": 4.317460317460317,
28
- "grad_norm": 0.16478388011455536,
29
  "learning_rate": 0.0001,
30
- "loss": 0.0546,
31
  "step": 30
32
  },
33
  {
34
- "epoch": 5.7936507936507935,
35
- "grad_norm": 0.140571728348732,
36
  "learning_rate": 0.0001,
37
- "loss": 0.0479,
38
  "step": 40
39
  },
40
  {
41
- "epoch": 7.158730158730159,
42
- "grad_norm": 0.1423598974943161,
43
  "learning_rate": 0.0001,
44
- "loss": 0.0405,
45
  "step": 50
46
  },
47
  {
48
- "epoch": 7.158730158730159,
49
- "step": 50,
50
- "total_flos": 6.287430713700516e+17,
51
- "train_loss": 0.912325621843338,
52
- "train_runtime": 619.2374,
53
- "train_samples_per_second": 12.919,
54
- "train_steps_per_second": 0.081
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  }
56
  ],
57
  "logging_steps": 10,
58
- "max_steps": 50,
59
  "num_input_tokens_seen": 0,
60
- "num_train_epochs": 9,
61
- "save_steps": 50,
62
  "stateful_callbacks": {
63
  "TrainerControl": {
64
  "args": {
@@ -71,7 +386,7 @@
71
  "attributes": {}
72
  }
73
  },
74
- "total_flos": 6.287430713700516e+17,
75
  "train_batch_size": 2,
76
  "trial_name": null,
77
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5,
6
  "eval_steps": 0,
7
+ "global_step": 500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.01,
14
+ "grad_norm": 6.734264373779297,
15
+ "learning_rate": 8.576691395183485e-05,
16
+ "loss": 3.5009,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.02,
21
+ "grad_norm": 0.29098451137542725,
22
+ "learning_rate": 9.653382790366966e-05,
23
+ "loss": 0.1027,
24
  "step": 20
25
  },
26
  {
27
+ "epoch": 0.03,
28
+ "grad_norm": 0.09549916535615921,
29
  "learning_rate": 0.0001,
30
+ "loss": 0.0191,
31
  "step": 30
32
  },
33
  {
34
+ "epoch": 0.04,
35
+ "grad_norm": 0.3484920263290405,
36
  "learning_rate": 0.0001,
37
+ "loss": 0.0222,
38
  "step": 40
39
  },
40
  {
41
+ "epoch": 0.05,
42
+ "grad_norm": 0.33001908659935,
43
  "learning_rate": 0.0001,
44
+ "loss": 0.021,
45
  "step": 50
46
  },
47
  {
48
+ "epoch": 0.06,
49
+ "grad_norm": 0.057511646300554276,
50
+ "learning_rate": 0.0001,
51
+ "loss": 0.0135,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.07,
56
+ "grad_norm": 0.05701196566224098,
57
+ "learning_rate": 0.0001,
58
+ "loss": 0.0117,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.08,
63
+ "grad_norm": 0.043988876044750214,
64
+ "learning_rate": 0.0001,
65
+ "loss": 0.0107,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.09,
70
+ "grad_norm": 0.03720390796661377,
71
+ "learning_rate": 0.0001,
72
+ "loss": 0.0098,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.1,
77
+ "grad_norm": 0.0470854677259922,
78
+ "learning_rate": 0.0001,
79
+ "loss": 0.0091,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.11,
84
+ "grad_norm": 0.035510435700416565,
85
+ "learning_rate": 0.0001,
86
+ "loss": 0.0075,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.12,
91
+ "grad_norm": 0.0346401073038578,
92
+ "learning_rate": 0.0001,
93
+ "loss": 0.0069,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.13,
98
+ "grad_norm": 0.0329650416970253,
99
+ "learning_rate": 0.0001,
100
+ "loss": 0.0064,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.14,
105
+ "grad_norm": 0.056529607623815536,
106
+ "learning_rate": 0.0001,
107
+ "loss": 0.0061,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.15,
112
+ "grad_norm": 0.049417588859796524,
113
+ "learning_rate": 0.0001,
114
+ "loss": 0.0058,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.16,
119
+ "grad_norm": 0.031275127083063126,
120
+ "learning_rate": 0.0001,
121
+ "loss": 0.0046,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.17,
126
+ "grad_norm": 0.026077693328261375,
127
+ "learning_rate": 0.0001,
128
+ "loss": 0.0043,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.18,
133
+ "grad_norm": 0.03110571764409542,
134
+ "learning_rate": 0.0001,
135
+ "loss": 0.0035,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.19,
140
+ "grad_norm": 0.0256363395601511,
141
+ "learning_rate": 0.0001,
142
+ "loss": 0.0039,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.2,
147
+ "grad_norm": 0.13061155378818512,
148
+ "learning_rate": 0.0001,
149
+ "loss": 0.0042,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.21,
154
+ "grad_norm": 0.022342098876833916,
155
+ "learning_rate": 0.0001,
156
+ "loss": 0.0029,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.22,
161
+ "grad_norm": 0.06658010929822922,
162
+ "learning_rate": 0.0001,
163
+ "loss": 0.0028,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.23,
168
+ "grad_norm": 0.02203432098031044,
169
+ "learning_rate": 0.0001,
170
+ "loss": 0.0028,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.24,
175
+ "grad_norm": 0.04879545792937279,
176
+ "learning_rate": 0.0001,
177
+ "loss": 0.0022,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.25,
182
+ "grad_norm": 0.044768281280994415,
183
+ "learning_rate": 0.0001,
184
+ "loss": 0.0026,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.26,
189
+ "grad_norm": 0.030401039868593216,
190
+ "learning_rate": 0.0001,
191
+ "loss": 0.0018,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.27,
196
+ "grad_norm": 0.10380243510007858,
197
+ "learning_rate": 0.0001,
198
+ "loss": 0.0015,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.28,
203
+ "grad_norm": 0.019732531160116196,
204
+ "learning_rate": 0.0001,
205
+ "loss": 0.0017,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.29,
210
+ "grad_norm": 0.015292245894670486,
211
+ "learning_rate": 0.0001,
212
+ "loss": 0.0019,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.3,
217
+ "grad_norm": 0.030675368383526802,
218
+ "learning_rate": 0.0001,
219
+ "loss": 0.0022,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.31,
224
+ "grad_norm": 0.029702844098210335,
225
+ "learning_rate": 0.0001,
226
+ "loss": 0.0014,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.32,
231
+ "grad_norm": 0.016342662274837494,
232
+ "learning_rate": 0.0001,
233
+ "loss": 0.0014,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.33,
238
+ "grad_norm": 0.013499235734343529,
239
+ "learning_rate": 0.0001,
240
+ "loss": 0.0013,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.34,
245
+ "grad_norm": 0.011413372121751308,
246
+ "learning_rate": 0.0001,
247
+ "loss": 0.0012,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.35,
252
+ "grad_norm": 0.09215894341468811,
253
+ "learning_rate": 0.0001,
254
+ "loss": 0.0073,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.36,
259
+ "grad_norm": 0.06609797477722168,
260
+ "learning_rate": 0.0001,
261
+ "loss": 0.0084,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.37,
266
+ "grad_norm": 0.03970978036522865,
267
+ "learning_rate": 0.0001,
268
+ "loss": 0.0075,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.38,
273
+ "grad_norm": 0.029625259339809418,
274
+ "learning_rate": 0.0001,
275
+ "loss": 0.0059,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.39,
280
+ "grad_norm": 0.02456456422805786,
281
+ "learning_rate": 0.0001,
282
+ "loss": 0.005,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.4,
287
+ "grad_norm": 0.03191933035850525,
288
+ "learning_rate": 0.0001,
289
+ "loss": 0.0045,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.41,
294
+ "grad_norm": 0.01918269693851471,
295
+ "learning_rate": 0.0001,
296
+ "loss": 0.0037,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.42,
301
+ "grad_norm": 0.018161766231060028,
302
+ "learning_rate": 0.0001,
303
+ "loss": 0.0031,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.43,
308
+ "grad_norm": 0.019575210288167,
309
+ "learning_rate": 0.0001,
310
+ "loss": 0.0026,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.44,
315
+ "grad_norm": 0.026317287236452103,
316
+ "learning_rate": 0.0001,
317
+ "loss": 0.0023,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.45,
322
+ "grad_norm": 0.040029872208833694,
323
+ "learning_rate": 0.0001,
324
+ "loss": 0.0026,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.46,
329
+ "grad_norm": 0.013975433073937893,
330
+ "learning_rate": 0.0001,
331
+ "loss": 0.0022,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.47,
336
+ "grad_norm": 0.03210354968905449,
337
+ "learning_rate": 0.0001,
338
+ "loss": 0.0017,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.48,
343
+ "grad_norm": 0.01889188587665558,
344
+ "learning_rate": 0.0001,
345
+ "loss": 0.0019,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.49,
350
+ "grad_norm": 0.013832672499120235,
351
+ "learning_rate": 0.0001,
352
+ "loss": 0.0016,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.5,
357
+ "grad_norm": 0.057756196707487106,
358
+ "learning_rate": 0.0001,
359
+ "loss": 0.0018,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.5,
364
+ "step": 500,
365
+ "total_flos": 6.856066495152128e+17,
366
+ "train_loss": 0.07727908698283135,
367
+ "train_runtime": 9003.3191,
368
+ "train_samples_per_second": 1.111,
369
+ "train_steps_per_second": 0.056
370
  }
371
  ],
372
  "logging_steps": 10,
373
+ "max_steps": 500,
374
  "num_input_tokens_seen": 0,
375
+ "num_train_epochs": 1,
376
+ "save_steps": 1000,
377
  "stateful_callbacks": {
378
  "TrainerControl": {
379
  "args": {
 
386
  "attributes": {}
387
  }
388
  },
389
+ "total_flos": 6.856066495152128e+17,
390
  "train_batch_size": 2,
391
  "trial_name": null,
392
  "trial_params": null