jialicheng committed on
Commit
99d9323
·
verified ·
1 Parent(s): a9979c8

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -6,19 +6,19 @@ tags:
6
  metrics:
7
  - accuracy
8
  model-index:
9
- - name: ddi_42
10
  results: []
11
  ---
12
 
13
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
  should probably proofread and complete it, then remove this comment. -->
15
 
16
- # ddi_42
17
 
18
  This model is a fine-tuned version of [microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract](https://huggingface.co/microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 0.2085
21
- - Accuracy: 0.9551
22
 
23
  ## Model description
24
 
@@ -43,22 +43,32 @@ The following hyperparameters were used during training:
43
  - seed: 42
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: linear
46
- - num_epochs: 10
47
 
48
  ### Training results
49
 
50
- | Training Loss | Epoch | Step | Validation Loss | Accuracy |
51
- |:-------------:|:-----:|:----:|:---------------:|:--------:|
52
- | No log | 1.0 | 791 | 0.1986 | 0.9383 |
53
- | 0.1723 | 2.0 | 1582 | 0.2700 | 0.9455 |
54
- | 0.0772 | 3.0 | 2373 | 0.2085 | 0.9551 |
55
- | 0.0516 | 4.0 | 3164 | 0.2970 | 0.9427 |
56
- | 0.0516 | 5.0 | 3955 | 0.2620 | 0.9539 |
57
- | 0.0341 | 6.0 | 4746 | 0.3973 | 0.9423 |
58
- | 0.0203 | 7.0 | 5537 | 0.3637 | 0.9423 |
59
- | 0.0146 | 8.0 | 6328 | 0.4154 | 0.9451 |
60
- | 0.007 | 9.0 | 7119 | 0.4219 | 0.9463 |
61
- | 0.007 | 10.0 | 7910 | 0.4098 | 0.9447 |
 
 
 
 
 
 
 
 
 
 
62
 
63
 
64
  ### Framework versions
 
6
  metrics:
7
  - accuracy
8
  model-index:
9
+ - name: pubmedbert-abstract
10
  results: []
11
  ---
12
 
13
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
14
  should probably proofread and complete it, then remove this comment. -->
15
 
16
+ # pubmedbert-abstract
17
 
18
  This model is a fine-tuned version of [microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract](https://huggingface.co/microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract) on the None dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 0.4618
21
+ - Accuracy: 0.9501
22
 
23
  ## Model description
24
 
 
43
  - seed: 42
44
  - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
45
  - lr_scheduler_type: linear
46
+ - num_epochs: 20
47
 
48
  ### Training results
49
 
50
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
51
+ |:-------------:|:-----:|:-----:|:---------------:|:--------:|
52
+ | No log | 1.0 | 791 | 0.2340 | 0.9386 |
53
+ | 0.1776 | 2.0 | 1582 | 0.2716 | 0.9419 |
54
+ | 0.0855 | 3.0 | 2373 | 0.2730 | 0.9431 |
55
+ | 0.0627 | 4.0 | 3164 | 0.3323 | 0.9382 |
56
+ | 0.0627 | 5.0 | 3955 | 0.3308 | 0.9451 |
57
+ | 0.0463 | 6.0 | 4746 | 0.3986 | 0.9412 |
58
+ | 0.0308 | 7.0 | 5537 | 0.4211 | 0.9419 |
59
+ | 0.0312 | 8.0 | 6328 | 0.3616 | 0.9437 |
60
+ | 0.0221 | 9.0 | 7119 | 0.4310 | 0.9396 |
61
+ | 0.0221 | 10.0 | 7910 | 0.4222 | 0.9438 |
62
+ | 0.0181 | 11.0 | 8701 | 0.4185 | 0.9445 |
63
+ | 0.0141 | 12.0 | 9492 | 0.4678 | 0.9456 |
64
+ | 0.0133 | 13.0 | 10283 | 0.4027 | 0.9503 |
65
+ | 0.0082 | 14.0 | 11074 | 0.4504 | 0.9473 |
66
+ | 0.0082 | 15.0 | 11865 | 0.4760 | 0.9505 |
67
+ | 0.0052 | 16.0 | 12656 | 0.4573 | 0.9449 |
68
+ | 0.0042 | 17.0 | 13447 | 0.4356 | 0.9522 |
69
+ | 0.0037 | 18.0 | 14238 | 0.4577 | 0.9487 |
70
+ | 0.0024 | 19.0 | 15029 | 0.4642 | 0.9493 |
71
+ | 0.0024 | 20.0 | 15820 | 0.4618 | 0.9501 |
72
 
73
 
74
  ### Framework versions
all_results.json CHANGED
@@ -1,20 +1,17 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.9551282051282052,
4
- "eval_loss": 0.20845328271389008,
5
- "eval_runtime": 4.9096,
6
- "eval_samples": 25296,
7
- "eval_samples_per_second": 508.388,
8
- "eval_steps_per_second": 2.037,
9
- "test_accuracy": 0.9391182645206438,
10
- "test_loss": 0.3346391022205353,
11
- "test_runtime": 9.9095,
12
- "test_samples_per_second": 576.823,
13
- "test_steps_per_second": 2.321,
14
- "train_accuracy": 0.9922912713472486,
15
- "train_loss": 0.03253408521413803,
16
- "train_runtime": 40.6672,
17
- "train_samples": 25296,
18
- "train_samples_per_second": 622.025,
19
- "train_steps_per_second": 2.434
20
  }
 
1
  {
2
+ "dr_accuracy": 0.9990512333965844,
3
+ "dr_loss": 0.003322172211483121,
4
+ "dr_runtime": 38.9939,
5
+ "dr_samples_per_second": 648.718,
6
+ "dr_steps_per_second": 2.539,
7
+ "epoch": 20.0,
8
+ "test_accuracy": 0.9522393282015396,
9
+ "test_loss": 0.43564239144325256,
10
+ "test_runtime": 9.413,
11
+ "test_samples_per_second": 607.247,
12
+ "test_steps_per_second": 2.443,
13
+ "train_loss": 0.03330213655864846,
14
+ "train_runtime": 3337.005,
15
+ "train_samples_per_second": 151.609,
16
+ "train_steps_per_second": 4.741
 
 
 
17
  }
dr_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dr_accuracy": 0.9990512333965844,
3
+ "dr_loss": 0.003322172211483121,
4
+ "dr_runtime": 38.9939,
5
+ "dr_samples_per_second": 648.718,
6
+ "dr_steps_per_second": 2.539,
7
+ "epoch": 20.0
8
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8057e2d26cccbc75703b20f2c650e0332ed275e6f805e991e284c0e4044a8905
3
  size 437967876
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8177fdfdfd82ee15073e43ce61ec4066744df2c00a56772f510f614e05b00a41
3
  size 437967876
test_results.json CHANGED
@@ -1,9 +1,8 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_samples": 5716,
4
- "test_accuracy": 0.9391182645206438,
5
- "test_loss": 0.3346391022205353,
6
- "test_runtime": 9.9095,
7
- "test_samples_per_second": 576.823,
8
- "test_steps_per_second": 2.321
9
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "test_accuracy": 0.9522393282015396,
4
+ "test_loss": 0.43564239144325256,
5
+ "test_runtime": 9.413,
6
+ "test_samples_per_second": 607.247,
7
+ "test_steps_per_second": 2.443
 
8
  }
train_results.json CHANGED
@@ -1,9 +1,7 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_samples": 25296,
4
- "train_accuracy": 0.9922912713472486,
5
- "train_loss": 0.03253408521413803,
6
- "train_runtime": 40.6672,
7
- "train_samples_per_second": 622.025,
8
- "train_steps_per_second": 2.434
9
  }
 
1
  {
2
+ "epoch": 20.0,
3
+ "train_loss": 0.03330213655864846,
4
+ "train_runtime": 3337.005,
5
+ "train_samples_per_second": 151.609,
6
+ "train_steps_per_second": 4.741
 
 
7
  }
trainer_state.json CHANGED
@@ -1,168 +1,314 @@
1
  {
2
- "best_metric": 0.9551282051282052,
3
- "best_model_checkpoint": "text/train/checkpoint/pubmedbert-abstract/ddi_42/checkpoint-2373",
4
- "epoch": 10.0,
5
  "eval_steps": 500,
6
- "global_step": 7910,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.938301282051282,
14
- "eval_loss": 0.19862917065620422,
15
- "eval_runtime": 5.4522,
16
- "eval_samples_per_second": 457.799,
17
- "eval_steps_per_second": 1.834,
18
  "step": 791
19
  },
20
  {
21
  "epoch": 1.26,
22
- "grad_norm": 1.910953402519226,
23
- "learning_rate": 4.367888748419722e-05,
24
- "loss": 0.1723,
25
  "step": 1000
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.9455128205128205,
30
- "eval_loss": 0.2700441777706146,
31
- "eval_runtime": 5.0135,
32
- "eval_samples_per_second": 497.859,
33
- "eval_steps_per_second": 1.995,
34
  "step": 1582
35
  },
36
  {
37
  "epoch": 2.53,
38
- "grad_norm": 19.194658279418945,
39
- "learning_rate": 3.735777496839444e-05,
40
- "loss": 0.0772,
41
  "step": 2000
42
  },
43
  {
44
  "epoch": 3.0,
45
- "eval_accuracy": 0.9551282051282052,
46
- "eval_loss": 0.20845328271389008,
47
- "eval_runtime": 4.8197,
48
- "eval_samples_per_second": 517.874,
49
- "eval_steps_per_second": 2.075,
50
  "step": 2373
51
  },
52
  {
53
  "epoch": 3.79,
54
- "grad_norm": 0.3629066050052643,
55
- "learning_rate": 3.1036662452591655e-05,
56
- "loss": 0.0516,
57
  "step": 3000
58
  },
59
  {
60
  "epoch": 4.0,
61
- "eval_accuracy": 0.9427083333333334,
62
- "eval_loss": 0.2969796061515808,
63
- "eval_runtime": 4.7872,
64
- "eval_samples_per_second": 521.395,
65
- "eval_steps_per_second": 2.089,
66
  "step": 3164
67
  },
68
  {
69
  "epoch": 5.0,
70
- "eval_accuracy": 0.953926282051282,
71
- "eval_loss": 0.2620067596435547,
72
- "eval_runtime": 4.7806,
73
- "eval_samples_per_second": 522.107,
74
- "eval_steps_per_second": 2.092,
75
  "step": 3955
76
  },
77
  {
78
  "epoch": 5.06,
79
- "grad_norm": 0.0031597923953086138,
80
- "learning_rate": 2.4715549936788876e-05,
81
- "loss": 0.0341,
82
  "step": 4000
83
  },
84
  {
85
  "epoch": 6.0,
86
- "eval_accuracy": 0.9423076923076923,
87
- "eval_loss": 0.39730244874954224,
88
- "eval_runtime": 4.7836,
89
- "eval_samples_per_second": 521.783,
90
- "eval_steps_per_second": 2.09,
91
  "step": 4746
92
  },
93
  {
94
  "epoch": 6.32,
95
- "grad_norm": 0.006749527528882027,
96
- "learning_rate": 1.8394437420986094e-05,
97
- "loss": 0.0203,
98
  "step": 5000
99
  },
100
  {
101
  "epoch": 7.0,
102
- "eval_accuracy": 0.9423076923076923,
103
- "eval_loss": 0.3636617958545685,
104
- "eval_runtime": 4.7835,
105
- "eval_samples_per_second": 521.797,
106
- "eval_steps_per_second": 2.091,
107
  "step": 5537
108
  },
109
  {
110
  "epoch": 7.59,
111
- "grad_norm": 0.012521493248641491,
112
- "learning_rate": 1.2073324905183313e-05,
113
- "loss": 0.0146,
114
  "step": 6000
115
  },
116
  {
117
  "epoch": 8.0,
118
- "eval_accuracy": 0.9451121794871795,
119
- "eval_loss": 0.4154008626937866,
120
- "eval_runtime": 4.7634,
121
- "eval_samples_per_second": 523.998,
122
- "eval_steps_per_second": 2.099,
123
  "step": 6328
124
  },
125
  {
126
  "epoch": 8.85,
127
- "grad_norm": 0.005341885611414909,
128
- "learning_rate": 5.752212389380531e-06,
129
- "loss": 0.007,
130
  "step": 7000
131
  },
132
  {
133
  "epoch": 9.0,
134
- "eval_accuracy": 0.9463141025641025,
135
- "eval_loss": 0.4218637943267822,
136
- "eval_runtime": 4.8145,
137
- "eval_samples_per_second": 518.431,
138
- "eval_steps_per_second": 2.077,
139
  "step": 7119
140
  },
141
  {
142
  "epoch": 10.0,
143
- "eval_accuracy": 0.9447115384615384,
144
- "eval_loss": 0.4098478853702545,
145
- "eval_runtime": 4.8332,
146
- "eval_samples_per_second": 516.428,
147
- "eval_steps_per_second": 2.069,
148
  "step": 7910
149
  },
150
  {
151
- "epoch": 10.0,
152
- "step": 7910,
153
- "total_flos": 1.663959132942336e+16,
154
- "train_loss": 0.048086456131241864,
155
- "train_runtime": 1425.5671,
156
- "train_samples_per_second": 177.445,
157
- "train_steps_per_second": 5.549
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  }
159
  ],
160
  "logging_steps": 1000,
161
- "max_steps": 7910,
162
  "num_input_tokens_seen": 0,
163
- "num_train_epochs": 10,
164
  "save_steps": 500,
165
- "total_flos": 1.663959132942336e+16,
166
  "train_batch_size": 32,
167
  "trial_name": null,
168
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.9522393282015396,
3
+ "best_model_checkpoint": "../../checkpoint/ddi/pubmedbert-abstract/checkpoint-13447",
4
+ "epoch": 20.0,
5
  "eval_steps": 500,
6
+ "global_step": 15820,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.938593421973408,
14
+ "eval_loss": 0.23401664197444916,
15
+ "eval_runtime": 9.3525,
16
+ "eval_samples_per_second": 611.174,
17
+ "eval_steps_per_second": 2.459,
18
  "step": 791
19
  },
20
  {
21
  "epoch": 1.26,
22
+ "grad_norm": 22.48932647705078,
23
+ "learning_rate": 4.683944374209861e-05,
24
+ "loss": 0.1776,
25
  "step": 1000
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.9419174247725682,
30
+ "eval_loss": 0.27159374952316284,
31
+ "eval_runtime": 9.4015,
32
+ "eval_samples_per_second": 607.985,
33
+ "eval_steps_per_second": 2.446,
34
  "step": 1582
35
  },
36
  {
37
  "epoch": 2.53,
38
+ "grad_norm": 0.30297374725341797,
39
+ "learning_rate": 4.367888748419722e-05,
40
+ "loss": 0.0855,
41
  "step": 2000
42
  },
43
  {
44
  "epoch": 3.0,
45
+ "eval_accuracy": 0.9431420573827851,
46
+ "eval_loss": 0.27298399806022644,
47
+ "eval_runtime": 9.4208,
48
+ "eval_samples_per_second": 606.741,
49
+ "eval_steps_per_second": 2.441,
50
  "step": 2373
51
  },
52
  {
53
  "epoch": 3.79,
54
+ "grad_norm": 0.08143208175897598,
55
+ "learning_rate": 4.051833122629583e-05,
56
+ "loss": 0.0627,
57
  "step": 3000
58
  },
59
  {
60
  "epoch": 4.0,
61
+ "eval_accuracy": 0.9382435269419174,
62
+ "eval_loss": 0.33226683735847473,
63
+ "eval_runtime": 9.4056,
64
+ "eval_samples_per_second": 607.725,
65
+ "eval_steps_per_second": 2.445,
66
  "step": 3164
67
  },
68
  {
69
  "epoch": 5.0,
70
+ "eval_accuracy": 0.9450664800559833,
71
+ "eval_loss": 0.3307989239692688,
72
+ "eval_runtime": 9.3945,
73
+ "eval_samples_per_second": 608.444,
74
+ "eval_steps_per_second": 2.448,
75
  "step": 3955
76
  },
77
  {
78
  "epoch": 5.06,
79
+ "grad_norm": 0.012913365848362446,
80
+ "learning_rate": 3.735777496839444e-05,
81
+ "loss": 0.0463,
82
  "step": 4000
83
  },
84
  {
85
  "epoch": 6.0,
86
+ "eval_accuracy": 0.9412176347095871,
87
+ "eval_loss": 0.39864641427993774,
88
+ "eval_runtime": 9.3742,
89
+ "eval_samples_per_second": 609.757,
90
+ "eval_steps_per_second": 2.454,
91
  "step": 4746
92
  },
93
  {
94
  "epoch": 6.32,
95
+ "grad_norm": 2.0644302368164062,
96
+ "learning_rate": 3.419721871049305e-05,
97
+ "loss": 0.0308,
98
  "step": 5000
99
  },
100
  {
101
  "epoch": 7.0,
102
+ "eval_accuracy": 0.9419174247725682,
103
+ "eval_loss": 0.42111822962760925,
104
+ "eval_runtime": 9.3868,
105
+ "eval_samples_per_second": 608.939,
106
+ "eval_steps_per_second": 2.45,
107
  "step": 5537
108
  },
109
  {
110
  "epoch": 7.59,
111
+ "grad_norm": 30.287784576416016,
112
+ "learning_rate": 3.1036662452591655e-05,
113
+ "loss": 0.0312,
114
  "step": 6000
115
  },
116
  {
117
  "epoch": 8.0,
118
+ "eval_accuracy": 0.943666899930021,
119
+ "eval_loss": 0.36164581775665283,
120
+ "eval_runtime": 9.3894,
121
+ "eval_samples_per_second": 608.769,
122
+ "eval_steps_per_second": 2.45,
123
  "step": 6328
124
  },
125
  {
126
  "epoch": 8.85,
127
+ "grad_norm": 0.03705460578203201,
128
+ "learning_rate": 2.7876106194690264e-05,
129
+ "loss": 0.0221,
130
  "step": 7000
131
  },
132
  {
133
  "epoch": 9.0,
134
+ "eval_accuracy": 0.9396431070678797,
135
+ "eval_loss": 0.4309725761413574,
136
+ "eval_runtime": 9.4203,
137
+ "eval_samples_per_second": 606.777,
138
+ "eval_steps_per_second": 2.442,
139
  "step": 7119
140
  },
141
  {
142
  "epoch": 10.0,
143
+ "eval_accuracy": 0.9438418474457663,
144
+ "eval_loss": 0.4221705198287964,
145
+ "eval_runtime": 9.4325,
146
+ "eval_samples_per_second": 605.989,
147
+ "eval_steps_per_second": 2.438,
148
  "step": 7910
149
  },
150
  {
151
+ "epoch": 10.11,
152
+ "grad_norm": 0.11050642281770706,
153
+ "learning_rate": 2.4715549936788876e-05,
154
+ "loss": 0.0181,
155
+ "step": 8000
156
+ },
157
+ {
158
+ "epoch": 11.0,
159
+ "eval_accuracy": 0.9445416375087474,
160
+ "eval_loss": 0.4184616804122925,
161
+ "eval_runtime": 9.4176,
162
+ "eval_samples_per_second": 606.948,
163
+ "eval_steps_per_second": 2.442,
164
+ "step": 8701
165
+ },
166
+ {
167
+ "epoch": 11.38,
168
+ "grad_norm": 0.03253033012151718,
169
+ "learning_rate": 2.1554993678887485e-05,
170
+ "loss": 0.0141,
171
+ "step": 9000
172
+ },
173
+ {
174
+ "epoch": 12.0,
175
+ "eval_accuracy": 0.945591322603219,
176
+ "eval_loss": 0.46782681345939636,
177
+ "eval_runtime": 9.4133,
178
+ "eval_samples_per_second": 607.226,
179
+ "eval_steps_per_second": 2.443,
180
+ "step": 9492
181
+ },
182
+ {
183
+ "epoch": 12.64,
184
+ "grad_norm": 0.005740019958466291,
185
+ "learning_rate": 1.8394437420986094e-05,
186
+ "loss": 0.0133,
187
+ "step": 10000
188
+ },
189
+ {
190
+ "epoch": 13.0,
191
+ "eval_accuracy": 0.9503149055283415,
192
+ "eval_loss": 0.40269356966018677,
193
+ "eval_runtime": 9.4053,
194
+ "eval_samples_per_second": 607.74,
195
+ "eval_steps_per_second": 2.445,
196
+ "step": 10283
197
+ },
198
+ {
199
+ "epoch": 13.91,
200
+ "grad_norm": 0.0024239453487098217,
201
+ "learning_rate": 1.5233881163084704e-05,
202
+ "loss": 0.0082,
203
+ "step": 11000
204
+ },
205
+ {
206
+ "epoch": 14.0,
207
+ "eval_accuracy": 0.9473407977606718,
208
+ "eval_loss": 0.45041143894195557,
209
+ "eval_runtime": 9.639,
210
+ "eval_samples_per_second": 593.006,
211
+ "eval_steps_per_second": 2.386,
212
+ "step": 11074
213
+ },
214
+ {
215
+ "epoch": 15.0,
216
+ "eval_accuracy": 0.9504898530440867,
217
+ "eval_loss": 0.47598323225975037,
218
+ "eval_runtime": 9.4326,
219
+ "eval_samples_per_second": 605.982,
220
+ "eval_steps_per_second": 2.438,
221
+ "step": 11865
222
+ },
223
+ {
224
+ "epoch": 15.17,
225
+ "grad_norm": 0.0034840325824916363,
226
+ "learning_rate": 1.2073324905183313e-05,
227
+ "loss": 0.0052,
228
+ "step": 12000
229
+ },
230
+ {
231
+ "epoch": 16.0,
232
+ "eval_accuracy": 0.9448915325402379,
233
+ "eval_loss": 0.45725104212760925,
234
+ "eval_runtime": 9.4366,
235
+ "eval_samples_per_second": 605.728,
236
+ "eval_steps_per_second": 2.437,
237
+ "step": 12656
238
+ },
239
+ {
240
+ "epoch": 16.43,
241
+ "grad_norm": 0.4831530451774597,
242
+ "learning_rate": 8.912768647281922e-06,
243
+ "loss": 0.0042,
244
+ "step": 13000
245
+ },
246
+ {
247
+ "epoch": 17.0,
248
+ "eval_accuracy": 0.9522393282015396,
249
+ "eval_loss": 0.43564239144325256,
250
+ "eval_runtime": 9.4029,
251
+ "eval_samples_per_second": 607.9,
252
+ "eval_steps_per_second": 2.446,
253
+ "step": 13447
254
+ },
255
+ {
256
+ "epoch": 17.7,
257
+ "grad_norm": 0.0005753316800110042,
258
+ "learning_rate": 5.752212389380531e-06,
259
+ "loss": 0.0037,
260
+ "step": 14000
261
+ },
262
+ {
263
+ "epoch": 18.0,
264
+ "eval_accuracy": 0.948740377886634,
265
+ "eval_loss": 0.45767056941986084,
266
+ "eval_runtime": 9.3872,
267
+ "eval_samples_per_second": 608.913,
268
+ "eval_steps_per_second": 2.45,
269
+ "step": 14238
270
+ },
271
+ {
272
+ "epoch": 18.96,
273
+ "grad_norm": 0.0019946701359003782,
274
+ "learning_rate": 2.59165613147914e-06,
275
+ "loss": 0.0024,
276
+ "step": 15000
277
+ },
278
+ {
279
+ "epoch": 19.0,
280
+ "eval_accuracy": 0.9492652204338698,
281
+ "eval_loss": 0.4641525149345398,
282
+ "eval_runtime": 9.4075,
283
+ "eval_samples_per_second": 607.599,
284
+ "eval_steps_per_second": 2.445,
285
+ "step": 15029
286
+ },
287
+ {
288
+ "epoch": 20.0,
289
+ "eval_accuracy": 0.9501399580125962,
290
+ "eval_loss": 0.46178367733955383,
291
+ "eval_runtime": 9.3958,
292
+ "eval_samples_per_second": 608.356,
293
+ "eval_steps_per_second": 2.448,
294
+ "step": 15820
295
+ },
296
+ {
297
+ "epoch": 20.0,
298
+ "step": 15820,
299
+ "total_flos": 3.327918265884672e+16,
300
+ "train_loss": 0.03330213655864846,
301
+ "train_runtime": 3337.005,
302
+ "train_samples_per_second": 151.609,
303
+ "train_steps_per_second": 4.741
304
  }
305
  ],
306
  "logging_steps": 1000,
307
+ "max_steps": 15820,
308
  "num_input_tokens_seen": 0,
309
+ "num_train_epochs": 20,
310
  "save_steps": 500,
311
+ "total_flos": 3.327918265884672e+16,
312
  "train_batch_size": 32,
313
  "trial_name": null,
314
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4de6cbc7180e01d7bbaac6d209405970b8c1932b419ce007ab9190aeb3348435
3
- size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea63ec5bf6fd89b86ae59f8fb9e574688ea61151666e13315351c52df7380d6c
3
+ size 4984