ar5entum committed on
Commit
a185f17
·
verified ·
1 Parent(s): 48325e1

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ license: apache-2.0
4
+ base_model: google/vit-base-patch16-224-in21k
5
+ tags:
6
+ - image-classification
7
+ - vision
8
+ - generated_from_trainer
9
+ metrics:
10
+ - accuracy
11
+ model-index:
12
+ - name: beans_outputs
13
+ results: []
14
+ ---
15
+
16
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
17
+ should probably proofread and complete it, then remove this comment. -->
18
+
19
+ # beans_outputs
20
+
21
+ This model is a fine-tuned version of [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k) on a local image-classification dataset loaded from `/home/ubuntu/sdb/astitva/segmentation/classification_ds`.
22
+ It achieves the following results on the evaluation set:
23
+ - Loss: 0.8746
24
+ - Accuracy: 0.9515
25
+
26
+ ## Model description
27
+
28
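+ A Vision Transformer (ViT-Base, 16x16 patches, 224x224 input) image classifier fine-tuned to distinguish 38 plant classes (e.g. aloevera, neem, tulsi, turmeric); the full label list is in `config.json`.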
29
+
30
+ ## Intended uses & limitations
31
+
32
+ More information needed
33
+
34
+ ## Training and evaluation data
35
+
36
+ More information needed
37
+
38
+ ## Training procedure
39
+
40
+ ### Training hyperparameters
41
+
42
+ The following hyperparameters were used during training:
43
+ - learning_rate: 2e-05
44
+ - train_batch_size: 8
45
+ - eval_batch_size: 8
46
+ - seed: 42
47
+ - optimizer: AdamW (`OptimizerNames.ADAMW_TORCH`) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
48
+ - lr_scheduler_type: linear
49
+ - num_epochs: 5.0
50
+
51
+ ### Training results
52
+
53
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
54
+ |:-------------:|:-----:|:----:|:---------------:|:--------:|
55
+ | 2.1775 | 1.0 | 336 | 2.1821 | 0.7616 |
56
+ | 1.4653 | 2.0 | 672 | 1.4698 | 0.8840 |
57
+ | 1.1052 | 3.0 | 1008 | 1.0802 | 0.9304 |
58
+ | 1.0055 | 4.0 | 1344 | 0.9248 | 0.9494 |
59
+ | 0.7847 | 5.0 | 1680 | 0.8746 | 0.9515 |
60
+
61
+
62
+ ### Framework versions
63
+
64
+ - Transformers 4.50.0.dev0
65
+ - Pytorch 2.6.0+cu124
66
+ - Datasets 3.3.2
67
+ - Tokenizers 0.21.0
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.9514767932489452,
4
+ "eval_loss": 0.8745647668838501,
5
+ "eval_runtime": 51.5176,
6
+ "eval_samples_per_second": 9.201,
7
+ "eval_steps_per_second": 1.165,
8
+ "total_flos": 1.0410532148820787e+18,
9
+ "train_loss": 1.5688391100792658,
10
+ "train_runtime": 1801.1044,
11
+ "train_samples_per_second": 7.457,
12
+ "train_steps_per_second": 0.933
13
+ }
config.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "google/vit-base-patch16-224-in21k",
3
+ "architectures": [
4
+ "ViTForImageClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "encoder_stride": 16,
8
+ "finetuning_task": "image-classification",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.0,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "aloevera",
14
+ "1": "arjun",
15
+ "10": "coffee",
16
+ "11": "coriander",
17
+ "12": "curry",
18
+ "13": "giloy",
19
+ "14": "ginger",
20
+ "15": "glochidion",
21
+ "16": "gotu kola",
22
+ "17": "hibiscus",
23
+ "18": "jasmine",
24
+ "19": "lemon",
25
+ "2": "ashwagandha",
26
+ "20": "madar",
27
+ "21": "mango",
28
+ "22": "marigold",
29
+ "23": "mint",
30
+ "24": "moringa",
31
+ "25": "naruneendi",
32
+ "26": "neem",
33
+ "27": "onion",
34
+ "28": "papaya",
35
+ "29": "ricinus",
36
+ "3": "babool",
37
+ "30": "rose",
38
+ "31": "sarpagandha",
39
+ "32": "shatavari",
40
+ "33": "stereoserpum",
41
+ "34": "tomato",
42
+ "35": "tulsi",
43
+ "36": "turmeric",
44
+ "37": "wedelia",
45
+ "4": "bael",
46
+ "5": "bakuchi",
47
+ "6": "barberry",
48
+ "7": "bhilawa",
49
+ "8": "bhringraj",
50
+ "9": "chilly"
51
+ },
52
+ "image_size": 224,
53
+ "initializer_range": 0.02,
54
+ "intermediate_size": 3072,
55
+ "label2id": {
56
+ "aloevera": "0",
57
+ "arjun": "1",
58
+ "ashwagandha": "2",
59
+ "babool": "3",
60
+ "bael": "4",
61
+ "bakuchi": "5",
62
+ "barberry": "6",
63
+ "bhilawa": "7",
64
+ "bhringraj": "8",
65
+ "chilly": "9",
66
+ "coffee": "10",
67
+ "coriander": "11",
68
+ "curry": "12",
69
+ "giloy": "13",
70
+ "ginger": "14",
71
+ "glochidion": "15",
72
+ "gotu kola": "16",
73
+ "hibiscus": "17",
74
+ "jasmine": "18",
75
+ "lemon": "19",
76
+ "madar": "20",
77
+ "mango": "21",
78
+ "marigold": "22",
79
+ "mint": "23",
80
+ "moringa": "24",
81
+ "naruneendi": "25",
82
+ "neem": "26",
83
+ "onion": "27",
84
+ "papaya": "28",
85
+ "ricinus": "29",
86
+ "rose": "30",
87
+ "sarpagandha": "31",
88
+ "shatavari": "32",
89
+ "stereoserpum": "33",
90
+ "tomato": "34",
91
+ "tulsi": "35",
92
+ "turmeric": "36",
93
+ "wedelia": "37"
94
+ },
95
+ "layer_norm_eps": 1e-12,
96
+ "model_type": "vit",
97
+ "num_attention_heads": 12,
98
+ "num_channels": 3,
99
+ "num_hidden_layers": 12,
100
+ "patch_size": 16,
101
+ "problem_type": "single_label_classification",
102
+ "qkv_bias": true,
103
+ "torch_dtype": "float32",
104
+ "transformers_version": "4.50.0.dev0"
105
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "eval_accuracy": 0.9514767932489452,
4
+ "eval_loss": 0.8745647668838501,
5
+ "eval_runtime": 51.5176,
6
+ "eval_samples_per_second": 9.201,
7
+ "eval_steps_per_second": 1.165
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:405aae6925ebb7690e4fc69bb192018b73635b0d76ba5fef4ea3dba7998a9b01
3
+ size 343334720
preprocessor_config.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.5,
8
+ 0.5,
9
+ 0.5
10
+ ],
11
+ "image_processor_type": "ViTImageProcessor",
12
+ "image_std": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "resample": 2,
18
+ "rescale_factor": 0.00392156862745098,
19
+ "size": {
20
+ "height": 224,
21
+ "width": 224
22
+ }
23
+ }
runs/Mar16_18-04-43_megamax-spl-vm01/events.out.tfevents.1742128491.megamax-spl-vm01.3229474.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc49302704f3016139389073f1fd079d7f7097dcc4196b717235d4943c8adf22
3
+ size 7663
runs/Mar16_18-07-16_megamax-spl-vm01/events.out.tfevents.1742128639.megamax-spl-vm01.3236623.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b8a4193d1d23f9ead30fac7487299d057dceeb75b5f0548eb3f7aeaa15c2b795
3
+ size 43996
runs/Mar16_18-07-16_megamax-spl-vm01/events.out.tfevents.1742130492.megamax-spl-vm01.3236623.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40a8bff43120b59a5c0964a06324ad8fabed2569bbce79d2ddd4215bc8308dce
3
+ size 411
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 5.0,
3
+ "total_flos": 1.0410532148820787e+18,
4
+ "train_loss": 1.5688391100792658,
5
+ "train_runtime": 1801.1044,
6
+ "train_samples_per_second": 7.457,
7
+ "train_steps_per_second": 0.933
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8745647668838501,
3
+ "best_model_checkpoint": "./beans_outputs/checkpoint-1680",
4
+ "epoch": 5.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1680,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.02976190476190476,
13
+ "grad_norm": 3.170905113220215,
14
+ "learning_rate": 1.9880952380952384e-05,
15
+ "loss": 3.6048,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.05952380952380952,
20
+ "grad_norm": 2.51796555519104,
21
+ "learning_rate": 1.9761904761904763e-05,
22
+ "loss": 3.5551,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.08928571428571429,
27
+ "grad_norm": 2.868872880935669,
28
+ "learning_rate": 1.9642857142857145e-05,
29
+ "loss": 3.4868,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.11904761904761904,
34
+ "grad_norm": 3.2086801528930664,
35
+ "learning_rate": 1.9523809523809524e-05,
36
+ "loss": 3.4105,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.1488095238095238,
41
+ "grad_norm": 2.825397253036499,
42
+ "learning_rate": 1.9404761904761906e-05,
43
+ "loss": 3.3538,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.17857142857142858,
48
+ "grad_norm": 3.486938238143921,
49
+ "learning_rate": 1.928571428571429e-05,
50
+ "loss": 3.3201,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.20833333333333334,
55
+ "grad_norm": 2.802475929260254,
56
+ "learning_rate": 1.916666666666667e-05,
57
+ "loss": 3.2432,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.23809523809523808,
62
+ "grad_norm": 2.789459228515625,
63
+ "learning_rate": 1.904761904761905e-05,
64
+ "loss": 3.2041,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.26785714285714285,
69
+ "grad_norm": 3.008307933807373,
70
+ "learning_rate": 1.892857142857143e-05,
71
+ "loss": 3.1679,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.2976190476190476,
76
+ "grad_norm": 2.6487619876861572,
77
+ "learning_rate": 1.880952380952381e-05,
78
+ "loss": 3.1249,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.3273809523809524,
83
+ "grad_norm": 2.947179079055786,
84
+ "learning_rate": 1.8690476190476193e-05,
85
+ "loss": 3.0909,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.35714285714285715,
90
+ "grad_norm": 3.1243131160736084,
91
+ "learning_rate": 1.8571428571428575e-05,
92
+ "loss": 3.0953,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.3869047619047619,
97
+ "grad_norm": 2.9400837421417236,
98
+ "learning_rate": 1.8452380952380954e-05,
99
+ "loss": 2.9629,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.4166666666666667,
104
+ "grad_norm": 2.7061338424682617,
105
+ "learning_rate": 1.8333333333333333e-05,
106
+ "loss": 2.9307,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.44642857142857145,
111
+ "grad_norm": 2.6359243392944336,
112
+ "learning_rate": 1.8214285714285715e-05,
113
+ "loss": 2.8238,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.47619047619047616,
118
+ "grad_norm": 2.740408420562744,
119
+ "learning_rate": 1.8095238095238097e-05,
120
+ "loss": 2.8961,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.5059523809523809,
125
+ "grad_norm": 2.858968496322632,
126
+ "learning_rate": 1.797619047619048e-05,
127
+ "loss": 2.7505,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.5357142857142857,
132
+ "grad_norm": 2.7578256130218506,
133
+ "learning_rate": 1.785714285714286e-05,
134
+ "loss": 2.7989,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.5654761904761905,
139
+ "grad_norm": 2.9766931533813477,
140
+ "learning_rate": 1.7738095238095237e-05,
141
+ "loss": 2.6722,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.5952380952380952,
146
+ "grad_norm": 2.7900352478027344,
147
+ "learning_rate": 1.761904761904762e-05,
148
+ "loss": 2.7213,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.625,
153
+ "grad_norm": 3.004939556121826,
154
+ "learning_rate": 1.7500000000000002e-05,
155
+ "loss": 2.7287,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.6547619047619048,
160
+ "grad_norm": 2.7375917434692383,
161
+ "learning_rate": 1.7380952380952384e-05,
162
+ "loss": 2.6691,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.6845238095238095,
167
+ "grad_norm": 3.2530713081359863,
168
+ "learning_rate": 1.7261904761904763e-05,
169
+ "loss": 2.5742,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.7142857142857143,
174
+ "grad_norm": 3.0463545322418213,
175
+ "learning_rate": 1.7142857142857142e-05,
176
+ "loss": 2.4523,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.7440476190476191,
181
+ "grad_norm": 3.0471720695495605,
182
+ "learning_rate": 1.7023809523809524e-05,
183
+ "loss": 2.4592,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.7738095238095238,
188
+ "grad_norm": 3.4415907859802246,
189
+ "learning_rate": 1.6904761904761906e-05,
190
+ "loss": 2.4316,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.8035714285714286,
195
+ "grad_norm": 2.830673933029175,
196
+ "learning_rate": 1.678571428571429e-05,
197
+ "loss": 2.3903,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.8333333333333334,
202
+ "grad_norm": 3.584303617477417,
203
+ "learning_rate": 1.6666666666666667e-05,
204
+ "loss": 2.4643,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.8630952380952381,
209
+ "grad_norm": 3.9748589992523193,
210
+ "learning_rate": 1.6547619047619046e-05,
211
+ "loss": 2.3237,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.8928571428571429,
216
+ "grad_norm": 2.929922103881836,
217
+ "learning_rate": 1.642857142857143e-05,
218
+ "loss": 2.2639,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.9226190476190477,
223
+ "grad_norm": 4.647745132446289,
224
+ "learning_rate": 1.630952380952381e-05,
225
+ "loss": 2.4637,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.9523809523809523,
230
+ "grad_norm": 3.6543118953704834,
231
+ "learning_rate": 1.6190476190476193e-05,
232
+ "loss": 2.2519,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.9821428571428571,
237
+ "grad_norm": 3.3143322467803955,
238
+ "learning_rate": 1.6071428571428572e-05,
239
+ "loss": 2.1775,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 1.0,
244
+ "eval_accuracy": 0.7616033755274262,
245
+ "eval_loss": 2.1820600032806396,
246
+ "eval_runtime": 50.7645,
247
+ "eval_samples_per_second": 9.337,
248
+ "eval_steps_per_second": 1.182,
249
+ "step": 336
250
+ },
251
+ {
252
+ "epoch": 1.0119047619047619,
253
+ "grad_norm": 3.666236639022827,
254
+ "learning_rate": 1.5952380952380954e-05,
255
+ "loss": 2.1187,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.0416666666666667,
260
+ "grad_norm": 3.736830472946167,
261
+ "learning_rate": 1.5833333333333333e-05,
262
+ "loss": 2.1312,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.0714285714285714,
267
+ "grad_norm": 3.002455711364746,
268
+ "learning_rate": 1.5714285714285715e-05,
269
+ "loss": 2.2274,
270
+ "step": 360
271
+ },
272
+ {
273
+ "epoch": 1.1011904761904763,
274
+ "grad_norm": 3.2685108184814453,
275
+ "learning_rate": 1.5595238095238098e-05,
276
+ "loss": 2.1347,
277
+ "step": 370
278
+ },
279
+ {
280
+ "epoch": 1.130952380952381,
281
+ "grad_norm": 3.4998621940612793,
282
+ "learning_rate": 1.5476190476190476e-05,
283
+ "loss": 2.0757,
284
+ "step": 380
285
+ },
286
+ {
287
+ "epoch": 1.1607142857142858,
288
+ "grad_norm": 3.306267738342285,
289
+ "learning_rate": 1.535714285714286e-05,
290
+ "loss": 2.0177,
291
+ "step": 390
292
+ },
293
+ {
294
+ "epoch": 1.1904761904761905,
295
+ "grad_norm": 3.8774032592773438,
296
+ "learning_rate": 1.523809523809524e-05,
297
+ "loss": 1.9748,
298
+ "step": 400
299
+ },
300
+ {
301
+ "epoch": 1.2202380952380953,
302
+ "grad_norm": 2.662797212600708,
303
+ "learning_rate": 1.511904761904762e-05,
304
+ "loss": 1.9628,
305
+ "step": 410
306
+ },
307
+ {
308
+ "epoch": 1.25,
309
+ "grad_norm": 3.9353742599487305,
310
+ "learning_rate": 1.5000000000000002e-05,
311
+ "loss": 2.0104,
312
+ "step": 420
313
+ },
314
+ {
315
+ "epoch": 1.2797619047619047,
316
+ "grad_norm": 3.3460521697998047,
317
+ "learning_rate": 1.4880952380952383e-05,
318
+ "loss": 2.0678,
319
+ "step": 430
320
+ },
321
+ {
322
+ "epoch": 1.3095238095238095,
323
+ "grad_norm": 3.0211353302001953,
324
+ "learning_rate": 1.4761904761904763e-05,
325
+ "loss": 2.0294,
326
+ "step": 440
327
+ },
328
+ {
329
+ "epoch": 1.3392857142857144,
330
+ "grad_norm": 2.827756404876709,
331
+ "learning_rate": 1.4642857142857144e-05,
332
+ "loss": 1.9104,
333
+ "step": 450
334
+ },
335
+ {
336
+ "epoch": 1.369047619047619,
337
+ "grad_norm": 2.606844663619995,
338
+ "learning_rate": 1.4523809523809524e-05,
339
+ "loss": 1.933,
340
+ "step": 460
341
+ },
342
+ {
343
+ "epoch": 1.3988095238095237,
344
+ "grad_norm": 3.994950294494629,
345
+ "learning_rate": 1.4404761904761907e-05,
346
+ "loss": 1.9977,
347
+ "step": 470
348
+ },
349
+ {
350
+ "epoch": 1.4285714285714286,
351
+ "grad_norm": 3.6433207988739014,
352
+ "learning_rate": 1.4285714285714287e-05,
353
+ "loss": 1.897,
354
+ "step": 480
355
+ },
356
+ {
357
+ "epoch": 1.4583333333333333,
358
+ "grad_norm": 3.1899826526641846,
359
+ "learning_rate": 1.416666666666667e-05,
360
+ "loss": 1.9046,
361
+ "step": 490
362
+ },
363
+ {
364
+ "epoch": 1.4880952380952381,
365
+ "grad_norm": 3.352928638458252,
366
+ "learning_rate": 1.4047619047619048e-05,
367
+ "loss": 1.7378,
368
+ "step": 500
369
+ },
370
+ {
371
+ "epoch": 1.5178571428571428,
372
+ "grad_norm": 4.73577880859375,
373
+ "learning_rate": 1.3928571428571429e-05,
374
+ "loss": 1.7998,
375
+ "step": 510
376
+ },
377
+ {
378
+ "epoch": 1.5476190476190477,
379
+ "grad_norm": 3.118739366531372,
380
+ "learning_rate": 1.3809523809523811e-05,
381
+ "loss": 1.7316,
382
+ "step": 520
383
+ },
384
+ {
385
+ "epoch": 1.5773809523809523,
386
+ "grad_norm": 2.617877721786499,
387
+ "learning_rate": 1.3690476190476192e-05,
388
+ "loss": 1.6478,
389
+ "step": 530
390
+ },
391
+ {
392
+ "epoch": 1.6071428571428572,
393
+ "grad_norm": 3.3894600868225098,
394
+ "learning_rate": 1.3571428571428574e-05,
395
+ "loss": 1.7311,
396
+ "step": 540
397
+ },
398
+ {
399
+ "epoch": 1.6369047619047619,
400
+ "grad_norm": 4.088054656982422,
401
+ "learning_rate": 1.3452380952380954e-05,
402
+ "loss": 1.5008,
403
+ "step": 550
404
+ },
405
+ {
406
+ "epoch": 1.6666666666666665,
407
+ "grad_norm": 3.2209737300872803,
408
+ "learning_rate": 1.3333333333333333e-05,
409
+ "loss": 1.6994,
410
+ "step": 560
411
+ },
412
+ {
413
+ "epoch": 1.6964285714285714,
414
+ "grad_norm": 3.8286681175231934,
415
+ "learning_rate": 1.3214285714285716e-05,
416
+ "loss": 1.6879,
417
+ "step": 570
418
+ },
419
+ {
420
+ "epoch": 1.7261904761904763,
421
+ "grad_norm": 2.611720561981201,
422
+ "learning_rate": 1.3095238095238096e-05,
423
+ "loss": 1.6061,
424
+ "step": 580
425
+ },
426
+ {
427
+ "epoch": 1.755952380952381,
428
+ "grad_norm": 2.898097276687622,
429
+ "learning_rate": 1.2976190476190478e-05,
430
+ "loss": 1.5223,
431
+ "step": 590
432
+ },
433
+ {
434
+ "epoch": 1.7857142857142856,
435
+ "grad_norm": 2.2522895336151123,
436
+ "learning_rate": 1.2857142857142859e-05,
437
+ "loss": 1.5095,
438
+ "step": 600
439
+ },
440
+ {
441
+ "epoch": 1.8154761904761905,
442
+ "grad_norm": 3.5610804557800293,
443
+ "learning_rate": 1.2738095238095238e-05,
444
+ "loss": 1.6524,
445
+ "step": 610
446
+ },
447
+ {
448
+ "epoch": 1.8452380952380953,
449
+ "grad_norm": 3.532130002975464,
450
+ "learning_rate": 1.261904761904762e-05,
451
+ "loss": 1.5345,
452
+ "step": 620
453
+ },
454
+ {
455
+ "epoch": 1.875,
456
+ "grad_norm": 3.8648953437805176,
457
+ "learning_rate": 1.25e-05,
458
+ "loss": 1.691,
459
+ "step": 630
460
+ },
461
+ {
462
+ "epoch": 1.9047619047619047,
463
+ "grad_norm": 2.4936046600341797,
464
+ "learning_rate": 1.2380952380952383e-05,
465
+ "loss": 1.4573,
466
+ "step": 640
467
+ },
468
+ {
469
+ "epoch": 1.9345238095238095,
470
+ "grad_norm": 3.499699592590332,
471
+ "learning_rate": 1.2261904761904763e-05,
472
+ "loss": 1.5181,
473
+ "step": 650
474
+ },
475
+ {
476
+ "epoch": 1.9642857142857144,
477
+ "grad_norm": 2.7815959453582764,
478
+ "learning_rate": 1.2142857142857142e-05,
479
+ "loss": 1.4333,
480
+ "step": 660
481
+ },
482
+ {
483
+ "epoch": 1.994047619047619,
484
+ "grad_norm": 3.007183790206909,
485
+ "learning_rate": 1.2023809523809525e-05,
486
+ "loss": 1.4653,
487
+ "step": 670
488
+ },
489
+ {
490
+ "epoch": 2.0,
491
+ "eval_accuracy": 0.8839662447257384,
492
+ "eval_loss": 1.4698303937911987,
493
+ "eval_runtime": 51.4369,
494
+ "eval_samples_per_second": 9.215,
495
+ "eval_steps_per_second": 1.166,
496
+ "step": 672
497
+ },
498
+ {
499
+ "epoch": 2.0238095238095237,
500
+ "grad_norm": 3.4663267135620117,
501
+ "learning_rate": 1.1904761904761905e-05,
502
+ "loss": 1.4428,
503
+ "step": 680
504
+ },
505
+ {
506
+ "epoch": 2.0535714285714284,
507
+ "grad_norm": 2.2934768199920654,
508
+ "learning_rate": 1.1785714285714287e-05,
509
+ "loss": 1.4135,
510
+ "step": 690
511
+ },
512
+ {
513
+ "epoch": 2.0833333333333335,
514
+ "grad_norm": 2.601954221725464,
515
+ "learning_rate": 1.1666666666666668e-05,
516
+ "loss": 1.456,
517
+ "step": 700
518
+ },
519
+ {
520
+ "epoch": 2.113095238095238,
521
+ "grad_norm": 3.2254600524902344,
522
+ "learning_rate": 1.1547619047619047e-05,
523
+ "loss": 1.5227,
524
+ "step": 710
525
+ },
526
+ {
527
+ "epoch": 2.142857142857143,
528
+ "grad_norm": 3.2958316802978516,
529
+ "learning_rate": 1.1428571428571429e-05,
530
+ "loss": 1.4248,
531
+ "step": 720
532
+ },
533
+ {
534
+ "epoch": 2.1726190476190474,
535
+ "grad_norm": 4.993536472320557,
536
+ "learning_rate": 1.130952380952381e-05,
537
+ "loss": 1.4717,
538
+ "step": 730
539
+ },
540
+ {
541
+ "epoch": 2.2023809523809526,
542
+ "grad_norm": 3.3640084266662598,
543
+ "learning_rate": 1.1190476190476192e-05,
544
+ "loss": 1.4265,
545
+ "step": 740
546
+ },
547
+ {
548
+ "epoch": 2.232142857142857,
549
+ "grad_norm": 2.6835250854492188,
550
+ "learning_rate": 1.1071428571428572e-05,
551
+ "loss": 1.408,
552
+ "step": 750
553
+ },
554
+ {
555
+ "epoch": 2.261904761904762,
556
+ "grad_norm": 3.8518381118774414,
557
+ "learning_rate": 1.0952380952380955e-05,
558
+ "loss": 1.2666,
559
+ "step": 760
560
+ },
561
+ {
562
+ "epoch": 2.2916666666666665,
563
+ "grad_norm": 3.553366184234619,
564
+ "learning_rate": 1.0833333333333334e-05,
565
+ "loss": 1.4052,
566
+ "step": 770
567
+ },
568
+ {
569
+ "epoch": 2.3214285714285716,
570
+ "grad_norm": 2.657440423965454,
571
+ "learning_rate": 1.0714285714285714e-05,
572
+ "loss": 1.3953,
573
+ "step": 780
574
+ },
575
+ {
576
+ "epoch": 2.3511904761904763,
577
+ "grad_norm": 4.050617694854736,
578
+ "learning_rate": 1.0595238095238096e-05,
579
+ "loss": 1.3073,
580
+ "step": 790
581
+ },
582
+ {
583
+ "epoch": 2.380952380952381,
584
+ "grad_norm": 3.039287567138672,
585
+ "learning_rate": 1.0476190476190477e-05,
586
+ "loss": 1.3765,
587
+ "step": 800
588
+ },
589
+ {
590
+ "epoch": 2.4107142857142856,
591
+ "grad_norm": 3.350076913833618,
592
+ "learning_rate": 1.0357142857142859e-05,
593
+ "loss": 1.2713,
594
+ "step": 810
595
+ },
596
+ {
597
+ "epoch": 2.4404761904761907,
598
+ "grad_norm": 4.112967491149902,
599
+ "learning_rate": 1.0238095238095238e-05,
600
+ "loss": 1.3557,
601
+ "step": 820
602
+ },
603
+ {
604
+ "epoch": 2.4702380952380953,
605
+ "grad_norm": 2.587895154953003,
606
+ "learning_rate": 1.011904761904762e-05,
607
+ "loss": 1.2106,
608
+ "step": 830
609
+ },
610
+ {
611
+ "epoch": 2.5,
612
+ "grad_norm": 2.2189221382141113,
613
+ "learning_rate": 1e-05,
614
+ "loss": 1.1529,
615
+ "step": 840
616
+ },
617
+ {
618
+ "epoch": 2.5297619047619047,
619
+ "grad_norm": 1.7763313055038452,
620
+ "learning_rate": 9.880952380952381e-06,
621
+ "loss": 1.2066,
622
+ "step": 850
623
+ },
624
+ {
625
+ "epoch": 2.5595238095238093,
626
+ "grad_norm": 2.5652577877044678,
627
+ "learning_rate": 9.761904761904762e-06,
628
+ "loss": 1.2206,
629
+ "step": 860
630
+ },
631
+ {
632
+ "epoch": 2.5892857142857144,
633
+ "grad_norm": 2.4081642627716064,
634
+ "learning_rate": 9.642857142857144e-06,
635
+ "loss": 1.2288,
636
+ "step": 870
637
+ },
638
+ {
639
+ "epoch": 2.619047619047619,
640
+ "grad_norm": 3.4448933601379395,
641
+ "learning_rate": 9.523809523809525e-06,
642
+ "loss": 1.2764,
643
+ "step": 880
644
+ },
645
+ {
646
+ "epoch": 2.6488095238095237,
647
+ "grad_norm": 3.65535044670105,
648
+ "learning_rate": 9.404761904761905e-06,
649
+ "loss": 1.1818,
650
+ "step": 890
651
+ },
652
+ {
653
+ "epoch": 2.678571428571429,
654
+ "grad_norm": 2.902886152267456,
655
+ "learning_rate": 9.285714285714288e-06,
656
+ "loss": 1.2662,
657
+ "step": 900
658
+ },
659
+ {
660
+ "epoch": 2.7083333333333335,
661
+ "grad_norm": 2.8251378536224365,
662
+ "learning_rate": 9.166666666666666e-06,
663
+ "loss": 1.1246,
664
+ "step": 910
665
+ },
666
+ {
667
+ "epoch": 2.738095238095238,
668
+ "grad_norm": 2.1443264484405518,
669
+ "learning_rate": 9.047619047619049e-06,
670
+ "loss": 1.2486,
671
+ "step": 920
672
+ },
673
+ {
674
+ "epoch": 2.767857142857143,
675
+ "grad_norm": 4.930934429168701,
676
+ "learning_rate": 8.92857142857143e-06,
677
+ "loss": 1.1865,
678
+ "step": 930
679
+ },
680
+ {
681
+ "epoch": 2.7976190476190474,
682
+ "grad_norm": 3.2018985748291016,
683
+ "learning_rate": 8.80952380952381e-06,
684
+ "loss": 1.1047,
685
+ "step": 940
686
+ },
687
+ {
688
+ "epoch": 2.8273809523809526,
689
+ "grad_norm": 3.2998268604278564,
690
+ "learning_rate": 8.690476190476192e-06,
691
+ "loss": 1.2098,
692
+ "step": 950
693
+ },
694
+ {
695
+ "epoch": 2.857142857142857,
696
+ "grad_norm": 2.1316542625427246,
697
+ "learning_rate": 8.571428571428571e-06,
698
+ "loss": 1.0918,
699
+ "step": 960
700
+ },
701
+ {
702
+ "epoch": 2.886904761904762,
703
+ "grad_norm": 3.8014087677001953,
704
+ "learning_rate": 8.452380952380953e-06,
705
+ "loss": 1.1139,
706
+ "step": 970
707
+ },
708
+ {
709
+ "epoch": 2.9166666666666665,
710
+ "grad_norm": 2.8320999145507812,
711
+ "learning_rate": 8.333333333333334e-06,
712
+ "loss": 1.213,
713
+ "step": 980
714
+ },
715
+ {
716
+ "epoch": 2.946428571428571,
717
+ "grad_norm": 3.016481876373291,
718
+ "learning_rate": 8.214285714285714e-06,
719
+ "loss": 1.1398,
720
+ "step": 990
721
+ },
722
+ {
723
+ "epoch": 2.9761904761904763,
724
+ "grad_norm": 3.9006187915802,
725
+ "learning_rate": 8.095238095238097e-06,
726
+ "loss": 1.1052,
727
+ "step": 1000
728
+ },
729
+ {
730
+ "epoch": 3.0,
731
+ "eval_accuracy": 0.930379746835443,
732
+ "eval_loss": 1.0801581144332886,
733
+ "eval_runtime": 51.0077,
734
+ "eval_samples_per_second": 9.293,
735
+ "eval_steps_per_second": 1.176,
736
+ "step": 1008
737
+ },
738
+ {
739
+ "epoch": 3.005952380952381,
740
+ "grad_norm": 2.796464204788208,
741
+ "learning_rate": 7.976190476190477e-06,
742
+ "loss": 1.1341,
743
+ "step": 1010
744
+ },
745
+ {
746
+ "epoch": 3.0357142857142856,
747
+ "grad_norm": 2.1846368312835693,
748
+ "learning_rate": 7.857142857142858e-06,
749
+ "loss": 1.173,
750
+ "step": 1020
751
+ },
752
+ {
753
+ "epoch": 3.0654761904761907,
754
+ "grad_norm": 3.3909096717834473,
755
+ "learning_rate": 7.738095238095238e-06,
756
+ "loss": 1.0198,
757
+ "step": 1030
758
+ },
759
+ {
760
+ "epoch": 3.0952380952380953,
761
+ "grad_norm": 3.5887138843536377,
762
+ "learning_rate": 7.61904761904762e-06,
763
+ "loss": 1.0729,
764
+ "step": 1040
765
+ },
766
+ {
767
+ "epoch": 3.125,
768
+ "grad_norm": 2.7871737480163574,
769
+ "learning_rate": 7.500000000000001e-06,
770
+ "loss": 0.9676,
771
+ "step": 1050
772
+ },
773
+ {
774
+ "epoch": 3.1547619047619047,
775
+ "grad_norm": 3.3368754386901855,
776
+ "learning_rate": 7.380952380952382e-06,
777
+ "loss": 0.9599,
778
+ "step": 1060
779
+ },
780
+ {
781
+ "epoch": 3.1845238095238093,
782
+ "grad_norm": 3.748992919921875,
783
+ "learning_rate": 7.261904761904762e-06,
784
+ "loss": 1.1599,
785
+ "step": 1070
786
+ },
787
+ {
788
+ "epoch": 3.2142857142857144,
789
+ "grad_norm": 4.470694065093994,
790
+ "learning_rate": 7.1428571428571436e-06,
791
+ "loss": 1.155,
792
+ "step": 1080
793
+ },
794
+ {
795
+ "epoch": 3.244047619047619,
796
+ "grad_norm": 1.8315823078155518,
797
+ "learning_rate": 7.023809523809524e-06,
798
+ "loss": 0.979,
799
+ "step": 1090
800
+ },
801
+ {
802
+ "epoch": 3.2738095238095237,
803
+ "grad_norm": 2.505209445953369,
804
+ "learning_rate": 6.9047619047619055e-06,
805
+ "loss": 1.142,
806
+ "step": 1100
807
+ },
808
+ {
809
+ "epoch": 3.3035714285714284,
810
+ "grad_norm": 3.056353807449341,
811
+ "learning_rate": 6.785714285714287e-06,
812
+ "loss": 1.0072,
813
+ "step": 1110
814
+ },
815
+ {
816
+ "epoch": 3.3333333333333335,
817
+ "grad_norm": 3.9302310943603516,
818
+ "learning_rate": 6.666666666666667e-06,
819
+ "loss": 1.0705,
820
+ "step": 1120
821
+ },
822
+ {
823
+ "epoch": 3.363095238095238,
824
+ "grad_norm": 4.6520490646362305,
825
+ "learning_rate": 6.547619047619048e-06,
826
+ "loss": 1.0325,
827
+ "step": 1130
828
+ },
829
+ {
830
+ "epoch": 3.392857142857143,
831
+ "grad_norm": 3.9381701946258545,
832
+ "learning_rate": 6.4285714285714295e-06,
833
+ "loss": 0.9674,
834
+ "step": 1140
835
+ },
836
+ {
837
+ "epoch": 3.4226190476190474,
838
+ "grad_norm": 5.080965042114258,
839
+ "learning_rate": 6.30952380952381e-06,
840
+ "loss": 0.9812,
841
+ "step": 1150
842
+ },
843
+ {
844
+ "epoch": 3.4523809523809526,
845
+ "grad_norm": 4.649317264556885,
846
+ "learning_rate": 6.1904761904761914e-06,
847
+ "loss": 1.1093,
848
+ "step": 1160
849
+ },
850
+ {
851
+ "epoch": 3.482142857142857,
852
+ "grad_norm": 5.5956315994262695,
853
+ "learning_rate": 6.071428571428571e-06,
854
+ "loss": 1.0133,
855
+ "step": 1170
856
+ },
857
+ {
858
+ "epoch": 3.511904761904762,
859
+ "grad_norm": 4.99602746963501,
860
+ "learning_rate": 5.9523809523809525e-06,
861
+ "loss": 1.075,
862
+ "step": 1180
863
+ },
864
+ {
865
+ "epoch": 3.5416666666666665,
866
+ "grad_norm": 3.875300407409668,
867
+ "learning_rate": 5.833333333333334e-06,
868
+ "loss": 1.1469,
869
+ "step": 1190
870
+ },
871
+ {
872
+ "epoch": 3.571428571428571,
873
+ "grad_norm": 2.9351279735565186,
874
+ "learning_rate": 5.7142857142857145e-06,
875
+ "loss": 1.1746,
876
+ "step": 1200
877
+ },
878
+ {
879
+ "epoch": 3.6011904761904763,
880
+ "grad_norm": 3.581909418106079,
881
+ "learning_rate": 5.595238095238096e-06,
882
+ "loss": 1.0452,
883
+ "step": 1210
884
+ },
885
+ {
886
+ "epoch": 3.630952380952381,
887
+ "grad_norm": 2.4383697509765625,
888
+ "learning_rate": 5.476190476190477e-06,
889
+ "loss": 0.884,
890
+ "step": 1220
891
+ },
892
+ {
893
+ "epoch": 3.6607142857142856,
894
+ "grad_norm": 3.386600971221924,
895
+ "learning_rate": 5.357142857142857e-06,
896
+ "loss": 0.9479,
897
+ "step": 1230
898
+ },
899
+ {
900
+ "epoch": 3.6904761904761907,
901
+ "grad_norm": 1.5890535116195679,
902
+ "learning_rate": 5.2380952380952384e-06,
903
+ "loss": 0.8953,
904
+ "step": 1240
905
+ },
906
+ {
907
+ "epoch": 3.7202380952380953,
908
+ "grad_norm": 2.729491710662842,
909
+ "learning_rate": 5.119047619047619e-06,
910
+ "loss": 0.9071,
911
+ "step": 1250
912
+ },
913
+ {
914
+ "epoch": 3.75,
915
+ "grad_norm": 4.265748977661133,
916
+ "learning_rate": 5e-06,
917
+ "loss": 1.0496,
918
+ "step": 1260
919
+ },
920
+ {
921
+ "epoch": 3.7797619047619047,
922
+ "grad_norm": 3.6234512329101562,
923
+ "learning_rate": 4.880952380952381e-06,
924
+ "loss": 0.9945,
925
+ "step": 1270
926
+ },
927
+ {
928
+ "epoch": 3.8095238095238093,
929
+ "grad_norm": 3.0296449661254883,
930
+ "learning_rate": 4.761904761904762e-06,
931
+ "loss": 1.0592,
932
+ "step": 1280
933
+ },
934
+ {
935
+ "epoch": 3.8392857142857144,
936
+ "grad_norm": 3.7550673484802246,
937
+ "learning_rate": 4.642857142857144e-06,
938
+ "loss": 0.9102,
939
+ "step": 1290
940
+ },
941
+ {
942
+ "epoch": 3.869047619047619,
943
+ "grad_norm": 2.3732712268829346,
944
+ "learning_rate": 4.523809523809524e-06,
945
+ "loss": 0.9721,
946
+ "step": 1300
947
+ },
948
+ {
949
+ "epoch": 3.8988095238095237,
950
+ "grad_norm": 4.049142360687256,
951
+ "learning_rate": 4.404761904761905e-06,
952
+ "loss": 0.9409,
953
+ "step": 1310
954
+ },
955
+ {
956
+ "epoch": 3.928571428571429,
957
+ "grad_norm": 2.1877949237823486,
958
+ "learning_rate": 4.2857142857142855e-06,
959
+ "loss": 1.0235,
960
+ "step": 1320
961
+ },
962
+ {
963
+ "epoch": 3.9583333333333335,
964
+ "grad_norm": 1.8449411392211914,
965
+ "learning_rate": 4.166666666666667e-06,
966
+ "loss": 0.978,
967
+ "step": 1330
968
+ },
969
+ {
970
+ "epoch": 3.988095238095238,
971
+ "grad_norm": 2.8841190338134766,
972
+ "learning_rate": 4.047619047619048e-06,
973
+ "loss": 1.0055,
974
+ "step": 1340
975
+ },
976
+ {
977
+ "epoch": 4.0,
978
+ "eval_accuracy": 0.9493670886075949,
979
+ "eval_loss": 0.9248189926147461,
980
+ "eval_runtime": 51.0423,
981
+ "eval_samples_per_second": 9.286,
982
+ "eval_steps_per_second": 1.175,
983
+ "step": 1344
984
+ },
985
+ {
986
+ "epoch": 4.017857142857143,
987
+ "grad_norm": 2.242076873779297,
988
+ "learning_rate": 3.928571428571429e-06,
989
+ "loss": 0.9244,
990
+ "step": 1350
991
+ },
992
+ {
993
+ "epoch": 4.0476190476190474,
994
+ "grad_norm": 1.98090660572052,
995
+ "learning_rate": 3.80952380952381e-06,
996
+ "loss": 0.8568,
997
+ "step": 1360
998
+ },
999
+ {
1000
+ "epoch": 4.0773809523809526,
1001
+ "grad_norm": 3.927706718444824,
1002
+ "learning_rate": 3.690476190476191e-06,
1003
+ "loss": 0.9644,
1004
+ "step": 1370
1005
+ },
1006
+ {
1007
+ "epoch": 4.107142857142857,
1008
+ "grad_norm": 2.3780994415283203,
1009
+ "learning_rate": 3.5714285714285718e-06,
1010
+ "loss": 0.97,
1011
+ "step": 1380
1012
+ },
1013
+ {
1014
+ "epoch": 4.136904761904762,
1015
+ "grad_norm": 2.21608304977417,
1016
+ "learning_rate": 3.4523809523809528e-06,
1017
+ "loss": 0.9728,
1018
+ "step": 1390
1019
+ },
1020
+ {
1021
+ "epoch": 4.166666666666667,
1022
+ "grad_norm": 6.764073848724365,
1023
+ "learning_rate": 3.3333333333333333e-06,
1024
+ "loss": 0.8729,
1025
+ "step": 1400
1026
+ },
1027
+ {
1028
+ "epoch": 4.196428571428571,
1029
+ "grad_norm": 1.5746071338653564,
1030
+ "learning_rate": 3.2142857142857147e-06,
1031
+ "loss": 0.7702,
1032
+ "step": 1410
1033
+ },
1034
+ {
1035
+ "epoch": 4.226190476190476,
1036
+ "grad_norm": 1.8241825103759766,
1037
+ "learning_rate": 3.0952380952380957e-06,
1038
+ "loss": 0.9121,
1039
+ "step": 1420
1040
+ },
1041
+ {
1042
+ "epoch": 4.255952380952381,
1043
+ "grad_norm": 3.9683926105499268,
1044
+ "learning_rate": 2.9761904761904763e-06,
1045
+ "loss": 0.8749,
1046
+ "step": 1430
1047
+ },
1048
+ {
1049
+ "epoch": 4.285714285714286,
1050
+ "grad_norm": 1.5732113122940063,
1051
+ "learning_rate": 2.8571428571428573e-06,
1052
+ "loss": 0.9421,
1053
+ "step": 1440
1054
+ },
1055
+ {
1056
+ "epoch": 4.315476190476191,
1057
+ "grad_norm": 2.5848405361175537,
1058
+ "learning_rate": 2.7380952380952387e-06,
1059
+ "loss": 0.9617,
1060
+ "step": 1450
1061
+ },
1062
+ {
1063
+ "epoch": 4.345238095238095,
1064
+ "grad_norm": 3.7017910480499268,
1065
+ "learning_rate": 2.6190476190476192e-06,
1066
+ "loss": 0.905,
1067
+ "step": 1460
1068
+ },
1069
+ {
1070
+ "epoch": 4.375,
1071
+ "grad_norm": 5.973739147186279,
1072
+ "learning_rate": 2.5e-06,
1073
+ "loss": 0.89,
1074
+ "step": 1470
1075
+ },
1076
+ {
1077
+ "epoch": 4.404761904761905,
1078
+ "grad_norm": 1.8716737031936646,
1079
+ "learning_rate": 2.380952380952381e-06,
1080
+ "loss": 0.9635,
1081
+ "step": 1480
1082
+ },
1083
+ {
1084
+ "epoch": 4.434523809523809,
1085
+ "grad_norm": 3.3029792308807373,
1086
+ "learning_rate": 2.261904761904762e-06,
1087
+ "loss": 0.933,
1088
+ "step": 1490
1089
+ },
1090
+ {
1091
+ "epoch": 4.464285714285714,
1092
+ "grad_norm": 2.5819740295410156,
1093
+ "learning_rate": 2.1428571428571427e-06,
1094
+ "loss": 0.8899,
1095
+ "step": 1500
1096
+ },
1097
+ {
1098
+ "epoch": 4.494047619047619,
1099
+ "grad_norm": 3.5635058879852295,
1100
+ "learning_rate": 2.023809523809524e-06,
1101
+ "loss": 0.8539,
1102
+ "step": 1510
1103
+ },
1104
+ {
1105
+ "epoch": 4.523809523809524,
1106
+ "grad_norm": 2.5672874450683594,
1107
+ "learning_rate": 1.904761904761905e-06,
1108
+ "loss": 1.0972,
1109
+ "step": 1520
1110
+ },
1111
+ {
1112
+ "epoch": 4.553571428571429,
1113
+ "grad_norm": 5.11098051071167,
1114
+ "learning_rate": 1.7857142857142859e-06,
1115
+ "loss": 0.9862,
1116
+ "step": 1530
1117
+ },
1118
+ {
1119
+ "epoch": 4.583333333333333,
1120
+ "grad_norm": 2.5244972705841064,
1121
+ "learning_rate": 1.6666666666666667e-06,
1122
+ "loss": 1.0213,
1123
+ "step": 1540
1124
+ },
1125
+ {
1126
+ "epoch": 4.613095238095238,
1127
+ "grad_norm": 3.5044398307800293,
1128
+ "learning_rate": 1.5476190476190479e-06,
1129
+ "loss": 0.9144,
1130
+ "step": 1550
1131
+ },
1132
+ {
1133
+ "epoch": 4.642857142857143,
1134
+ "grad_norm": 2.4903435707092285,
1135
+ "learning_rate": 1.4285714285714286e-06,
1136
+ "loss": 0.9331,
1137
+ "step": 1560
1138
+ },
1139
+ {
1140
+ "epoch": 4.6726190476190474,
1141
+ "grad_norm": 3.208696126937866,
1142
+ "learning_rate": 1.3095238095238096e-06,
1143
+ "loss": 1.013,
1144
+ "step": 1570
1145
+ },
1146
+ {
1147
+ "epoch": 4.7023809523809526,
1148
+ "grad_norm": 2.255563735961914,
1149
+ "learning_rate": 1.1904761904761906e-06,
1150
+ "loss": 0.7625,
1151
+ "step": 1580
1152
+ },
1153
+ {
1154
+ "epoch": 4.732142857142857,
1155
+ "grad_norm": 2.1157748699188232,
1156
+ "learning_rate": 1.0714285714285714e-06,
1157
+ "loss": 0.8885,
1158
+ "step": 1590
1159
+ },
1160
+ {
1161
+ "epoch": 4.761904761904762,
1162
+ "grad_norm": 3.0076255798339844,
1163
+ "learning_rate": 9.523809523809525e-07,
1164
+ "loss": 1.0166,
1165
+ "step": 1600
1166
+ },
1167
+ {
1168
+ "epoch": 4.791666666666667,
1169
+ "grad_norm": 2.899481773376465,
1170
+ "learning_rate": 8.333333333333333e-07,
1171
+ "loss": 0.9983,
1172
+ "step": 1610
1173
+ },
1174
+ {
1175
+ "epoch": 4.821428571428571,
1176
+ "grad_norm": 6.084941387176514,
1177
+ "learning_rate": 7.142857142857143e-07,
1178
+ "loss": 1.1526,
1179
+ "step": 1620
1180
+ },
1181
+ {
1182
+ "epoch": 4.851190476190476,
1183
+ "grad_norm": 3.8710179328918457,
1184
+ "learning_rate": 5.952380952380953e-07,
1185
+ "loss": 0.8589,
1186
+ "step": 1630
1187
+ },
1188
+ {
1189
+ "epoch": 4.880952380952381,
1190
+ "grad_norm": 2.1053106784820557,
1191
+ "learning_rate": 4.7619047619047623e-07,
1192
+ "loss": 0.8788,
1193
+ "step": 1640
1194
+ },
1195
+ {
1196
+ "epoch": 4.910714285714286,
1197
+ "grad_norm": 2.2121217250823975,
1198
+ "learning_rate": 3.5714285714285716e-07,
1199
+ "loss": 0.8718,
1200
+ "step": 1650
1201
+ },
1202
+ {
1203
+ "epoch": 4.940476190476191,
1204
+ "grad_norm": 2.3137481212615967,
1205
+ "learning_rate": 2.3809523809523811e-07,
1206
+ "loss": 0.7878,
1207
+ "step": 1660
1208
+ },
1209
+ {
1210
+ "epoch": 4.970238095238095,
1211
+ "grad_norm": 2.676529884338379,
1212
+ "learning_rate": 1.1904761904761906e-07,
1213
+ "loss": 0.7782,
1214
+ "step": 1670
1215
+ },
1216
+ {
1217
+ "epoch": 5.0,
1218
+ "grad_norm": 7.775545597076416,
1219
+ "learning_rate": 0.0,
1220
+ "loss": 0.7847,
1221
+ "step": 1680
1222
+ },
1223
+ {
1224
+ "epoch": 5.0,
1225
+ "eval_accuracy": 0.9514767932489452,
1226
+ "eval_loss": 0.8745647668838501,
1227
+ "eval_runtime": 50.8678,
1228
+ "eval_samples_per_second": 9.318,
1229
+ "eval_steps_per_second": 1.18,
1230
+ "step": 1680
1231
+ },
1232
+ {
1233
+ "epoch": 5.0,
1234
+ "step": 1680,
1235
+ "total_flos": 1.0410532148820787e+18,
1236
+ "train_loss": 1.5688391100792658,
1237
+ "train_runtime": 1801.1044,
1238
+ "train_samples_per_second": 7.457,
1239
+ "train_steps_per_second": 0.933
1240
+ }
1241
+ ],
1242
+ "logging_steps": 10,
1243
+ "max_steps": 1680,
1244
+ "num_input_tokens_seen": 0,
1245
+ "num_train_epochs": 5,
1246
+ "save_steps": 500,
1247
+ "stateful_callbacks": {
1248
+ "TrainerControl": {
1249
+ "args": {
1250
+ "should_epoch_stop": false,
1251
+ "should_evaluate": false,
1252
+ "should_log": false,
1253
+ "should_save": true,
1254
+ "should_training_stop": true
1255
+ },
1256
+ "attributes": {}
1257
+ }
1258
+ },
1259
+ "total_flos": 1.0410532148820787e+18,
1260
+ "train_batch_size": 8,
1261
+ "trial_name": null,
1262
+ "trial_params": null
1263
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31662c38d27298aa5f2cd538b8973a3eb6908906cc3a005f04bcb4fc904d82b6
3
+ size 5368