meomeo163 commited on
Commit
c07a966
·
verified ·
1 Parent(s): d66d603

Upload 14 files

Browse files
added_tokens.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|câu6|>": 50261,
3
+ "<|câu8|>": 50262,
4
+ "<|endoftext|>": 50257,
5
+ "<|khổ|>": 50263,
6
+ "<|kết|>": 50259,
7
+ "<|pad|>": 50258,
8
+ "<|thơ|>": 50260
9
+ }
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.0,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.0,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_ctx": 1024,
14
+ "n_embd": 768,
15
+ "n_head": 12,
16
+ "n_inner": null,
17
+ "n_layer": 12,
18
+ "n_positions": 1024,
19
+ "reorder_and_upcast_attn": false,
20
+ "resid_pdrop": 0.0,
21
+ "scale_attn_by_inverse_layer_idx": false,
22
+ "scale_attn_weights": true,
23
+ "summary_activation": null,
24
+ "summary_first_dropout": 0.1,
25
+ "summary_proj_to_labels": true,
26
+ "summary_type": "cls_index",
27
+ "summary_use_proj": true,
28
+ "task_specific_params": {
29
+ "text-generation": {
30
+ "do_sample": true,
31
+ "max_length": 50
32
+ }
33
+ },
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.51.3",
36
+ "use_cache": true,
37
+ "vocab_size": 50264
38
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.51.3"
6
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68605f8a459cefde6fa7afa47ed6de1be5bb816422175c8aeecb53c816457c3f
3
+ size 497795712
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:652c2b0dca5a9b2b71c9428329eb11e91cbe7a52b6df58b5d95d456a4cd67b88
3
+ size 995685771
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4b94d24c5fd3f8add3c432b54f5305b9646b5d564b95bdc0317d59c71a8bc1c
3
+ size 14645
scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7d1729a0a737781a04316ceb381d8571ac00a4cfb9935801904a3a63d931dc3
3
+ size 1383
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45c7433081939794c4f3f9ca0a51bae55a2d89749e5802ef10d8b5cec4de5547
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|câu6|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ {
11
+ "content": "<|câu8|>",
12
+ "lstrip": false,
13
+ "normalized": false,
14
+ "rstrip": false,
15
+ "single_word": false
16
+ },
17
+ {
18
+ "content": "<|khổ|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ ],
25
+ "bos_token": {
26
+ "content": "<|thơ|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "eos_token": {
33
+ "content": "<|kết|>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ },
39
+ "pad_token": {
40
+ "content": "<|pad|>",
41
+ "lstrip": false,
42
+ "normalized": false,
43
+ "rstrip": false,
44
+ "single_word": false
45
+ },
46
+ "unk_token": {
47
+ "content": "<|endoftext|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false
52
+ }
53
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<s>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<pad>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "3": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "4": {
38
+ "content": "<mask>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "50257": {
46
+ "content": "<|endoftext|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "50258": {
54
+ "content": "<|pad|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "50259": {
62
+ "content": "<|kết|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "50260": {
70
+ "content": "<|thơ|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "50261": {
78
+ "content": "<|câu6|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "50262": {
86
+ "content": "<|câu8|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "50263": {
94
+ "content": "<|khổ|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ }
101
+ },
102
+ "additional_special_tokens": [
103
+ "<|câu6|>",
104
+ "<|câu8|>",
105
+ "<|khổ|>"
106
+ ],
107
+ "bos_token": "<|thơ|>",
108
+ "clean_up_tokenization_spaces": false,
109
+ "eos_token": "<|kết|>",
110
+ "errors": "replace",
111
+ "extra_special_tokens": {},
112
+ "model_max_length": 1000000000000000019884624838656,
113
+ "pad_token": "<|pad|>",
114
+ "tokenizer_class": "GPT2Tokenizer",
115
+ "unk_token": "<|endoftext|>"
116
+ }
trainer_state.json ADDED
@@ -0,0 +1,1934 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": 6681,
3
+ "best_metric": 3.168698787689209,
4
+ "best_model_checkpoint": "./luc-bat-poet-model\\checkpoint-6681",
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 6681,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.00044917324050419694,
14
+ "grad_norm": 5.473126411437988,
15
+ "learning_rate": 0.0,
16
+ "loss": 5.4778,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.011229331012604924,
21
+ "grad_norm": 0.8930467963218689,
22
+ "learning_rate": 0.00012,
23
+ "loss": 5.0452,
24
+ "step": 25
25
+ },
26
+ {
27
+ "epoch": 0.02245866202520985,
28
+ "grad_norm": 0.7617964744567871,
29
+ "learning_rate": 0.000245,
30
+ "loss": 4.5144,
31
+ "step": 50
32
+ },
33
+ {
34
+ "epoch": 0.03368799303781477,
35
+ "grad_norm": 0.8281979560852051,
36
+ "learning_rate": 0.00037,
37
+ "loss": 4.3613,
38
+ "step": 75
39
+ },
40
+ {
41
+ "epoch": 0.0449173240504197,
42
+ "grad_norm": 0.8189941644668579,
43
+ "learning_rate": 0.000495,
44
+ "loss": 4.2239,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.05614665506302462,
49
+ "grad_norm": 1.01946222782135,
50
+ "learning_rate": 0.00062,
51
+ "loss": 4.1386,
52
+ "step": 125
53
+ },
54
+ {
55
+ "epoch": 0.06737598607562954,
56
+ "grad_norm": 0.9489204287528992,
57
+ "learning_rate": 0.000745,
58
+ "loss": 4.0065,
59
+ "step": 150
60
+ },
61
+ {
62
+ "epoch": 0.07860531708823447,
63
+ "grad_norm": 1.6589257717132568,
64
+ "learning_rate": 0.00087,
65
+ "loss": 3.9373,
66
+ "step": 175
67
+ },
68
+ {
69
+ "epoch": 0.0898346481008394,
70
+ "grad_norm": 1.1351829767227173,
71
+ "learning_rate": 0.000995,
72
+ "loss": 3.9028,
73
+ "step": 200
74
+ },
75
+ {
76
+ "epoch": 0.10106397911344432,
77
+ "grad_norm": 0.7443220615386963,
78
+ "learning_rate": 0.000999995416032659,
79
+ "loss": 3.8687,
80
+ "step": 225
81
+ },
82
+ {
83
+ "epoch": 0.11229331012604923,
84
+ "grad_norm": 0.8907812833786011,
85
+ "learning_rate": 0.0009999808922703088,
86
+ "loss": 3.8144,
87
+ "step": 250
88
+ },
89
+ {
90
+ "epoch": 0.12352264113865416,
91
+ "grad_norm": 0.651372492313385,
92
+ "learning_rate": 0.0009999564210436207,
93
+ "loss": 3.8007,
94
+ "step": 275
95
+ },
96
+ {
97
+ "epoch": 0.1347519721512591,
98
+ "grad_norm": 0.7908700704574585,
99
+ "learning_rate": 0.000999922002839467,
100
+ "loss": 3.7684,
101
+ "step": 300
102
+ },
103
+ {
104
+ "epoch": 0.145981303163864,
105
+ "grad_norm": 0.6001281142234802,
106
+ "learning_rate": 0.0009998776383426215,
107
+ "loss": 3.743,
108
+ "step": 325
109
+ },
110
+ {
111
+ "epoch": 0.15721063417646894,
112
+ "grad_norm": 0.6484293937683105,
113
+ "learning_rate": 0.0009998233284357462,
114
+ "loss": 3.7416,
115
+ "step": 350
116
+ },
117
+ {
118
+ "epoch": 0.16843996518907386,
119
+ "grad_norm": 0.7714053988456726,
120
+ "learning_rate": 0.0009997590741993743,
121
+ "loss": 3.7108,
122
+ "step": 375
123
+ },
124
+ {
125
+ "epoch": 0.1796692962016788,
126
+ "grad_norm": 0.6280301213264465,
127
+ "learning_rate": 0.0009996848769118882,
128
+ "loss": 3.6854,
129
+ "step": 400
130
+ },
131
+ {
132
+ "epoch": 0.19089862721428372,
133
+ "grad_norm": 0.644794225692749,
134
+ "learning_rate": 0.0009996007380494937,
135
+ "loss": 3.6737,
136
+ "step": 425
137
+ },
138
+ {
139
+ "epoch": 0.20212795822688864,
140
+ "grad_norm": 0.6872850060462952,
141
+ "learning_rate": 0.0009995066592861919,
142
+ "loss": 3.662,
143
+ "step": 450
144
+ },
145
+ {
146
+ "epoch": 0.21335728923949357,
147
+ "grad_norm": 0.601828396320343,
148
+ "learning_rate": 0.0009994026424937441,
149
+ "loss": 3.678,
150
+ "step": 475
151
+ },
152
+ {
153
+ "epoch": 0.22458662025209847,
154
+ "grad_norm": 0.6280804872512817,
155
+ "learning_rate": 0.0009992886897416365,
156
+ "loss": 3.6652,
157
+ "step": 500
158
+ },
159
+ {
160
+ "epoch": 0.2358159512647034,
161
+ "grad_norm": 0.5711939930915833,
162
+ "learning_rate": 0.0009991648032970373,
163
+ "loss": 3.627,
164
+ "step": 525
165
+ },
166
+ {
167
+ "epoch": 0.24704528227730832,
168
+ "grad_norm": 0.5883836150169373,
169
+ "learning_rate": 0.000999030985624753,
170
+ "loss": 3.6433,
171
+ "step": 550
172
+ },
173
+ {
174
+ "epoch": 0.25827461328991325,
175
+ "grad_norm": 0.5956864356994629,
176
+ "learning_rate": 0.000998887239387178,
177
+ "loss": 3.6207,
178
+ "step": 575
179
+ },
180
+ {
181
+ "epoch": 0.2695039443025182,
182
+ "grad_norm": 0.577458918094635,
183
+ "learning_rate": 0.000998733567444243,
184
+ "loss": 3.6074,
185
+ "step": 600
186
+ },
187
+ {
188
+ "epoch": 0.2807332753151231,
189
+ "grad_norm": 0.8079866170883179,
190
+ "learning_rate": 0.0009985699728533573,
191
+ "loss": 3.6119,
192
+ "step": 625
193
+ },
194
+ {
195
+ "epoch": 0.291962606327728,
196
+ "grad_norm": 0.5645362138748169,
197
+ "learning_rate": 0.0009983964588693478,
198
+ "loss": 3.6411,
199
+ "step": 650
200
+ },
201
+ {
202
+ "epoch": 0.30319193734033295,
203
+ "grad_norm": 0.5666602253913879,
204
+ "learning_rate": 0.0009982130289443944,
205
+ "loss": 3.5725,
206
+ "step": 675
207
+ },
208
+ {
209
+ "epoch": 0.3144212683529379,
210
+ "grad_norm": 0.6160576343536377,
211
+ "learning_rate": 0.0009980196867279626,
212
+ "loss": 3.6055,
213
+ "step": 700
214
+ },
215
+ {
216
+ "epoch": 0.3256505993655428,
217
+ "grad_norm": 0.6656046509742737,
218
+ "learning_rate": 0.0009978164360667286,
219
+ "loss": 3.5776,
220
+ "step": 725
221
+ },
222
+ {
223
+ "epoch": 0.33687993037814773,
224
+ "grad_norm": 0.6236292123794556,
225
+ "learning_rate": 0.0009976032810045043,
226
+ "loss": 3.5509,
227
+ "step": 750
228
+ },
229
+ {
230
+ "epoch": 0.34810926139075266,
231
+ "grad_norm": 0.5736802816390991,
232
+ "learning_rate": 0.0009973802257821566,
233
+ "loss": 3.557,
234
+ "step": 775
235
+ },
236
+ {
237
+ "epoch": 0.3593385924033576,
238
+ "grad_norm": 0.5729912519454956,
239
+ "learning_rate": 0.000997147274837523,
240
+ "loss": 3.5374,
241
+ "step": 800
242
+ },
243
+ {
244
+ "epoch": 0.3705679234159625,
245
+ "grad_norm": 0.6009777188301086,
246
+ "learning_rate": 0.000996904432805323,
247
+ "loss": 3.5564,
248
+ "step": 825
249
+ },
250
+ {
251
+ "epoch": 0.38179725442856743,
252
+ "grad_norm": 0.5442889928817749,
253
+ "learning_rate": 0.0009966517045170659,
254
+ "loss": 3.5109,
255
+ "step": 850
256
+ },
257
+ {
258
+ "epoch": 0.39302658544117236,
259
+ "grad_norm": 0.5199196338653564,
260
+ "learning_rate": 0.0009963890950009549,
261
+ "loss": 3.526,
262
+ "step": 875
263
+ },
264
+ {
265
+ "epoch": 0.4042559164537773,
266
+ "grad_norm": 0.5444336533546448,
267
+ "learning_rate": 0.000996116609481788,
268
+ "loss": 3.5177,
269
+ "step": 900
270
+ },
271
+ {
272
+ "epoch": 0.4154852474663822,
273
+ "grad_norm": 0.5710541009902954,
274
+ "learning_rate": 0.000995834253380852,
275
+ "loss": 3.4767,
276
+ "step": 925
277
+ },
278
+ {
279
+ "epoch": 0.42671457847898714,
280
+ "grad_norm": 0.5203866362571716,
281
+ "learning_rate": 0.000995542032315816,
282
+ "loss": 3.4972,
283
+ "step": 950
284
+ },
285
+ {
286
+ "epoch": 0.43794390949159206,
287
+ "grad_norm": 0.545213520526886,
288
+ "learning_rate": 0.0009952399521006192,
289
+ "loss": 3.4623,
290
+ "step": 975
291
+ },
292
+ {
293
+ "epoch": 0.44917324050419694,
294
+ "grad_norm": 0.5384295582771301,
295
+ "learning_rate": 0.0009949280187453561,
296
+ "loss": 3.4367,
297
+ "step": 1000
298
+ },
299
+ {
300
+ "epoch": 0.46040257151680186,
301
+ "grad_norm": 0.5814492106437683,
302
+ "learning_rate": 0.0009946062384561555,
303
+ "loss": 3.441,
304
+ "step": 1025
305
+ },
306
+ {
307
+ "epoch": 0.4716319025294068,
308
+ "grad_norm": 0.5313106179237366,
309
+ "learning_rate": 0.000994274617635058,
310
+ "loss": 3.4336,
311
+ "step": 1050
312
+ },
313
+ {
314
+ "epoch": 0.4828612335420117,
315
+ "grad_norm": 0.6174953579902649,
316
+ "learning_rate": 0.0009939331628798882,
317
+ "loss": 3.3919,
318
+ "step": 1075
319
+ },
320
+ {
321
+ "epoch": 0.49409056455461664,
322
+ "grad_norm": 0.566247284412384,
323
+ "learning_rate": 0.0009935818809841239,
324
+ "loss": 3.4281,
325
+ "step": 1100
326
+ },
327
+ {
328
+ "epoch": 0.5053198955672216,
329
+ "grad_norm": 0.5577934384346008,
330
+ "learning_rate": 0.0009932207789367603,
331
+ "loss": 3.4043,
332
+ "step": 1125
333
+ },
334
+ {
335
+ "epoch": 0.5165492265798265,
336
+ "grad_norm": 0.5082374811172485,
337
+ "learning_rate": 0.0009928498639221715,
338
+ "loss": 3.3853,
339
+ "step": 1150
340
+ },
341
+ {
342
+ "epoch": 0.5277785575924314,
343
+ "grad_norm": 0.49451902508735657,
344
+ "learning_rate": 0.0009924691433199674,
345
+ "loss": 3.3794,
346
+ "step": 1175
347
+ },
348
+ {
349
+ "epoch": 0.5390078886050363,
350
+ "grad_norm": 0.5019585490226746,
351
+ "learning_rate": 0.0009920786247048464,
352
+ "loss": 3.4127,
353
+ "step": 1200
354
+ },
355
+ {
356
+ "epoch": 0.5502372196176413,
357
+ "grad_norm": 0.599949300289154,
358
+ "learning_rate": 0.0009916783158464455,
359
+ "loss": 3.3923,
360
+ "step": 1225
361
+ },
362
+ {
363
+ "epoch": 0.5614665506302462,
364
+ "grad_norm": 0.5792480707168579,
365
+ "learning_rate": 0.0009912682247091853,
366
+ "loss": 3.3656,
367
+ "step": 1250
368
+ },
369
+ {
370
+ "epoch": 0.5726958816428511,
371
+ "grad_norm": 0.5839366912841797,
372
+ "learning_rate": 0.0009908483594521116,
373
+ "loss": 3.3895,
374
+ "step": 1275
375
+ },
376
+ {
377
+ "epoch": 0.583925212655456,
378
+ "grad_norm": 0.5739409923553467,
379
+ "learning_rate": 0.0009904187284287332,
380
+ "loss": 3.3506,
381
+ "step": 1300
382
+ },
383
+ {
384
+ "epoch": 0.595154543668061,
385
+ "grad_norm": 0.5897430777549744,
386
+ "learning_rate": 0.0009899793401868546,
387
+ "loss": 3.3247,
388
+ "step": 1325
389
+ },
390
+ {
391
+ "epoch": 0.6063838746806659,
392
+ "grad_norm": 0.5063382983207703,
393
+ "learning_rate": 0.0009895302034684083,
394
+ "loss": 3.3369,
395
+ "step": 1350
396
+ },
397
+ {
398
+ "epoch": 0.6176132056932708,
399
+ "grad_norm": 0.5431721806526184,
400
+ "learning_rate": 0.0009890713272092786,
401
+ "loss": 3.3446,
402
+ "step": 1375
403
+ },
404
+ {
405
+ "epoch": 0.6288425367058758,
406
+ "grad_norm": 0.4924439489841461,
407
+ "learning_rate": 0.0009886027205391248,
408
+ "loss": 3.3377,
409
+ "step": 1400
410
+ },
411
+ {
412
+ "epoch": 0.6400718677184807,
413
+ "grad_norm": 0.54843670129776,
414
+ "learning_rate": 0.0009881243927811992,
415
+ "loss": 3.345,
416
+ "step": 1425
417
+ },
418
+ {
419
+ "epoch": 0.6513011987310856,
420
+ "grad_norm": 0.5399184226989746,
421
+ "learning_rate": 0.0009876363534521626,
422
+ "loss": 3.3344,
423
+ "step": 1450
424
+ },
425
+ {
426
+ "epoch": 0.6625305297436905,
427
+ "grad_norm": 0.5677058696746826,
428
+ "learning_rate": 0.0009871386122618933,
429
+ "loss": 3.3328,
430
+ "step": 1475
431
+ },
432
+ {
433
+ "epoch": 0.6737598607562955,
434
+ "grad_norm": 0.482010155916214,
435
+ "learning_rate": 0.0009866311791132953,
436
+ "loss": 3.3433,
437
+ "step": 1500
438
+ },
439
+ {
440
+ "epoch": 0.6849891917689004,
441
+ "grad_norm": 0.49796995520591736,
442
+ "learning_rate": 0.000986114064102101,
443
+ "loss": 3.3413,
444
+ "step": 1525
445
+ },
446
+ {
447
+ "epoch": 0.6962185227815053,
448
+ "grad_norm": 0.4976087808609009,
449
+ "learning_rate": 0.0009855872775166696,
450
+ "loss": 3.3176,
451
+ "step": 1550
452
+ },
453
+ {
454
+ "epoch": 0.7074478537941102,
455
+ "grad_norm": 0.5586963891983032,
456
+ "learning_rate": 0.0009850508298377832,
457
+ "loss": 3.3188,
458
+ "step": 1575
459
+ },
460
+ {
461
+ "epoch": 0.7186771848067152,
462
+ "grad_norm": 0.45419466495513916,
463
+ "learning_rate": 0.0009845047317384378,
464
+ "loss": 3.2902,
465
+ "step": 1600
466
+ },
467
+ {
468
+ "epoch": 0.7299065158193201,
469
+ "grad_norm": 0.5402281880378723,
470
+ "learning_rate": 0.0009839489940836317,
471
+ "loss": 3.2893,
472
+ "step": 1625
473
+ },
474
+ {
475
+ "epoch": 0.741135846831925,
476
+ "grad_norm": 0.5439639687538147,
477
+ "learning_rate": 0.0009833836279301484,
478
+ "loss": 3.3065,
479
+ "step": 1650
480
+ },
481
+ {
482
+ "epoch": 0.7523651778445299,
483
+ "grad_norm": 0.49050387740135193,
484
+ "learning_rate": 0.0009828086445263368,
485
+ "loss": 3.2796,
486
+ "step": 1675
487
+ },
488
+ {
489
+ "epoch": 0.7635945088571349,
490
+ "grad_norm": 0.5266304016113281,
491
+ "learning_rate": 0.000982224055311888,
492
+ "loss": 3.29,
493
+ "step": 1700
494
+ },
495
+ {
496
+ "epoch": 0.7748238398697398,
497
+ "grad_norm": 0.5335237979888916,
498
+ "learning_rate": 0.0009816298719176073,
499
+ "loss": 3.2889,
500
+ "step": 1725
501
+ },
502
+ {
503
+ "epoch": 0.7860531708823447,
504
+ "grad_norm": 0.5236843824386597,
505
+ "learning_rate": 0.0009810261061651826,
506
+ "loss": 3.2758,
507
+ "step": 1750
508
+ },
509
+ {
510
+ "epoch": 0.7972825018949496,
511
+ "grad_norm": 0.541266679763794,
512
+ "learning_rate": 0.0009804127700669496,
513
+ "loss": 3.3053,
514
+ "step": 1775
515
+ },
516
+ {
517
+ "epoch": 0.8085118329075546,
518
+ "grad_norm": 0.47298872470855713,
519
+ "learning_rate": 0.0009797898758256525,
520
+ "loss": 3.277,
521
+ "step": 1800
522
+ },
523
+ {
524
+ "epoch": 0.8197411639201595,
525
+ "grad_norm": 0.5097940564155579,
526
+ "learning_rate": 0.0009791574358342014,
527
+ "loss": 3.2537,
528
+ "step": 1825
529
+ },
530
+ {
531
+ "epoch": 0.8309704949327644,
532
+ "grad_norm": 0.5668537616729736,
533
+ "learning_rate": 0.0009785154626754259,
534
+ "loss": 3.2514,
535
+ "step": 1850
536
+ },
537
+ {
538
+ "epoch": 0.8421998259453694,
539
+ "grad_norm": 0.5071256160736084,
540
+ "learning_rate": 0.000977863969121824,
541
+ "loss": 3.2467,
542
+ "step": 1875
543
+ },
544
+ {
545
+ "epoch": 0.8534291569579743,
546
+ "grad_norm": 0.4892118573188782,
547
+ "learning_rate": 0.000977202968135309,
548
+ "loss": 3.2772,
549
+ "step": 1900
550
+ },
551
+ {
552
+ "epoch": 0.8646584879705792,
553
+ "grad_norm": 0.463008850812912,
554
+ "learning_rate": 0.000976532472866951,
555
+ "loss": 3.2492,
556
+ "step": 1925
557
+ },
558
+ {
559
+ "epoch": 0.8758878189831841,
560
+ "grad_norm": 0.5179749727249146,
561
+ "learning_rate": 0.0009758524966567152,
562
+ "loss": 3.2289,
563
+ "step": 1950
564
+ },
565
+ {
566
+ "epoch": 0.8871171499957891,
567
+ "grad_norm": 0.5276876091957092,
568
+ "learning_rate": 0.000975163053033197,
569
+ "loss": 3.2474,
570
+ "step": 1975
571
+ },
572
+ {
573
+ "epoch": 0.8983464810083939,
574
+ "grad_norm": 0.5618298649787903,
575
+ "learning_rate": 0.000974464155713352,
576
+ "loss": 3.2652,
577
+ "step": 2000
578
+ },
579
+ {
580
+ "epoch": 0.9095758120209988,
581
+ "grad_norm": 0.4800003468990326,
582
+ "learning_rate": 0.0009737558186022242,
583
+ "loss": 3.2424,
584
+ "step": 2025
585
+ },
586
+ {
587
+ "epoch": 0.9208051430336037,
588
+ "grad_norm": 0.47998154163360596,
589
+ "learning_rate": 0.0009730380557926682,
590
+ "loss": 3.2737,
591
+ "step": 2050
592
+ },
593
+ {
594
+ "epoch": 0.9320344740462086,
595
+ "grad_norm": 0.5447694659233093,
596
+ "learning_rate": 0.00097231088156507,
597
+ "loss": 3.249,
598
+ "step": 2075
599
+ },
600
+ {
601
+ "epoch": 0.9432638050588136,
602
+ "grad_norm": 0.5559055209159851,
603
+ "learning_rate": 0.0009715743103870615,
604
+ "loss": 3.2566,
605
+ "step": 2100
606
+ },
607
+ {
608
+ "epoch": 0.9544931360714185,
609
+ "grad_norm": 0.478614866733551,
610
+ "learning_rate": 0.0009708283569132341,
611
+ "loss": 3.2076,
612
+ "step": 2125
613
+ },
614
+ {
615
+ "epoch": 0.9657224670840234,
616
+ "grad_norm": 0.44457143545150757,
617
+ "learning_rate": 0.000970073035984846,
618
+ "loss": 3.2052,
619
+ "step": 2150
620
+ },
621
+ {
622
+ "epoch": 0.9769517980966284,
623
+ "grad_norm": 0.5057160258293152,
624
+ "learning_rate": 0.0009693083626295274,
625
+ "loss": 3.1944,
626
+ "step": 2175
627
+ },
628
+ {
629
+ "epoch": 0.9881811291092333,
630
+ "grad_norm": 0.487543523311615,
631
+ "learning_rate": 0.0009685343520609816,
632
+ "loss": 3.2862,
633
+ "step": 2200
634
+ },
635
+ {
636
+ "epoch": 0.9994104601218382,
637
+ "grad_norm": 0.5547086000442505,
638
+ "learning_rate": 0.0009677510196786822,
639
+ "loss": 3.2249,
640
+ "step": 2225
641
+ },
642
+ {
643
+ "epoch": 1.0,
644
+ "eval_loss": 3.2925968170166016,
645
+ "eval_runtime": 230.4828,
646
+ "eval_samples_per_second": 54.598,
647
+ "eval_steps_per_second": 54.598,
648
+ "step": 2227
649
+ },
650
+ {
651
+ "epoch": 1.0103309845315964,
652
+ "grad_norm": 0.5509684085845947,
653
+ "learning_rate": 0.0009669583810675666,
654
+ "loss": 3.0297,
655
+ "step": 2250
656
+ },
657
+ {
658
+ "epoch": 1.0215603155442015,
659
+ "grad_norm": 0.5036989450454712,
660
+ "learning_rate": 0.0009661564519977263,
661
+ "loss": 2.9815,
662
+ "step": 2275
663
+ },
664
+ {
665
+ "epoch": 1.0327896465568063,
666
+ "grad_norm": 0.5602796673774719,
667
+ "learning_rate": 0.0009653452484240923,
668
+ "loss": 2.994,
669
+ "step": 2300
670
+ },
671
+ {
672
+ "epoch": 1.0440189775694113,
673
+ "grad_norm": 0.5729214549064636,
674
+ "learning_rate": 0.0009645247864861191,
675
+ "loss": 2.9956,
676
+ "step": 2325
677
+ },
678
+ {
679
+ "epoch": 1.0552483085820161,
680
+ "grad_norm": 0.5456846356391907,
681
+ "learning_rate": 0.0009636950825074618,
682
+ "loss": 2.985,
683
+ "step": 2350
684
+ },
685
+ {
686
+ "epoch": 1.0664776395946212,
687
+ "grad_norm": 0.544643223285675,
688
+ "learning_rate": 0.0009628561529956529,
689
+ "loss": 2.9973,
690
+ "step": 2375
691
+ },
692
+ {
693
+ "epoch": 1.077706970607226,
694
+ "grad_norm": 0.5306060314178467,
695
+ "learning_rate": 0.0009620080146417731,
696
+ "loss": 3.0053,
697
+ "step": 2400
698
+ },
699
+ {
700
+ "epoch": 1.088936301619831,
701
+ "grad_norm": 0.5001072883605957,
702
+ "learning_rate": 0.0009611506843201193,
703
+ "loss": 3.0244,
704
+ "step": 2425
705
+ },
706
+ {
707
+ "epoch": 1.1001656326324358,
708
+ "grad_norm": 0.52583909034729,
709
+ "learning_rate": 0.0009602841790878688,
710
+ "loss": 3.0266,
711
+ "step": 2450
712
+ },
713
+ {
714
+ "epoch": 1.1113949636450409,
715
+ "grad_norm": 0.536445677280426,
716
+ "learning_rate": 0.0009594085161847405,
717
+ "loss": 3.0124,
718
+ "step": 2475
719
+ },
720
+ {
721
+ "epoch": 1.1226242946576457,
722
+ "grad_norm": 0.5341405272483826,
723
+ "learning_rate": 0.0009585237130326508,
724
+ "loss": 3.0272,
725
+ "step": 2500
726
+ },
727
+ {
728
+ "epoch": 1.1338536256702507,
729
+ "grad_norm": 0.5340954661369324,
730
+ "learning_rate": 0.0009576297872353686,
731
+ "loss": 3.0152,
732
+ "step": 2525
733
+ },
734
+ {
735
+ "epoch": 1.1450829566828555,
736
+ "grad_norm": 0.4479193687438965,
737
+ "learning_rate": 0.0009567267565781628,
738
+ "loss": 3.0202,
739
+ "step": 2550
740
+ },
741
+ {
742
+ "epoch": 1.1563122876954606,
743
+ "grad_norm": 0.5316035747528076,
744
+ "learning_rate": 0.0009558146390274512,
745
+ "loss": 3.015,
746
+ "step": 2575
747
+ },
748
+ {
749
+ "epoch": 1.1675416187080654,
750
+ "grad_norm": 0.5239371061325073,
751
+ "learning_rate": 0.0009548934527304407,
752
+ "loss": 3.0618,
753
+ "step": 2600
754
+ },
755
+ {
756
+ "epoch": 1.1787709497206704,
757
+ "grad_norm": 0.6486944556236267,
758
+ "learning_rate": 0.0009539632160147672,
759
+ "loss": 3.0004,
760
+ "step": 2625
761
+ },
762
+ {
763
+ "epoch": 1.1900002807332752,
764
+ "grad_norm": 0.5308857560157776,
765
+ "learning_rate": 0.0009530239473881313,
766
+ "loss": 3.0425,
767
+ "step": 2650
768
+ },
769
+ {
770
+ "epoch": 1.2012296117458803,
771
+ "grad_norm": 0.5612149834632874,
772
+ "learning_rate": 0.0009520756655379293,
773
+ "loss": 3.0447,
774
+ "step": 2675
775
+ },
776
+ {
777
+ "epoch": 1.212458942758485,
778
+ "grad_norm": 0.5429418683052063,
779
+ "learning_rate": 0.0009511183893308821,
780
+ "loss": 2.9887,
781
+ "step": 2700
782
+ },
783
+ {
784
+ "epoch": 1.2236882737710901,
785
+ "grad_norm": 0.5688816905021667,
786
+ "learning_rate": 0.0009501521378126594,
787
+ "loss": 2.9961,
788
+ "step": 2725
789
+ },
790
+ {
791
+ "epoch": 1.234917604783695,
792
+ "grad_norm": 0.5409512519836426,
793
+ "learning_rate": 0.0009491769302075008,
794
+ "loss": 3.0,
795
+ "step": 2750
796
+ },
797
+ {
798
+ "epoch": 1.2461469357963,
799
+ "grad_norm": 0.5384955406188965,
800
+ "learning_rate": 0.0009481927859178337,
801
+ "loss": 3.0271,
802
+ "step": 2775
803
+ },
804
+ {
805
+ "epoch": 1.2573762668089048,
806
+ "grad_norm": 0.5857961177825928,
807
+ "learning_rate": 0.0009471997245238865,
808
+ "loss": 2.9983,
809
+ "step": 2800
810
+ },
811
+ {
812
+ "epoch": 1.2686055978215098,
813
+ "grad_norm": 0.5337027907371521,
814
+ "learning_rate": 0.0009461977657833003,
815
+ "loss": 3.0552,
816
+ "step": 2825
817
+ },
818
+ {
819
+ "epoch": 1.2798349288341146,
820
+ "grad_norm": 0.5078946352005005,
821
+ "learning_rate": 0.0009451869296307341,
822
+ "loss": 3.0191,
823
+ "step": 2850
824
+ },
825
+ {
826
+ "epoch": 1.2910642598467197,
827
+ "grad_norm": 0.5108660459518433,
828
+ "learning_rate": 0.00094416723617747,
829
+ "loss": 3.0234,
830
+ "step": 2875
831
+ },
832
+ {
833
+ "epoch": 1.3022935908593245,
834
+ "grad_norm": 0.5631129741668701,
835
+ "learning_rate": 0.0009431387057110118,
836
+ "loss": 3.0319,
837
+ "step": 2900
838
+ },
839
+ {
840
+ "epoch": 1.3135229218719295,
841
+ "grad_norm": 0.5249589085578918,
842
+ "learning_rate": 0.0009421013586946816,
843
+ "loss": 2.9866,
844
+ "step": 2925
845
+ },
846
+ {
847
+ "epoch": 1.3247522528845344,
848
+ "grad_norm": 0.4992469251155853,
849
+ "learning_rate": 0.000941055215767213,
850
+ "loss": 3.0144,
851
+ "step": 2950
852
+ },
853
+ {
854
+ "epoch": 1.3359815838971394,
855
+ "grad_norm": 0.4509263336658478,
856
+ "learning_rate": 0.0009400002977423405,
857
+ "loss": 3.0092,
858
+ "step": 2975
859
+ },
860
+ {
861
+ "epoch": 1.3472109149097442,
862
+ "grad_norm": 0.515438973903656,
863
+ "learning_rate": 0.0009389366256083849,
864
+ "loss": 2.9993,
865
+ "step": 3000
866
+ },
867
+ {
868
+ "epoch": 1.3584402459223492,
869
+ "grad_norm": 0.5087840557098389,
870
+ "learning_rate": 0.0009378642205278363,
871
+ "loss": 3.0242,
872
+ "step": 3025
873
+ },
874
+ {
875
+ "epoch": 1.369669576934954,
876
+ "grad_norm": 0.5046051144599915,
877
+ "learning_rate": 0.0009367831038369326,
878
+ "loss": 2.9971,
879
+ "step": 3050
880
+ },
881
+ {
882
+ "epoch": 1.380898907947559,
883
+ "grad_norm": 0.5728681087493896,
884
+ "learning_rate": 0.0009356932970452353,
885
+ "loss": 3.0292,
886
+ "step": 3075
887
+ },
888
+ {
889
+ "epoch": 1.392128238960164,
890
+ "grad_norm": 0.5724380016326904,
891
+ "learning_rate": 0.0009345948218352014,
892
+ "loss": 3.0098,
893
+ "step": 3100
894
+ },
895
+ {
896
+ "epoch": 1.403357569972769,
897
+ "grad_norm": 0.5322164297103882,
898
+ "learning_rate": 0.0009334877000617518,
899
+ "loss": 2.9968,
900
+ "step": 3125
901
+ },
902
+ {
903
+ "epoch": 1.4145869009853738,
904
+ "grad_norm": 0.558423638343811,
905
+ "learning_rate": 0.0009323719537518374,
906
+ "loss": 3.0334,
907
+ "step": 3150
908
+ },
909
+ {
910
+ "epoch": 1.4258162319979788,
911
+ "grad_norm": 0.5415078997612,
912
+ "learning_rate": 0.0009312476051039994,
913
+ "loss": 3.0313,
914
+ "step": 3175
915
+ },
916
+ {
917
+ "epoch": 1.4370455630105836,
918
+ "grad_norm": 0.46919873356819153,
919
+ "learning_rate": 0.0009301146764879292,
920
+ "loss": 2.9992,
921
+ "step": 3200
922
+ },
923
+ {
924
+ "epoch": 1.4482748940231884,
925
+ "grad_norm": 0.5965465903282166,
926
+ "learning_rate": 0.0009289731904440217,
927
+ "loss": 3.0071,
928
+ "step": 3225
929
+ },
930
+ {
931
+ "epoch": 1.4595042250357935,
932
+ "grad_norm": 0.4882059693336487,
933
+ "learning_rate": 0.0009278231696829288,
934
+ "loss": 2.968,
935
+ "step": 3250
936
+ },
937
+ {
938
+ "epoch": 1.4707335560483985,
939
+ "grad_norm": 0.6297493577003479,
940
+ "learning_rate": 0.0009266646370851055,
941
+ "loss": 3.0411,
942
+ "step": 3275
943
+ },
944
+ {
945
+ "epoch": 1.4819628870610033,
946
+ "grad_norm": 0.5603842735290527,
947
+ "learning_rate": 0.0009254976157003563,
948
+ "loss": 3.0203,
949
+ "step": 3300
950
+ },
951
+ {
952
+ "epoch": 1.4931922180736081,
953
+ "grad_norm": 0.49509698152542114,
954
+ "learning_rate": 0.0009243221287473755,
955
+ "loss": 3.0176,
956
+ "step": 3325
957
+ },
958
+ {
959
+ "epoch": 1.5044215490862132,
960
+ "grad_norm": 0.48536983132362366,
961
+ "learning_rate": 0.0009231381996132862,
962
+ "loss": 2.9547,
963
+ "step": 3350
964
+ },
965
+ {
966
+ "epoch": 1.5156508800988182,
967
+ "grad_norm": 0.47351208329200745,
968
+ "learning_rate": 0.0009219458518531739,
969
+ "loss": 2.9666,
970
+ "step": 3375
971
+ },
972
+ {
973
+ "epoch": 1.526880211111423,
974
+ "grad_norm": 0.5615521669387817,
975
+ "learning_rate": 0.0009207451091896191,
976
+ "loss": 3.0295,
977
+ "step": 3400
978
+ },
979
+ {
980
+ "epoch": 1.5381095421240278,
981
+ "grad_norm": 0.5138916969299316,
982
+ "learning_rate": 0.0009195359955122244,
983
+ "loss": 3.0146,
984
+ "step": 3425
985
+ },
986
+ {
987
+ "epoch": 1.5493388731366329,
988
+ "grad_norm": 0.5883649587631226,
989
+ "learning_rate": 0.0009183185348771392,
990
+ "loss": 3.0151,
991
+ "step": 3450
992
+ },
993
+ {
994
+ "epoch": 1.560568204149238,
995
+ "grad_norm": 0.5921751260757446,
996
+ "learning_rate": 0.0009170927515065821,
997
+ "loss": 3.0314,
998
+ "step": 3475
999
+ },
1000
+ {
1001
+ "epoch": 1.5717975351618427,
1002
+ "grad_norm": 0.5592530965805054,
1003
+ "learning_rate": 0.0009158586697883576,
1004
+ "loss": 2.9921,
1005
+ "step": 3500
1006
+ },
1007
+ {
1008
+ "epoch": 1.5830268661744475,
1009
+ "grad_norm": 0.5621814727783203,
1010
+ "learning_rate": 0.0009146163142753716,
1011
+ "loss": 2.9987,
1012
+ "step": 3525
1013
+ },
1014
+ {
1015
+ "epoch": 1.5942561971870526,
1016
+ "grad_norm": 0.5482603311538696,
1017
+ "learning_rate": 0.0009133657096851431,
1018
+ "loss": 2.9802,
1019
+ "step": 3550
1020
+ },
1021
+ {
1022
+ "epoch": 1.6054855281996576,
1023
+ "grad_norm": 0.5254377722740173,
1024
+ "learning_rate": 0.0009121068808993124,
1025
+ "loss": 3.0121,
1026
+ "step": 3575
1027
+ },
1028
+ {
1029
+ "epoch": 1.6167148592122624,
1030
+ "grad_norm": 0.47623664140701294,
1031
+ "learning_rate": 0.0009108398529631451,
1032
+ "loss": 3.0068,
1033
+ "step": 3600
1034
+ },
1035
+ {
1036
+ "epoch": 1.6279441902248672,
1037
+ "grad_norm": 0.49733710289001465,
1038
+ "learning_rate": 0.0009095646510850351,
1039
+ "loss": 3.0104,
1040
+ "step": 3625
1041
+ },
1042
+ {
1043
+ "epoch": 1.6391735212374723,
1044
+ "grad_norm": 0.5388875603675842,
1045
+ "learning_rate": 0.0009082813006360026,
1046
+ "loss": 2.9823,
1047
+ "step": 3650
1048
+ },
1049
+ {
1050
+ "epoch": 1.6504028522500773,
1051
+ "grad_norm": 0.5329872965812683,
1052
+ "learning_rate": 0.0009069898271491887,
1053
+ "loss": 2.9945,
1054
+ "step": 3675
1055
+ },
1056
+ {
1057
+ "epoch": 1.6616321832626821,
1058
+ "grad_norm": 0.5175071358680725,
1059
+ "learning_rate": 0.0009056902563193486,
1060
+ "loss": 2.9875,
1061
+ "step": 3700
1062
+ },
1063
+ {
1064
+ "epoch": 1.672861514275287,
1065
+ "grad_norm": 0.514216423034668,
1066
+ "learning_rate": 0.0009043826140023388,
1067
+ "loss": 3.016,
1068
+ "step": 3725
1069
+ },
1070
+ {
1071
+ "epoch": 1.684090845287892,
1072
+ "grad_norm": 0.5547803640365601,
1073
+ "learning_rate": 0.0009030669262146046,
1074
+ "loss": 2.9906,
1075
+ "step": 3750
1076
+ },
1077
+ {
1078
+ "epoch": 1.695320176300497,
1079
+ "grad_norm": 0.5035697817802429,
1080
+ "learning_rate": 0.0009017432191326611,
1081
+ "loss": 2.9795,
1082
+ "step": 3775
1083
+ },
1084
+ {
1085
+ "epoch": 1.7065495073131018,
1086
+ "grad_norm": 0.4960135519504547,
1087
+ "learning_rate": 0.0009004115190925724,
1088
+ "loss": 2.986,
1089
+ "step": 3800
1090
+ },
1091
+ {
1092
+ "epoch": 1.7177788383257067,
1093
+ "grad_norm": 0.5573786497116089,
1094
+ "learning_rate": 0.0008990718525894286,
1095
+ "loss": 2.9981,
1096
+ "step": 3825
1097
+ },
1098
+ {
1099
+ "epoch": 1.7290081693383117,
1100
+ "grad_norm": 0.558542788028717,
1101
+ "learning_rate": 0.0008977242462768177,
1102
+ "loss": 3.0122,
1103
+ "step": 3850
1104
+ },
1105
+ {
1106
+ "epoch": 1.7402375003509167,
1107
+ "grad_norm": 0.48205050826072693,
1108
+ "learning_rate": 0.0008963687269662957,
1109
+ "loss": 2.9558,
1110
+ "step": 3875
1111
+ },
1112
+ {
1113
+ "epoch": 1.7514668313635215,
1114
+ "grad_norm": 0.48525846004486084,
1115
+ "learning_rate": 0.0008950053216268534,
1116
+ "loss": 3.0034,
1117
+ "step": 3900
1118
+ },
1119
+ {
1120
+ "epoch": 1.7626961623761264,
1121
+ "grad_norm": 0.5863490700721741,
1122
+ "learning_rate": 0.0008936340573843795,
1123
+ "loss": 3.0222,
1124
+ "step": 3925
1125
+ },
1126
+ {
1127
+ "epoch": 1.7739254933887314,
1128
+ "grad_norm": 0.54740309715271,
1129
+ "learning_rate": 0.0008922549615211206,
1130
+ "loss": 2.9785,
1131
+ "step": 3950
1132
+ },
1133
+ {
1134
+ "epoch": 1.7851548244013364,
1135
+ "grad_norm": 0.5275555849075317,
1136
+ "learning_rate": 0.0008908680614751392,
1137
+ "loss": 2.982,
1138
+ "step": 3975
1139
+ },
1140
+ {
1141
+ "epoch": 1.7963841554139413,
1142
+ "grad_norm": 0.5472078919410706,
1143
+ "learning_rate": 0.0008894733848397674,
1144
+ "loss": 3.0128,
1145
+ "step": 4000
1146
+ },
1147
+ {
1148
+ "epoch": 1.807613486426546,
1149
+ "grad_norm": 0.5604407787322998,
1150
+ "learning_rate": 0.0008880709593630578,
1151
+ "loss": 3.0119,
1152
+ "step": 4025
1153
+ },
1154
+ {
1155
+ "epoch": 1.818842817439151,
1156
+ "grad_norm": 0.5137823224067688,
1157
+ "learning_rate": 0.0008866608129472313,
1158
+ "loss": 2.9858,
1159
+ "step": 4050
1160
+ },
1161
+ {
1162
+ "epoch": 1.8300721484517561,
1163
+ "grad_norm": 0.5707024931907654,
1164
+ "learning_rate": 0.0008852429736481227,
1165
+ "loss": 3.013,
1166
+ "step": 4075
1167
+ },
1168
+ {
1169
+ "epoch": 1.841301479464361,
1170
+ "grad_norm": 0.5344915986061096,
1171
+ "learning_rate": 0.0008838174696746215,
1172
+ "loss": 2.9899,
1173
+ "step": 4100
1174
+ },
1175
+ {
1176
+ "epoch": 1.8525308104769658,
1177
+ "grad_norm": 0.504295289516449,
1178
+ "learning_rate": 0.0008823843293881117,
1179
+ "loss": 3.0095,
1180
+ "step": 4125
1181
+ },
1182
+ {
1183
+ "epoch": 1.8637601414895708,
1184
+ "grad_norm": 0.5654752254486084,
1185
+ "learning_rate": 0.0008809435813019065,
1186
+ "loss": 2.988,
1187
+ "step": 4150
1188
+ },
1189
+ {
1190
+ "epoch": 1.8749894725021758,
1191
+ "grad_norm": 0.5086371302604675,
1192
+ "learning_rate": 0.0008794952540806817,
1193
+ "loss": 3.0304,
1194
+ "step": 4175
1195
+ },
1196
+ {
1197
+ "epoch": 1.8862188035147807,
1198
+ "grad_norm": 0.5218560099601746,
1199
+ "learning_rate": 0.0008780393765399055,
1200
+ "loss": 2.9817,
1201
+ "step": 4200
1202
+ },
1203
+ {
1204
+ "epoch": 1.8974481345273855,
1205
+ "grad_norm": 0.48831528425216675,
1206
+ "learning_rate": 0.0008765759776452646,
1207
+ "loss": 3.0245,
1208
+ "step": 4225
1209
+ },
1210
+ {
1211
+ "epoch": 1.9086774655399905,
1212
+ "grad_norm": 0.5015767812728882,
1213
+ "learning_rate": 0.0008751050865120882,
1214
+ "loss": 3.0238,
1215
+ "step": 4250
1216
+ },
1217
+ {
1218
+ "epoch": 1.9199067965525953,
1219
+ "grad_norm": 0.5325757265090942,
1220
+ "learning_rate": 0.000873626732404769,
1221
+ "loss": 2.9993,
1222
+ "step": 4275
1223
+ },
1224
+ {
1225
+ "epoch": 1.9311361275652001,
1226
+ "grad_norm": 0.48629334568977356,
1227
+ "learning_rate": 0.0008721409447361803,
1228
+ "loss": 2.9634,
1229
+ "step": 4300
1230
+ },
1231
+ {
1232
+ "epoch": 1.9423654585778052,
1233
+ "grad_norm": 0.49022358655929565,
1234
+ "learning_rate": 0.0008706477530670917,
1235
+ "loss": 2.9736,
1236
+ "step": 4325
1237
+ },
1238
+ {
1239
+ "epoch": 1.9535947895904102,
1240
+ "grad_norm": 0.5039647221565247,
1241
+ "learning_rate": 0.0008691471871055801,
1242
+ "loss": 2.9802,
1243
+ "step": 4350
1244
+ },
1245
+ {
1246
+ "epoch": 1.964824120603015,
1247
+ "grad_norm": 0.5223824977874756,
1248
+ "learning_rate": 0.0008676392767064391,
1249
+ "loss": 3.0397,
1250
+ "step": 4375
1251
+ },
1252
+ {
1253
+ "epoch": 1.9760534516156198,
1254
+ "grad_norm": 0.5172558426856995,
1255
+ "learning_rate": 0.0008661240518705854,
1256
+ "loss": 2.9756,
1257
+ "step": 4400
1258
+ },
1259
+ {
1260
+ "epoch": 1.9872827826282249,
1261
+ "grad_norm": 0.4955403208732605,
1262
+ "learning_rate": 0.0008646015427444609,
1263
+ "loss": 2.9748,
1264
+ "step": 4425
1265
+ },
1266
+ {
1267
+ "epoch": 1.99851211364083,
1268
+ "grad_norm": 0.553287923336029,
1269
+ "learning_rate": 0.0008630717796194337,
1270
+ "loss": 2.9501,
1271
+ "step": 4450
1272
+ },
1273
+ {
1274
+ "epoch": 2.0,
1275
+ "eval_loss": 3.1917288303375244,
1276
+ "eval_runtime": 227.8959,
1277
+ "eval_samples_per_second": 55.218,
1278
+ "eval_steps_per_second": 55.218,
1279
+ "step": 4454
1280
+ },
1281
+ {
1282
+ "epoch": 2.0094326380505882,
1283
+ "grad_norm": 0.5545716285705566,
1284
+ "learning_rate": 0.0008615347929311949,
1285
+ "loss": 2.7426,
1286
+ "step": 4475
1287
+ },
1288
+ {
1289
+ "epoch": 2.020661969063193,
1290
+ "grad_norm": 0.5116350650787354,
1291
+ "learning_rate": 0.0008599906132591541,
1292
+ "loss": 2.6669,
1293
+ "step": 4500
1294
+ },
1295
+ {
1296
+ "epoch": 2.031891300075798,
1297
+ "grad_norm": 0.6248686909675598,
1298
+ "learning_rate": 0.0008584392713258295,
1299
+ "loss": 2.6597,
1300
+ "step": 4525
1301
+ },
1302
+ {
1303
+ "epoch": 2.043120631088403,
1304
+ "grad_norm": 0.5305931568145752,
1305
+ "learning_rate": 0.0008568807979962379,
1306
+ "loss": 2.6635,
1307
+ "step": 4550
1308
+ },
1309
+ {
1310
+ "epoch": 2.054349962101008,
1311
+ "grad_norm": 0.5366395711898804,
1312
+ "learning_rate": 0.0008553152242772798,
1313
+ "loss": 2.668,
1314
+ "step": 4575
1315
+ },
1316
+ {
1317
+ "epoch": 2.0655792931136125,
1318
+ "grad_norm": 0.6074578762054443,
1319
+ "learning_rate": 0.0008537425813171232,
1320
+ "loss": 2.7031,
1321
+ "step": 4600
1322
+ },
1323
+ {
1324
+ "epoch": 2.0768086241262176,
1325
+ "grad_norm": 0.6074210405349731,
1326
+ "learning_rate": 0.0008521629004045832,
1327
+ "loss": 2.6721,
1328
+ "step": 4625
1329
+ },
1330
+ {
1331
+ "epoch": 2.0880379551388226,
1332
+ "grad_norm": 0.6065968871116638,
1333
+ "learning_rate": 0.0008505762129685002,
1334
+ "loss": 2.6774,
1335
+ "step": 4650
1336
+ },
1337
+ {
1338
+ "epoch": 2.0992672861514277,
1339
+ "grad_norm": 0.5635676383972168,
1340
+ "learning_rate": 0.0008489825505771136,
1341
+ "loss": 2.6537,
1342
+ "step": 4675
1343
+ },
1344
+ {
1345
+ "epoch": 2.1104966171640323,
1346
+ "grad_norm": 0.5447210669517517,
1347
+ "learning_rate": 0.000847381944937435,
1348
+ "loss": 2.6964,
1349
+ "step": 4700
1350
+ },
1351
+ {
1352
+ "epoch": 2.1217259481766373,
1353
+ "grad_norm": 0.6450474858283997,
1354
+ "learning_rate": 0.0008457744278946162,
1355
+ "loss": 2.6591,
1356
+ "step": 4725
1357
+ },
1358
+ {
1359
+ "epoch": 2.1329552791892423,
1360
+ "grad_norm": 0.5620629787445068,
1361
+ "learning_rate": 0.0008441600314313165,
1362
+ "loss": 2.6787,
1363
+ "step": 4750
1364
+ },
1365
+ {
1366
+ "epoch": 2.1441846102018474,
1367
+ "grad_norm": 0.5661433935165405,
1368
+ "learning_rate": 0.0008425387876670658,
1369
+ "loss": 2.7193,
1370
+ "step": 4775
1371
+ },
1372
+ {
1373
+ "epoch": 2.155413941214452,
1374
+ "grad_norm": 0.6069077849388123,
1375
+ "learning_rate": 0.0008409107288576259,
1376
+ "loss": 2.6947,
1377
+ "step": 4800
1378
+ },
1379
+ {
1380
+ "epoch": 2.166643272227057,
1381
+ "grad_norm": 0.6271758675575256,
1382
+ "learning_rate": 0.0008392758873943484,
1383
+ "loss": 2.6952,
1384
+ "step": 4825
1385
+ },
1386
+ {
1387
+ "epoch": 2.177872603239662,
1388
+ "grad_norm": 0.5473480820655823,
1389
+ "learning_rate": 0.0008376342958035308,
1390
+ "loss": 2.6981,
1391
+ "step": 4850
1392
+ },
1393
+ {
1394
+ "epoch": 2.189101934252267,
1395
+ "grad_norm": 0.631519079208374,
1396
+ "learning_rate": 0.0008359859867457686,
1397
+ "loss": 2.6921,
1398
+ "step": 4875
1399
+ },
1400
+ {
1401
+ "epoch": 2.2003312652648717,
1402
+ "grad_norm": 0.6639063954353333,
1403
+ "learning_rate": 0.0008343309930153064,
1404
+ "loss": 2.6837,
1405
+ "step": 4900
1406
+ },
1407
+ {
1408
+ "epoch": 2.2115605962774767,
1409
+ "grad_norm": 0.6233927011489868,
1410
+ "learning_rate": 0.0008326693475393846,
1411
+ "loss": 2.7112,
1412
+ "step": 4925
1413
+ },
1414
+ {
1415
+ "epoch": 2.2227899272900817,
1416
+ "grad_norm": 0.636464536190033,
1417
+ "learning_rate": 0.0008310010833775849,
1418
+ "loss": 2.7213,
1419
+ "step": 4950
1420
+ },
1421
+ {
1422
+ "epoch": 2.2340192583026868,
1423
+ "grad_norm": 0.5854448676109314,
1424
+ "learning_rate": 0.0008293262337211723,
1425
+ "loss": 2.7131,
1426
+ "step": 4975
1427
+ },
1428
+ {
1429
+ "epoch": 2.2452485893152914,
1430
+ "grad_norm": 0.6958891749382019,
1431
+ "learning_rate": 0.0008276448318924346,
1432
+ "loss": 2.6883,
1433
+ "step": 5000
1434
+ },
1435
+ {
1436
+ "epoch": 2.2564779203278964,
1437
+ "grad_norm": 0.5899659991264343,
1438
+ "learning_rate": 0.0008259569113440198,
1439
+ "loss": 2.6872,
1440
+ "step": 5025
1441
+ },
1442
+ {
1443
+ "epoch": 2.2677072513405014,
1444
+ "grad_norm": 0.6791245937347412,
1445
+ "learning_rate": 0.0008242625056582698,
1446
+ "loss": 2.7202,
1447
+ "step": 5050
1448
+ },
1449
+ {
1450
+ "epoch": 2.2789365823531065,
1451
+ "grad_norm": 0.5778390169143677,
1452
+ "learning_rate": 0.0008225616485465535,
1453
+ "loss": 2.7153,
1454
+ "step": 5075
1455
+ },
1456
+ {
1457
+ "epoch": 2.290165913365711,
1458
+ "grad_norm": 0.5727918148040771,
1459
+ "learning_rate": 0.000820854373848595,
1460
+ "loss": 2.7314,
1461
+ "step": 5100
1462
+ },
1463
+ {
1464
+ "epoch": 2.301395244378316,
1465
+ "grad_norm": 0.6461373567581177,
1466
+ "learning_rate": 0.0008191407155318007,
1467
+ "loss": 2.6973,
1468
+ "step": 5125
1469
+ },
1470
+ {
1471
+ "epoch": 2.312624575390921,
1472
+ "grad_norm": 0.6795935034751892,
1473
+ "learning_rate": 0.0008174207076905835,
1474
+ "loss": 2.6605,
1475
+ "step": 5150
1476
+ },
1477
+ {
1478
+ "epoch": 2.323853906403526,
1479
+ "grad_norm": 0.6217265725135803,
1480
+ "learning_rate": 0.0008156943845456843,
1481
+ "loss": 2.6715,
1482
+ "step": 5175
1483
+ },
1484
+ {
1485
+ "epoch": 2.3350832374161308,
1486
+ "grad_norm": 0.6071234941482544,
1487
+ "learning_rate": 0.0008139617804434918,
1488
+ "loss": 2.6806,
1489
+ "step": 5200
1490
+ },
1491
+ {
1492
+ "epoch": 2.346312568428736,
1493
+ "grad_norm": 0.6218218207359314,
1494
+ "learning_rate": 0.0008122229298553583,
1495
+ "loss": 2.7077,
1496
+ "step": 5225
1497
+ },
1498
+ {
1499
+ "epoch": 2.357541899441341,
1500
+ "grad_norm": 0.5912306904792786,
1501
+ "learning_rate": 0.0008104778673769142,
1502
+ "loss": 2.7314,
1503
+ "step": 5250
1504
+ },
1505
+ {
1506
+ "epoch": 2.368771230453946,
1507
+ "grad_norm": 0.5841456651687622,
1508
+ "learning_rate": 0.0008087266277273799,
1509
+ "loss": 2.6645,
1510
+ "step": 5275
1511
+ },
1512
+ {
1513
+ "epoch": 2.3800005614665505,
1514
+ "grad_norm": 0.6491414904594421,
1515
+ "learning_rate": 0.0008069692457488749,
1516
+ "loss": 2.7115,
1517
+ "step": 5300
1518
+ },
1519
+ {
1520
+ "epoch": 2.3912298924791555,
1521
+ "grad_norm": 0.6534895896911621,
1522
+ "learning_rate": 0.0008052057564057244,
1523
+ "loss": 2.7057,
1524
+ "step": 5325
1525
+ },
1526
+ {
1527
+ "epoch": 2.4024592234917606,
1528
+ "grad_norm": 0.5900655388832092,
1529
+ "learning_rate": 0.000803436194783764,
1530
+ "loss": 2.7302,
1531
+ "step": 5350
1532
+ },
1533
+ {
1534
+ "epoch": 2.4136885545043656,
1535
+ "grad_norm": 0.5586231350898743,
1536
+ "learning_rate": 0.0008016605960896412,
1537
+ "loss": 2.7339,
1538
+ "step": 5375
1539
+ },
1540
+ {
1541
+ "epoch": 2.42491788551697,
1542
+ "grad_norm": 0.705515444278717,
1543
+ "learning_rate": 0.0007998789956501159,
1544
+ "loss": 2.7323,
1545
+ "step": 5400
1546
+ },
1547
+ {
1548
+ "epoch": 2.436147216529575,
1549
+ "grad_norm": 0.5936200022697449,
1550
+ "learning_rate": 0.0007980914289113558,
1551
+ "loss": 2.7116,
1552
+ "step": 5425
1553
+ },
1554
+ {
1555
+ "epoch": 2.4473765475421803,
1556
+ "grad_norm": 0.6085701584815979,
1557
+ "learning_rate": 0.000796297931438233,
1558
+ "loss": 2.7406,
1559
+ "step": 5450
1560
+ },
1561
+ {
1562
+ "epoch": 2.458605878554785,
1563
+ "grad_norm": 0.5549573302268982,
1564
+ "learning_rate": 0.0007944985389136157,
1565
+ "loss": 2.7408,
1566
+ "step": 5475
1567
+ },
1568
+ {
1569
+ "epoch": 2.46983520956739,
1570
+ "grad_norm": 0.5694999694824219,
1571
+ "learning_rate": 0.0007926932871376575,
1572
+ "loss": 2.7216,
1573
+ "step": 5500
1574
+ },
1575
+ {
1576
+ "epoch": 2.481064540579995,
1577
+ "grad_norm": 0.5795106887817383,
1578
+ "learning_rate": 0.0007908822120270867,
1579
+ "loss": 2.6724,
1580
+ "step": 5525
1581
+ },
1582
+ {
1583
+ "epoch": 2.4922938715926,
1584
+ "grad_norm": 0.5619019865989685,
1585
+ "learning_rate": 0.0007890653496144902,
1586
+ "loss": 2.6867,
1587
+ "step": 5550
1588
+ },
1589
+ {
1590
+ "epoch": 2.503523202605205,
1591
+ "grad_norm": 0.5836601257324219,
1592
+ "learning_rate": 0.0007872427360475974,
1593
+ "loss": 2.7091,
1594
+ "step": 5575
1595
+ },
1596
+ {
1597
+ "epoch": 2.5147525336178096,
1598
+ "grad_norm": 0.6521953344345093,
1599
+ "learning_rate": 0.0007854144075885614,
1600
+ "loss": 2.7138,
1601
+ "step": 5600
1602
+ },
1603
+ {
1604
+ "epoch": 2.5259818646304146,
1605
+ "grad_norm": 0.6129563450813293,
1606
+ "learning_rate": 0.0007835804006132364,
1607
+ "loss": 2.6796,
1608
+ "step": 5625
1609
+ },
1610
+ {
1611
+ "epoch": 2.5372111956430197,
1612
+ "grad_norm": 0.5933245420455933,
1613
+ "learning_rate": 0.0007817407516104547,
1614
+ "loss": 2.6541,
1615
+ "step": 5650
1616
+ },
1617
+ {
1618
+ "epoch": 2.5484405266556243,
1619
+ "grad_norm": 0.5935245156288147,
1620
+ "learning_rate": 0.0007798954971813009,
1621
+ "loss": 2.6849,
1622
+ "step": 5675
1623
+ },
1624
+ {
1625
+ "epoch": 2.5596698576682293,
1626
+ "grad_norm": 0.7134594321250916,
1627
+ "learning_rate": 0.0007780446740383829,
1628
+ "loss": 2.7141,
1629
+ "step": 5700
1630
+ },
1631
+ {
1632
+ "epoch": 2.5708991886808343,
1633
+ "grad_norm": 0.6013050675392151,
1634
+ "learning_rate": 0.0007761883190051029,
1635
+ "loss": 2.7276,
1636
+ "step": 5725
1637
+ },
1638
+ {
1639
+ "epoch": 2.5821285196934394,
1640
+ "grad_norm": 0.6081655025482178,
1641
+ "learning_rate": 0.000774326469014923,
1642
+ "loss": 2.7205,
1643
+ "step": 5750
1644
+ },
1645
+ {
1646
+ "epoch": 2.5933578507060444,
1647
+ "grad_norm": 0.6464730501174927,
1648
+ "learning_rate": 0.0007724591611106315,
1649
+ "loss": 2.6872,
1650
+ "step": 5775
1651
+ },
1652
+ {
1653
+ "epoch": 2.604587181718649,
1654
+ "grad_norm": 0.578700840473175,
1655
+ "learning_rate": 0.0007705864324436059,
1656
+ "loss": 2.7152,
1657
+ "step": 5800
1658
+ },
1659
+ {
1660
+ "epoch": 2.615816512731254,
1661
+ "grad_norm": 0.5782270431518555,
1662
+ "learning_rate": 0.000768708320273073,
1663
+ "loss": 2.7233,
1664
+ "step": 5825
1665
+ },
1666
+ {
1667
+ "epoch": 2.627045843743859,
1668
+ "grad_norm": 0.5789017081260681,
1669
+ "learning_rate": 0.000766824861965369,
1670
+ "loss": 2.7474,
1671
+ "step": 5850
1672
+ },
1673
+ {
1674
+ "epoch": 2.6382751747564637,
1675
+ "grad_norm": 0.6152642369270325,
1676
+ "learning_rate": 0.0007649360949931941,
1677
+ "loss": 2.7071,
1678
+ "step": 5875
1679
+ },
1680
+ {
1681
+ "epoch": 2.6495045057690687,
1682
+ "grad_norm": 0.6417413353919983,
1683
+ "learning_rate": 0.0007630420569348688,
1684
+ "loss": 2.694,
1685
+ "step": 5900
1686
+ },
1687
+ {
1688
+ "epoch": 2.6607338367816737,
1689
+ "grad_norm": 0.5956742167472839,
1690
+ "learning_rate": 0.0007611427854735855,
1691
+ "loss": 2.7318,
1692
+ "step": 5925
1693
+ },
1694
+ {
1695
+ "epoch": 2.671963167794279,
1696
+ "grad_norm": 0.6362649202346802,
1697
+ "learning_rate": 0.0007592383183966581,
1698
+ "loss": 2.6966,
1699
+ "step": 5950
1700
+ },
1701
+ {
1702
+ "epoch": 2.683192498806884,
1703
+ "grad_norm": 0.5551230311393738,
1704
+ "learning_rate": 0.0007573286935947715,
1705
+ "loss": 2.6876,
1706
+ "step": 5975
1707
+ },
1708
+ {
1709
+ "epoch": 2.6944218298194884,
1710
+ "grad_norm": 0.6288260817527771,
1711
+ "learning_rate": 0.0007554139490612269,
1712
+ "loss": 2.7336,
1713
+ "step": 6000
1714
+ },
1715
+ {
1716
+ "epoch": 2.7056511608320934,
1717
+ "grad_norm": 0.5820605158805847,
1718
+ "learning_rate": 0.0007534941228911856,
1719
+ "loss": 2.683,
1720
+ "step": 6025
1721
+ },
1722
+ {
1723
+ "epoch": 2.7168804918446985,
1724
+ "grad_norm": 0.6264435648918152,
1725
+ "learning_rate": 0.0007515692532809126,
1726
+ "loss": 2.7461,
1727
+ "step": 6050
1728
+ },
1729
+ {
1730
+ "epoch": 2.728109822857303,
1731
+ "grad_norm": 0.6948567628860474,
1732
+ "learning_rate": 0.0007496393785270148,
1733
+ "loss": 2.7297,
1734
+ "step": 6075
1735
+ },
1736
+ {
1737
+ "epoch": 2.739339153869908,
1738
+ "grad_norm": 0.5976940393447876,
1739
+ "learning_rate": 0.0007477045370256802,
1740
+ "loss": 2.7419,
1741
+ "step": 6100
1742
+ },
1743
+ {
1744
+ "epoch": 2.750568484882513,
1745
+ "grad_norm": 0.5582289099693298,
1746
+ "learning_rate": 0.0007457647672719133,
1747
+ "loss": 2.7238,
1748
+ "step": 6125
1749
+ },
1750
+ {
1751
+ "epoch": 2.761797815895118,
1752
+ "grad_norm": 0.6097133755683899,
1753
+ "learning_rate": 0.00074389798763174,
1754
+ "loss": 2.7345,
1755
+ "step": 6150
1756
+ },
1757
+ {
1758
+ "epoch": 2.7730271469077232,
1759
+ "grad_norm": 0.576604425907135,
1760
+ "learning_rate": 0.0007419486705442532,
1761
+ "loss": 2.7075,
1762
+ "step": 6175
1763
+ },
1764
+ {
1765
+ "epoch": 2.784256477920328,
1766
+ "grad_norm": 0.6071319580078125,
1767
+ "learning_rate": 0.0007399945397212636,
1768
+ "loss": 2.7122,
1769
+ "step": 6200
1770
+ },
1771
+ {
1772
+ "epoch": 2.795485808932933,
1773
+ "grad_norm": 0.6164671182632446,
1774
+ "learning_rate": 0.0007380356340415503,
1775
+ "loss": 2.698,
1776
+ "step": 6225
1777
+ },
1778
+ {
1779
+ "epoch": 2.806715139945538,
1780
+ "grad_norm": 0.5970659255981445,
1781
+ "learning_rate": 0.0007360719924788919,
1782
+ "loss": 2.7429,
1783
+ "step": 6250
1784
+ },
1785
+ {
1786
+ "epoch": 2.8179444709581425,
1787
+ "grad_norm": 0.5380481481552124,
1788
+ "learning_rate": 0.0007341036541012898,
1789
+ "loss": 2.6655,
1790
+ "step": 6275
1791
+ },
1792
+ {
1793
+ "epoch": 2.8291738019707475,
1794
+ "grad_norm": 0.5859966278076172,
1795
+ "learning_rate": 0.0007321306580701923,
1796
+ "loss": 2.7115,
1797
+ "step": 6300
1798
+ },
1799
+ {
1800
+ "epoch": 2.8404031329833526,
1801
+ "grad_norm": 0.5644718408584595,
1802
+ "learning_rate": 0.0007301530436397148,
1803
+ "loss": 2.6945,
1804
+ "step": 6325
1805
+ },
1806
+ {
1807
+ "epoch": 2.8516324639959576,
1808
+ "grad_norm": 0.5971605777740479,
1809
+ "learning_rate": 0.0007281708501558591,
1810
+ "loss": 2.7082,
1811
+ "step": 6350
1812
+ },
1813
+ {
1814
+ "epoch": 2.8628617950085626,
1815
+ "grad_norm": 0.6274561882019043,
1816
+ "learning_rate": 0.0007261841170557303,
1817
+ "loss": 2.7207,
1818
+ "step": 6375
1819
+ },
1820
+ {
1821
+ "epoch": 2.8740911260211672,
1822
+ "grad_norm": 0.611348032951355,
1823
+ "learning_rate": 0.0007241928838667522,
1824
+ "loss": 2.7155,
1825
+ "step": 6400
1826
+ },
1827
+ {
1828
+ "epoch": 2.8853204570337723,
1829
+ "grad_norm": 0.5650250911712646,
1830
+ "learning_rate": 0.000722197190205881,
1831
+ "loss": 2.7063,
1832
+ "step": 6425
1833
+ },
1834
+ {
1835
+ "epoch": 2.896549788046377,
1836
+ "grad_norm": 0.6043295860290527,
1837
+ "learning_rate": 0.0007201970757788173,
1838
+ "loss": 2.6909,
1839
+ "step": 6450
1840
+ },
1841
+ {
1842
+ "epoch": 2.907779119058982,
1843
+ "grad_norm": 0.582062304019928,
1844
+ "learning_rate": 0.0007181925803792153,
1845
+ "loss": 2.7262,
1846
+ "step": 6475
1847
+ },
1848
+ {
1849
+ "epoch": 2.919008450071587,
1850
+ "grad_norm": 0.6272075772285461,
1851
+ "learning_rate": 0.0007161837438878926,
1852
+ "loss": 2.7224,
1853
+ "step": 6500
1854
+ },
1855
+ {
1856
+ "epoch": 2.930237781084192,
1857
+ "grad_norm": 0.6399256587028503,
1858
+ "learning_rate": 0.0007141706062720349,
1859
+ "loss": 2.7202,
1860
+ "step": 6525
1861
+ },
1862
+ {
1863
+ "epoch": 2.941467112096797,
1864
+ "grad_norm": 0.637417197227478,
1865
+ "learning_rate": 0.0007121532075844023,
1866
+ "loss": 2.6624,
1867
+ "step": 6550
1868
+ },
1869
+ {
1870
+ "epoch": 2.9526964431094016,
1871
+ "grad_norm": 0.7301665544509888,
1872
+ "learning_rate": 0.0007101315879625315,
1873
+ "loss": 2.7103,
1874
+ "step": 6575
1875
+ },
1876
+ {
1877
+ "epoch": 2.9639257741220066,
1878
+ "grad_norm": 0.6282750964164734,
1879
+ "learning_rate": 0.000708105787627938,
1880
+ "loss": 2.6985,
1881
+ "step": 6600
1882
+ },
1883
+ {
1884
+ "epoch": 2.9751551051346117,
1885
+ "grad_norm": 0.7559499144554138,
1886
+ "learning_rate": 0.0007060758468853153,
1887
+ "loss": 2.6989,
1888
+ "step": 6625
1889
+ },
1890
+ {
1891
+ "epoch": 2.9863844361472163,
1892
+ "grad_norm": 0.6338511109352112,
1893
+ "learning_rate": 0.0007040418061217324,
1894
+ "loss": 2.7278,
1895
+ "step": 6650
1896
+ },
1897
+ {
1898
+ "epoch": 2.9976137671598213,
1899
+ "grad_norm": 0.6456329226493835,
1900
+ "learning_rate": 0.0007020037058058326,
1901
+ "loss": 2.6851,
1902
+ "step": 6675
1903
+ },
1904
+ {
1905
+ "epoch": 3.0,
1906
+ "eval_loss": 3.168698787689209,
1907
+ "eval_runtime": 228.8127,
1908
+ "eval_samples_per_second": 54.997,
1909
+ "eval_steps_per_second": 54.997,
1910
+ "step": 6681
1911
+ }
1912
+ ],
1913
+ "logging_steps": 25,
1914
+ "max_steps": 17808,
1915
+ "num_input_tokens_seen": 0,
1916
+ "num_train_epochs": 8,
1917
+ "save_steps": 500,
1918
+ "stateful_callbacks": {
1919
+ "TrainerControl": {
1920
+ "args": {
1921
+ "should_epoch_stop": false,
1922
+ "should_evaluate": false,
1923
+ "should_log": false,
1924
+ "should_save": true,
1925
+ "should_training_stop": false
1926
+ },
1927
+ "attributes": {}
1928
+ }
1929
+ },
1930
+ "total_flos": 1.3044200408064e+16,
1931
+ "train_batch_size": 1,
1932
+ "trial_name": null,
1933
+ "trial_params": null
1934
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:489f044b9392a9ee28314cc59ac7352ed56662a402d578ab9e5c76a9b1cca731
3
+ size 5713
vocab.json ADDED
The diff for this file is too large to render. See raw diff