Tej3 committed · verified
Commit 88ce59e · 1 Parent(s): 36d019f

Initial commit

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
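
The added pattern routes tokenizer.json through Git LFS alongside the existing archive and TensorBoard patterns, so the repository stores a small pointer file instead of the ~11 MB tokenizer. A minimal sketch of the pattern matching, using Python's fnmatch as an approximation of gitattributes glob semantics (only the pattern list is taken from the hunk; the rest is illustrative):

```python
# Approximate check of which .gitattributes patterns send a path to LFS.
# fnmatch is a stand-in for gitattributes globbing, not an exact replica.
from fnmatch import fnmatch

lfs_patterns = ["*.zip", "*.zst", "*tfevents*", "tokenizer.json"]

def is_lfs_tracked(path: str) -> bool:
    name = path.rsplit("/", 1)[-1]  # patterns without "/" match the basename
    return any(fnmatch(name, pat) for pat in lfs_patterns)

print(is_lfs_tracked("tokenizer.json"))  # True -> stored as an LFS pointer
print(is_lfs_tracked("config.json"))     # False -> committed as plain text
```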
added_tokens.json ADDED
@@ -0,0 +1,27 @@
+ {
+ "<+>": 151665,
+ "<->": 151666,
+ "</tool_call>": 151658,
+ "<extra>": 151667,
+ "<tool_call>": 151657,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
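
IDs 151643-151664 are the standard Qwen2.5 special tokens; `<+>`, `<->`, and `<extra>` (151665-151667) are the tokens newly added for this checkpoint. A hedged sketch to confirm they resolve to the recorded IDs once the repo is downloaded ("path/to/checkpoint" is a placeholder):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/checkpoint")
for t in ["<+>", "<->", "<extra>"]:
    # Expected per added_tokens.json: 151665, 151666, 151667
    print(t, tok.convert_tokens_to_ids(t))
```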
config.json ADDED
@@ -0,0 +1,28 @@
+ {
+ "architectures": [
+ "Qwen2ForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "eos_token_id": 151645,
+ "hidden_act": "silu",
+ "hidden_size": 3584,
+ "initializer_range": 0.02,
+ "intermediate_size": 18944,
+ "max_position_embeddings": 4096,
+ "max_window_layers": 28,
+ "model_type": "qwen2",
+ "num_attention_heads": 28,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 4,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 10000.0,
+ "sliding_window": 4096,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.51.3",
+ "use_cache": false,
+ "use_sliding_window": false,
+ "vocab_size": 152064
+ }
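
The config describes a 28-layer Qwen2 model with grouped-query attention (28 query heads sharing 4 KV heads). The derived dimensions follow directly from the values above; the sketch below just loads the config and computes them ("path/to/checkpoint" is a placeholder):

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("path/to/checkpoint")
head_dim = cfg.hidden_size // cfg.num_attention_heads           # 3584 // 28 = 128
kv_groups = cfg.num_attention_heads // cfg.num_key_value_heads  # 28 // 4 = 7
print(head_dim, kv_groups)
```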
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "bos_token_id": 151643,
+ "eos_token_id": [
+ 151645,
+ 151643
+ ],
+ "pad_token_id": 151643,
+ "transformers_version": "4.51.3"
+ }
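
Generation stops on either `<|im_end|>` (151645) or `<|endoftext|>` (151643), and padding falls back to `<|endoftext|>`. A minimal sketch constructing the same object in code:

```python
from transformers import GenerationConfig

gen = GenerationConfig(
    bos_token_id=151643,
    eos_token_id=[151645, 151643],  # either token terminates generation
    pad_token_id=151643,
)
print(gen.eos_token_id)  # [151645, 151643]
```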
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:11fb2e3fc2360cf6eb364ef5495e7851d42efcd21cffeb69016d332a9c693e63
+ size 4877660776
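
Each shard is committed as a Git LFS pointer: three lines giving the spec version, the SHA-256 of the real payload, and its byte size. A small parser sketch for this pointer format (the pointer text is copied from the hunk above):

```python
def parse_lfs_pointer(text: str) -> dict:
    # The pointer format is "key value" per line: version, oid, size.
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    algo, digest = fields["oid"].split(":", 1)
    return {"version": fields["version"], "algo": algo,
            "digest": digest, "size": int(fields["size"])}

ptr = """version https://git-lfs.github.com/spec/v1
oid sha256:11fb2e3fc2360cf6eb364ef5495e7851d42efcd21cffeb69016d332a9c693e63
size 4877660776"""
print(parse_lfs_pointer(ptr)["size"])  # 4877660776 bytes (~4.9 GB)
```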
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cc7f47e05608a50d710c7ada2b03cd5262f53d5d5674446afc4790ac55d571b
+ size 4932751008
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f4933b87f75a556b8ddcff21f69199cf9fb81970c8a667d50ba309b4e54bc8e
+ size 4330865200
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8d8aea02848c864d55c11286523912c816cdf7823a42ec409f33c26387447fb
+ size 1089994880
model.safetensors.index.json ADDED
@@ -0,0 +1,346 @@
+ {
+ "metadata": {
+ "total_size": 15231233024
+ },
+ "weight_map": {
+ "lm_head.weight": "model-00004-of-00004.safetensors",
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
+ "model.norm.weight": "model-00003-of-00004.safetensors"
+ }
+ }
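
The index maps every tensor name to the shard that holds it, which is how sharded loaders open only the files they need; metadata.total_size counts tensor bytes (the shard files themselves are slightly larger since each carries its own safetensors header). A lookup sketch:

```python
import json

with open("model.safetensors.index.json") as f:
    index = json.load(f)

print(index["metadata"]["total_size"])           # 15231233024
print(index["weight_map"]["lm_head.weight"])     # model-00004-of-00004.safetensors
print(index["weight_map"]["model.norm.weight"])  # model-00003-of-00004.safetensors
```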
special_tokens_map.json ADDED
@@ -0,0 +1,34 @@
+ {
+ "additional_special_tokens": [
+ "<|video_pad|>",
+ "<|box_end|>",
+ "<+>",
+ "<|object_ref_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<->",
+ "<extra>",
+ "<|image_pad|>",
+ "<|object_ref_start|>",
+ "<|box_start|>",
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|quad_start|>",
+ "<|vision_pad|>",
+ "<|quad_end|>"
+ ],
+ "eos_token": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
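
Note that EOS and pad are different tokens here: `<|im_end|>` ends a turn while `<|endoftext|>` is reused for padding. A quick check after loading ("path/to/checkpoint" is a placeholder):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/checkpoint")
print(tok.eos_token, tok.eos_token_id)  # <|im_end|> 151645
print(tok.pad_token, tok.pad_token_id)  # <|endoftext|> 151643
```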
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:291d05ad00455fef901f3c9cdbd95f896c591cdff089f3189f95d3171634563f
+ size 11422540
tokenizer_config.json ADDED
@@ -0,0 +1,235 @@
+ {
+ "add_bos_token": false,
+ "add_prefix_space": false,
+ "added_tokens_decoder": {
+ "151643": {
+ "content": "<|endoftext|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151644": {
+ "content": "<|im_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151645": {
+ "content": "<|im_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151646": {
+ "content": "<|object_ref_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151647": {
+ "content": "<|object_ref_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151648": {
+ "content": "<|box_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151649": {
+ "content": "<|box_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151650": {
+ "content": "<|quad_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151651": {
+ "content": "<|quad_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151652": {
+ "content": "<|vision_start|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151653": {
+ "content": "<|vision_end|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151654": {
+ "content": "<|vision_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151655": {
+ "content": "<|image_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151656": {
+ "content": "<|video_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151657": {
+ "content": "<tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151658": {
+ "content": "</tool_call>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151659": {
+ "content": "<|fim_prefix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151660": {
+ "content": "<|fim_middle|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151661": {
+ "content": "<|fim_suffix|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151662": {
+ "content": "<|fim_pad|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151663": {
+ "content": "<|repo_name|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151664": {
+ "content": "<|file_sep|>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": false
+ },
+ "151665": {
+ "content": "<+>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151666": {
+ "content": "<->",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "151667": {
+ "content": "<extra>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "additional_special_tokens": [
+ "<|video_pad|>",
+ "<|box_end|>",
+ "<+>",
+ "<|object_ref_end|>",
+ "<|vision_start|>",
+ "<|vision_end|>",
+ "<->",
+ "<extra>",
+ "<|image_pad|>",
+ "<|object_ref_start|>",
+ "<|box_start|>",
+ "<|im_start|>",
+ "<|im_end|>",
+ "<|quad_start|>",
+ "<|vision_pad|>",
+ "<|quad_end|>"
+ ],
+ "bos_token": null,
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nPlease reason step by step, and put your final answer within \\\\boxed{}.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "<|im_end|>",
+ "errors": "replace",
+ "extra_special_tokens": {},
+ "model_max_length": 32000,
+ "pad_token": "<|endoftext|>",
+ "split_special_tokens": false,
+ "tokenizer_class": "Qwen2Tokenizer",
+ "unk_token": null
+ }
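
The chat_template is ChatML-style with tool-calling support; when no system message is supplied it injects the math-reasoning prompt "Please reason step by step, and put your final answer within \boxed{}." A rendering sketch ("path/to/checkpoint" is a placeholder):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/checkpoint")
messages = [{"role": "user", "content": "What is 2 + 2?"}]
print(tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
# <|im_start|>system
# Please reason step by step, and put your final answer within \boxed{}.<|im_end|>
# <|im_start|>user
# What is 2 + 2?<|im_end|>
# <|im_start|>assistant
```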
trainer_state.json ADDED
@@ -0,0 +1,2879 @@
+ {
+ "best_global_step": 4000,
+ "best_metric": 0.7443897795757539,
+ "best_model_checkpoint": "/shared/sutd/tej/Finegrained_PRM/models/Qwen2_5_Math_7b_instruct_more_data_run1/checkpoint-4000",
+ "epoch": 0.8958880139982502,
+ "eval_steps": 1000,
+ "global_step": 4000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0,
+ "eval_loss": 12.598268508911133,
+ "eval_runtime": 512.8513,
+ "eval_samples_per_second": 234.602,
+ "eval_steps_per_second": 14.663,
+ "eval_token_accuracy": 0.6931271686857003,
+ "step": 0
+ },
+ {
+ "epoch": 0.0022397200349956255,
+ "grad_norm": 74.01309967041016,
+ "learning_rate": 5.03919372900336e-08,
+ "loss": 12.6044,
+ "step": 10
+ },
+ {
+ "epoch": 0.004479440069991251,
+ "grad_norm": 69.36959838867188,
+ "learning_rate": 1.0638297872340426e-07,
+ "loss": 12.6075,
+ "step": 20
+ },
+ {
+ "epoch": 0.006719160104986877,
+ "grad_norm": 79.40869140625,
+ "learning_rate": 1.6237402015677493e-07,
+ "loss": 12.5963,
+ "step": 30
+ },
+ {
+ "epoch": 0.008958880139982502,
+ "grad_norm": 67.53589630126953,
+ "learning_rate": 2.1836506159014558e-07,
+ "loss": 12.5561,
+ "step": 40
+ },
+ {
+ "epoch": 0.011198600174978127,
+ "grad_norm": 77.55623626708984,
+ "learning_rate": 2.7435610302351626e-07,
+ "loss": 12.4879,
+ "step": 50
+ },
+ {
+ "epoch": 0.013438320209973754,
+ "grad_norm": 57.3841552734375,
+ "learning_rate": 3.303471444568869e-07,
+ "loss": 12.3546,
+ "step": 60
+ },
+ {
+ "epoch": 0.01567804024496938,
+ "grad_norm": 56.3227424621582,
+ "learning_rate": 3.863381858902576e-07,
+ "loss": 12.2106,
+ "step": 70
+ },
+ {
+ "epoch": 0.017917760279965004,
+ "grad_norm": 85.61465454101562,
+ "learning_rate": 4.423292273236283e-07,
+ "loss": 11.8673,
+ "step": 80
+ },
+ {
+ "epoch": 0.02015748031496063,
+ "grad_norm": 89.71321868896484,
+ "learning_rate": 4.98320268756999e-07,
+ "loss": 11.414,
+ "step": 90
+ },
+ {
+ "epoch": 0.022397200349956254,
+ "grad_norm": 133.00584411621094,
+ "learning_rate": 5.543113101903696e-07,
+ "loss": 10.6907,
+ "step": 100
+ },
+ {
+ "epoch": 0.024636920384951883,
+ "grad_norm": 126.6435775756836,
+ "learning_rate": 6.103023516237402e-07,
+ "loss": 9.8793,
+ "step": 110
+ },
+ {
+ "epoch": 0.026876640419947508,
+ "grad_norm": 129.69862365722656,
+ "learning_rate": 6.662933930571109e-07,
+ "loss": 9.3174,
+ "step": 120
+ },
+ {
+ "epoch": 0.029116360454943133,
+ "grad_norm": 139.8076629638672,
+ "learning_rate": 7.222844344904815e-07,
+ "loss": 8.8794,
+ "step": 130
+ },
+ {
+ "epoch": 0.03135608048993876,
+ "grad_norm": 139.58309936523438,
+ "learning_rate": 7.782754759238523e-07,
+ "loss": 8.5155,
+ "step": 140
+ },
+ {
+ "epoch": 0.03359580052493438,
+ "grad_norm": 161.12001037597656,
+ "learning_rate": 8.342665173572229e-07,
+ "loss": 8.1803,
+ "step": 150
+ },
+ {
+ "epoch": 0.03583552055993001,
+ "grad_norm": 145.7100830078125,
+ "learning_rate": 8.902575587905936e-07,
+ "loss": 7.8429,
+ "step": 160
+ },
+ {
+ "epoch": 0.03807524059492563,
+ "grad_norm": 141.91400146484375,
+ "learning_rate": 9.462486002239643e-07,
+ "loss": 7.5282,
+ "step": 170
+ },
+ {
+ "epoch": 0.04031496062992126,
+ "grad_norm": 160.6728973388672,
+ "learning_rate": 1.0022396416573349e-06,
+ "loss": 7.1987,
+ "step": 180
+ },
+ {
+ "epoch": 0.04255468066491688,
+ "grad_norm": 147.54306030273438,
+ "learning_rate": 1.0582306830907057e-06,
+ "loss": 6.8656,
+ "step": 190
+ },
+ {
+ "epoch": 0.04479440069991251,
+ "grad_norm": 170.87681579589844,
+ "learning_rate": 1.1142217245240761e-06,
+ "loss": 6.5252,
+ "step": 200
+ },
+ {
+ "epoch": 0.04703412073490813,
+ "grad_norm": 152.63320922851562,
+ "learning_rate": 1.170212765957447e-06,
+ "loss": 6.179,
+ "step": 210
+ },
+ {
+ "epoch": 0.049273840769903765,
+ "grad_norm": 149.25778198242188,
+ "learning_rate": 1.2262038073908176e-06,
+ "loss": 5.8261,
+ "step": 220
+ },
+ {
+ "epoch": 0.05151356080489939,
+ "grad_norm": 162.14759826660156,
+ "learning_rate": 1.2821948488241882e-06,
+ "loss": 5.4615,
+ "step": 230
+ },
+ {
+ "epoch": 0.053753280839895015,
+ "grad_norm": 150.42588806152344,
+ "learning_rate": 1.338185890257559e-06,
+ "loss": 5.0995,
+ "step": 240
+ },
+ {
+ "epoch": 0.05599300087489064,
+ "grad_norm": 180.09596252441406,
+ "learning_rate": 1.3941769316909296e-06,
+ "loss": 4.7214,
+ "step": 250
+ },
+ {
+ "epoch": 0.058232720909886265,
+ "grad_norm": 151.71328735351562,
+ "learning_rate": 1.4501679731243e-06,
+ "loss": 4.3411,
+ "step": 260
+ },
+ {
+ "epoch": 0.06047244094488189,
+ "grad_norm": 149.18179321289062,
+ "learning_rate": 1.506159014557671e-06,
+ "loss": 3.9648,
+ "step": 270
+ },
+ {
+ "epoch": 0.06271216097987752,
+ "grad_norm": 158.7989959716797,
+ "learning_rate": 1.5621500559910415e-06,
+ "loss": 3.5754,
+ "step": 280
+ },
+ {
+ "epoch": 0.06495188101487315,
+ "grad_norm": 142.4947509765625,
+ "learning_rate": 1.6181410974244121e-06,
+ "loss": 3.198,
+ "step": 290
+ },
+ {
+ "epoch": 0.06719160104986877,
+ "grad_norm": 172.62350463867188,
+ "learning_rate": 1.674132138857783e-06,
+ "loss": 2.8298,
+ "step": 300
+ },
+ {
+ "epoch": 0.0694313210848644,
+ "grad_norm": 133.85870361328125,
+ "learning_rate": 1.7301231802911536e-06,
+ "loss": 2.471,
+ "step": 310
+ },
+ {
+ "epoch": 0.07167104111986002,
+ "grad_norm": 118.70149230957031,
+ "learning_rate": 1.7861142217245242e-06,
+ "loss": 2.1336,
+ "step": 320
+ },
+ {
+ "epoch": 0.07391076115485565,
+ "grad_norm": 116.01596069335938,
+ "learning_rate": 1.8421052631578948e-06,
+ "loss": 1.8315,
+ "step": 330
+ },
+ {
+ "epoch": 0.07615048118985127,
+ "grad_norm": 91.01036071777344,
+ "learning_rate": 1.8980963045912657e-06,
+ "loss": 1.5717,
+ "step": 340
+ },
+ {
+ "epoch": 0.0783902012248469,
+ "grad_norm": 118.93321228027344,
+ "learning_rate": 1.954087346024636e-06,
+ "loss": 1.3266,
+ "step": 350
+ },
+ {
+ "epoch": 0.08062992125984252,
+ "grad_norm": 67.08316040039062,
+ "learning_rate": 2.010078387458007e-06,
+ "loss": 1.1372,
+ "step": 360
+ },
+ {
+ "epoch": 0.08286964129483815,
+ "grad_norm": 42.60381317138672,
+ "learning_rate": 2.0660694288913777e-06,
+ "loss": 0.9771,
+ "step": 370
+ },
+ {
+ "epoch": 0.08510936132983377,
+ "grad_norm": 46.03950500488281,
+ "learning_rate": 2.122060470324748e-06,
+ "loss": 0.8681,
+ "step": 380
+ },
+ {
+ "epoch": 0.0873490813648294,
+ "grad_norm": 23.767711639404297,
+ "learning_rate": 2.178051511758119e-06,
+ "loss": 0.7751,
+ "step": 390
+ },
+ {
+ "epoch": 0.08958880139982502,
+ "grad_norm": 45.41588592529297,
+ "learning_rate": 2.2340425531914894e-06,
+ "loss": 0.6874,
+ "step": 400
+ },
+ {
+ "epoch": 0.09182852143482065,
+ "grad_norm": 18.76463508605957,
+ "learning_rate": 2.2900335946248602e-06,
+ "loss": 0.6544,
+ "step": 410
+ },
+ {
+ "epoch": 0.09406824146981627,
+ "grad_norm": 21.69906234741211,
+ "learning_rate": 2.346024636058231e-06,
+ "loss": 0.5834,
+ "step": 420
+ },
+ {
+ "epoch": 0.0963079615048119,
+ "grad_norm": 29.034330368041992,
+ "learning_rate": 2.4020156774916015e-06,
+ "loss": 0.5398,
+ "step": 430
+ },
+ {
+ "epoch": 0.09854768153980753,
+ "grad_norm": 12.911064147949219,
+ "learning_rate": 2.4580067189249723e-06,
+ "loss": 0.4761,
+ "step": 440
+ },
+ {
+ "epoch": 0.10078740157480315,
+ "grad_norm": 32.778602600097656,
+ "learning_rate": 2.5139977603583427e-06,
+ "loss": 0.4394,
+ "step": 450
+ },
+ {
+ "epoch": 0.10302712160979878,
+ "grad_norm": 48.48755645751953,
+ "learning_rate": 2.5699888017917135e-06,
+ "loss": 0.4834,
+ "step": 460
+ },
+ {
+ "epoch": 0.1052668416447944,
+ "grad_norm": 24.921396255493164,
+ "learning_rate": 2.6259798432250844e-06,
+ "loss": 0.4231,
+ "step": 470
+ },
+ {
+ "epoch": 0.10750656167979003,
+ "grad_norm": 7.630334377288818,
+ "learning_rate": 2.6819708846584548e-06,
+ "loss": 0.3715,
+ "step": 480
+ },
+ {
+ "epoch": 0.10974628171478565,
+ "grad_norm": 36.03708267211914,
+ "learning_rate": 2.7379619260918256e-06,
+ "loss": 0.37,
+ "step": 490
+ },
+ {
+ "epoch": 0.11198600174978128,
+ "grad_norm": 33.10087585449219,
+ "learning_rate": 2.7939529675251964e-06,
+ "loss": 0.3522,
+ "step": 500
+ },
+ {
+ "epoch": 0.1142257217847769,
+ "grad_norm": 8.544207572937012,
+ "learning_rate": 2.849944008958567e-06,
+ "loss": 0.3339,
+ "step": 510
+ },
+ {
+ "epoch": 0.11646544181977253,
+ "grad_norm": 37.781951904296875,
+ "learning_rate": 2.9059350503919377e-06,
+ "loss": 0.325,
+ "step": 520
+ },
+ {
+ "epoch": 0.11870516185476815,
+ "grad_norm": 9.192522048950195,
+ "learning_rate": 2.9619260918253085e-06,
+ "loss": 0.3175,
+ "step": 530
+ },
+ {
+ "epoch": 0.12094488188976378,
+ "grad_norm": 35.02260208129883,
+ "learning_rate": 3.017917133258679e-06,
+ "loss": 0.3193,
+ "step": 540
+ },
+ {
+ "epoch": 0.1231846019247594,
+ "grad_norm": 4.580764293670654,
+ "learning_rate": 3.0739081746920498e-06,
+ "loss": 0.3122,
+ "step": 550
+ },
+ {
+ "epoch": 0.12542432195975503,
+ "grad_norm": 12.692420959472656,
+ "learning_rate": 3.1298992161254197e-06,
+ "loss": 0.3234,
+ "step": 560
+ },
+ {
+ "epoch": 0.12766404199475065,
+ "grad_norm": 5.639922618865967,
+ "learning_rate": 3.1858902575587906e-06,
+ "loss": 0.2774,
+ "step": 570
+ },
+ {
+ "epoch": 0.1299037620297463,
+ "grad_norm": 2.170525550842285,
+ "learning_rate": 3.241881298992162e-06,
+ "loss": 0.2828,
+ "step": 580
+ },
+ {
+ "epoch": 0.1321434820647419,
+ "grad_norm": 2.8537590503692627,
+ "learning_rate": 3.297872340425532e-06,
+ "loss": 0.2961,
+ "step": 590
+ },
+ {
+ "epoch": 0.13438320209973753,
+ "grad_norm": 4.110259532928467,
+ "learning_rate": 3.3538633818589027e-06,
+ "loss": 0.2664,
+ "step": 600
+ },
+ {
+ "epoch": 0.13662292213473315,
+ "grad_norm": 5.937915802001953,
+ "learning_rate": 3.4098544232922735e-06,
+ "loss": 0.3023,
+ "step": 610
+ },
+ {
+ "epoch": 0.1388626421697288,
+ "grad_norm": 14.195796012878418,
+ "learning_rate": 3.465845464725644e-06,
+ "loss": 0.2662,
+ "step": 620
+ },
+ {
+ "epoch": 0.1411023622047244,
+ "grad_norm": 8.165706634521484,
+ "learning_rate": 3.5218365061590147e-06,
+ "loss": 0.2909,
+ "step": 630
+ },
+ {
+ "epoch": 0.14334208223972003,
+ "grad_norm": 9.505974769592285,
+ "learning_rate": 3.5778275475923856e-06,
+ "loss": 0.277,
+ "step": 640
+ },
+ {
+ "epoch": 0.14558180227471565,
+ "grad_norm": 6.215830326080322,
+ "learning_rate": 3.633818589025756e-06,
+ "loss": 0.2649,
+ "step": 650
+ },
+ {
+ "epoch": 0.1478215223097113,
+ "grad_norm": 16.8066463470459,
+ "learning_rate": 3.689809630459127e-06,
+ "loss": 0.299,
+ "step": 660
+ },
+ {
+ "epoch": 0.1500612423447069,
+ "grad_norm": 8.53093147277832,
+ "learning_rate": 3.7458006718924976e-06,
+ "loss": 0.2673,
+ "step": 670
+ },
+ {
+ "epoch": 0.15230096237970253,
+ "grad_norm": 6.421195030212402,
+ "learning_rate": 3.801791713325868e-06,
+ "loss": 0.2792,
+ "step": 680
+ },
+ {
+ "epoch": 0.15454068241469815,
+ "grad_norm": 2.7667346000671387,
+ "learning_rate": 3.857782754759239e-06,
+ "loss": 0.2909,
+ "step": 690
+ },
+ {
+ "epoch": 0.1567804024496938,
+ "grad_norm": 10.2630033493042,
+ "learning_rate": 3.91377379619261e-06,
+ "loss": 0.2716,
509
+ "step": 700
510
+ },
511
+ {
512
+ "epoch": 0.1590201224846894,
513
+ "grad_norm": 1.535446286201477,
514
+ "learning_rate": 3.96976483762598e-06,
515
+ "loss": 0.2816,
516
+ "step": 710
517
+ },
518
+ {
519
+ "epoch": 0.16125984251968503,
520
+ "grad_norm": 9.585880279541016,
521
+ "learning_rate": 4.025755879059351e-06,
522
+ "loss": 0.2553,
523
+ "step": 720
524
+ },
525
+ {
526
+ "epoch": 0.16349956255468068,
527
+ "grad_norm": 14.28587818145752,
528
+ "learning_rate": 4.081746920492721e-06,
529
+ "loss": 0.2825,
530
+ "step": 730
531
+ },
532
+ {
533
+ "epoch": 0.1657392825896763,
534
+ "grad_norm": 4.546958923339844,
535
+ "learning_rate": 4.137737961926092e-06,
536
+ "loss": 0.2718,
537
+ "step": 740
538
+ },
539
+ {
540
+ "epoch": 0.1679790026246719,
541
+ "grad_norm": 2.867119312286377,
542
+ "learning_rate": 4.193729003359463e-06,
543
+ "loss": 0.2507,
544
+ "step": 750
545
+ },
546
+ {
547
+ "epoch": 0.17021872265966753,
548
+ "grad_norm": 14.170395851135254,
549
+ "learning_rate": 4.2497200447928334e-06,
550
+ "loss": 0.2677,
551
+ "step": 760
552
+ },
553
+ {
554
+ "epoch": 0.17245844269466318,
555
+ "grad_norm": 17.095518112182617,
556
+ "learning_rate": 4.305711086226204e-06,
557
+ "loss": 0.2506,
558
+ "step": 770
559
+ },
560
+ {
561
+ "epoch": 0.1746981627296588,
562
+ "grad_norm": 5.597836017608643,
563
+ "learning_rate": 4.361702127659575e-06,
564
+ "loss": 0.2871,
565
+ "step": 780
566
+ },
567
+ {
568
+ "epoch": 0.17693788276465441,
569
+ "grad_norm": 14.059051513671875,
570
+ "learning_rate": 4.417693169092945e-06,
571
+ "loss": 0.2731,
572
+ "step": 790
573
+ },
574
+ {
575
+ "epoch": 0.17917760279965003,
576
+ "grad_norm": 4.634145259857178,
577
+ "learning_rate": 4.473684210526316e-06,
578
+ "loss": 0.2461,
579
+ "step": 800
580
+ },
581
+ {
582
+ "epoch": 0.18141732283464568,
583
+ "grad_norm": 2.4694900512695312,
584
+ "learning_rate": 4.529675251959687e-06,
585
+ "loss": 0.2798,
586
+ "step": 810
587
+ },
588
+ {
589
+ "epoch": 0.1836570428696413,
590
+ "grad_norm": 2.7586958408355713,
591
+ "learning_rate": 4.585666293393058e-06,
592
+ "loss": 0.263,
593
+ "step": 820
594
+ },
595
+ {
596
+ "epoch": 0.18589676290463691,
597
+ "grad_norm": 1.317171335220337,
598
+ "learning_rate": 4.6416573348264284e-06,
599
+ "loss": 0.2602,
600
+ "step": 830
601
+ },
602
+ {
603
+ "epoch": 0.18813648293963253,
604
+ "grad_norm": 16.16120719909668,
605
+ "learning_rate": 4.697648376259799e-06,
606
+ "loss": 0.2567,
607
+ "step": 840
608
+ },
609
+ {
610
+ "epoch": 0.19037620297462818,
611
+ "grad_norm": 1.187530279159546,
612
+ "learning_rate": 4.753639417693169e-06,
613
+ "loss": 0.2715,
614
+ "step": 850
615
+ },
616
+ {
617
+ "epoch": 0.1926159230096238,
618
+ "grad_norm": 11.328052520751953,
619
+ "learning_rate": 4.80963045912654e-06,
620
+ "loss": 0.2785,
621
+ "step": 860
622
+ },
623
+ {
624
+ "epoch": 0.19485564304461941,
625
+ "grad_norm": 5.0461530685424805,
626
+ "learning_rate": 4.865621500559911e-06,
627
+ "loss": 0.2466,
628
+ "step": 870
629
+ },
630
+ {
631
+ "epoch": 0.19709536307961506,
632
+ "grad_norm": 2.635329246520996,
633
+ "learning_rate": 4.921612541993282e-06,
634
+ "loss": 0.2689,
635
+ "step": 880
636
+ },
637
+ {
638
+ "epoch": 0.19933508311461068,
639
+ "grad_norm": 18.956153869628906,
640
+ "learning_rate": 4.977603583426653e-06,
641
+ "loss": 0.2594,
642
+ "step": 890
643
+ },
644
+ {
645
+ "epoch": 0.2015748031496063,
646
+ "grad_norm": 1.4908218383789062,
647
+ "learning_rate": 5.033594624860023e-06,
648
+ "loss": 0.2559,
649
+ "step": 900
650
+ },
651
+ {
652
+ "epoch": 0.20381452318460191,
653
+ "grad_norm": 13.888111114501953,
654
+ "learning_rate": 5.089585666293394e-06,
655
+ "loss": 0.2859,
656
+ "step": 910
657
+ },
658
+ {
659
+ "epoch": 0.20605424321959756,
660
+ "grad_norm": 1.7197439670562744,
661
+ "learning_rate": 5.145576707726763e-06,
662
+ "loss": 0.2462,
663
+ "step": 920
664
+ },
665
+ {
666
+ "epoch": 0.20829396325459318,
667
+ "grad_norm": 1.1483347415924072,
668
+ "learning_rate": 5.201567749160134e-06,
669
+ "loss": 0.25,
670
+ "step": 930
671
+ },
672
+ {
673
+ "epoch": 0.2105336832895888,
674
+ "grad_norm": 10.579352378845215,
675
+ "learning_rate": 5.257558790593506e-06,
676
+ "loss": 0.2376,
677
+ "step": 940
678
+ },
679
+ {
680
+ "epoch": 0.21277340332458441,
681
+ "grad_norm": 2.291902542114258,
682
+ "learning_rate": 5.313549832026877e-06,
683
+ "loss": 0.2453,
684
+ "step": 950
685
+ },
686
+ {
687
+ "epoch": 0.21501312335958006,
688
+ "grad_norm": 5.439045429229736,
689
+ "learning_rate": 5.3695408734602476e-06,
690
+ "loss": 0.2984,
691
+ "step": 960
692
+ },
693
+ {
694
+ "epoch": 0.21725284339457568,
695
+ "grad_norm": 1.5583287477493286,
696
+ "learning_rate": 5.425531914893617e-06,
697
+ "loss": 0.256,
698
+ "step": 970
699
+ },
700
+ {
701
+ "epoch": 0.2194925634295713,
702
+ "grad_norm": 5.13300895690918,
703
+ "learning_rate": 5.4815229563269875e-06,
704
+ "loss": 0.259,
705
+ "step": 980
706
+ },
707
+ {
708
+ "epoch": 0.22173228346456694,
709
+ "grad_norm": 3.150655746459961,
710
+ "learning_rate": 5.537513997760358e-06,
711
+ "loss": 0.2447,
712
+ "step": 990
713
+ },
714
+ {
715
+ "epoch": 0.22397200349956256,
716
+ "grad_norm": 5.581788063049316,
717
+ "learning_rate": 5.593505039193729e-06,
718
+ "loss": 0.2388,
719
+ "step": 1000
720
+ },
721
+ {
722
+ "epoch": 0.22397200349956256,
723
+ "eval_loss": 0.24665296077728271,
724
+ "eval_runtime": 508.3924,
725
+ "eval_samples_per_second": 236.66,
726
+ "eval_steps_per_second": 14.792,
727
+ "eval_token_accuracy": 0.6965583269229705,
728
+ "step": 1000
729
+ },
730
+ {
731
+ "epoch": 0.22621172353455818,
732
+ "grad_norm": 6.410147666931152,
733
+ "learning_rate": 5.6494960806271e-06,
734
+ "loss": 0.2569,
735
+ "step": 1010
736
+ },
737
+ {
738
+ "epoch": 0.2284514435695538,
739
+ "grad_norm": 8.687148094177246,
740
+ "learning_rate": 5.705487122060471e-06,
741
+ "loss": 0.2499,
742
+ "step": 1020
743
+ },
744
+ {
745
+ "epoch": 0.23069116360454944,
746
+ "grad_norm": 6.87606954574585,
747
+ "learning_rate": 5.761478163493841e-06,
748
+ "loss": 0.2555,
749
+ "step": 1030
750
+ },
751
+ {
752
+ "epoch": 0.23293088363954506,
753
+ "grad_norm": 1.6779398918151855,
754
+ "learning_rate": 5.817469204927212e-06,
755
+ "loss": 0.2556,
756
+ "step": 1040
757
+ },
758
+ {
759
+ "epoch": 0.23517060367454068,
760
+ "grad_norm": 7.228851318359375,
761
+ "learning_rate": 5.8734602463605825e-06,
762
+ "loss": 0.245,
763
+ "step": 1050
764
+ },
765
+ {
766
+ "epoch": 0.2374103237095363,
767
+ "grad_norm": 5.364595890045166,
768
+ "learning_rate": 5.929451287793953e-06,
769
+ "loss": 0.2717,
770
+ "step": 1060
771
+ },
772
+ {
773
+ "epoch": 0.23965004374453194,
774
+ "grad_norm": 3.2734596729278564,
775
+ "learning_rate": 5.985442329227324e-06,
776
+ "loss": 0.2274,
777
+ "step": 1070
778
+ },
779
+ {
780
+ "epoch": 0.24188976377952756,
781
+ "grad_norm": 4.536466598510742,
782
+ "learning_rate": 6.041433370660694e-06,
783
+ "loss": 0.2561,
784
+ "step": 1080
785
+ },
786
+ {
787
+ "epoch": 0.24412948381452318,
788
+ "grad_norm": 7.9406280517578125,
789
+ "learning_rate": 6.097424412094065e-06,
790
+ "loss": 0.2428,
791
+ "step": 1090
792
+ },
793
+ {
794
+ "epoch": 0.2463692038495188,
795
+ "grad_norm": 6.413431644439697,
796
+ "learning_rate": 6.153415453527436e-06,
797
+ "loss": 0.2327,
798
+ "step": 1100
799
+ },
800
+ {
801
+ "epoch": 0.24860892388451444,
802
+ "grad_norm": 1.6045399904251099,
803
+ "learning_rate": 6.209406494960807e-06,
804
+ "loss": 0.2704,
805
+ "step": 1110
806
+ },
807
+ {
808
+ "epoch": 0.25084864391951006,
809
+ "grad_norm": 8.93922233581543,
810
+ "learning_rate": 6.2653975363941775e-06,
811
+ "loss": 0.2365,
812
+ "step": 1120
813
+ },
814
+ {
815
+ "epoch": 0.2530883639545057,
816
+ "grad_norm": 6.114246845245361,
817
+ "learning_rate": 6.321388577827548e-06,
818
+ "loss": 0.2476,
819
+ "step": 1130
820
+ },
821
+ {
822
+ "epoch": 0.2553280839895013,
823
+ "grad_norm": 4.84804105758667,
824
+ "learning_rate": 6.377379619260918e-06,
825
+ "loss": 0.2411,
826
+ "step": 1140
827
+ },
828
+ {
829
+ "epoch": 0.2575678040244969,
830
+ "grad_norm": 2.421581506729126,
831
+ "learning_rate": 6.433370660694289e-06,
832
+ "loss": 0.2251,
833
+ "step": 1150
834
+ },
835
+ {
836
+ "epoch": 0.2598075240594926,
837
+ "grad_norm": 13.80971622467041,
838
+ "learning_rate": 6.48936170212766e-06,
839
+ "loss": 0.2541,
840
+ "step": 1160
841
+ },
842
+ {
843
+ "epoch": 0.2620472440944882,
844
+ "grad_norm": 1.8289860486984253,
845
+ "learning_rate": 6.545352743561031e-06,
846
+ "loss": 0.2462,
847
+ "step": 1170
848
+ },
849
+ {
850
+ "epoch": 0.2642869641294838,
851
+ "grad_norm": 1.4232782125473022,
852
+ "learning_rate": 6.601343784994402e-06,
853
+ "loss": 0.2394,
854
+ "step": 1180
855
+ },
856
+ {
857
+ "epoch": 0.26652668416447944,
858
+ "grad_norm": 6.040958404541016,
859
+ "learning_rate": 6.6573348264277725e-06,
860
+ "loss": 0.2441,
861
+ "step": 1190
862
+ },
863
+ {
864
+ "epoch": 0.26876640419947506,
865
+ "grad_norm": 5.110551357269287,
866
+ "learning_rate": 6.7133258678611425e-06,
867
+ "loss": 0.2285,
868
+ "step": 1200
869
+ },
870
+ {
871
+ "epoch": 0.2710061242344707,
872
+ "grad_norm": 14.493913650512695,
873
+ "learning_rate": 6.769316909294513e-06,
874
+ "loss": 0.2545,
875
+ "step": 1210
876
+ },
877
+ {
878
+ "epoch": 0.2732458442694663,
879
+ "grad_norm": 6.708395481109619,
880
+ "learning_rate": 6.825307950727884e-06,
881
+ "loss": 0.2399,
882
+ "step": 1220
883
+ },
884
+ {
885
+ "epoch": 0.2754855643044619,
886
+ "grad_norm": 7.098331451416016,
887
+ "learning_rate": 6.881298992161255e-06,
888
+ "loss": 0.2529,
889
+ "step": 1230
890
+ },
891
+ {
892
+ "epoch": 0.2777252843394576,
893
+ "grad_norm": 7.667686462402344,
894
+ "learning_rate": 6.937290033594626e-06,
895
+ "loss": 0.2265,
896
+ "step": 1240
897
+ },
898
+ {
899
+ "epoch": 0.2799650043744532,
900
+ "grad_norm": 3.293351411819458,
901
+ "learning_rate": 6.993281075027996e-06,
902
+ "loss": 0.212,
903
+ "step": 1250
904
+ },
905
+ {
906
+ "epoch": 0.2822047244094488,
907
+ "grad_norm": 6.79879093170166,
908
+ "learning_rate": 7.049272116461367e-06,
909
+ "loss": 0.2496,
910
+ "step": 1260
911
+ },
912
+ {
913
+ "epoch": 0.28444444444444444,
914
+ "grad_norm": 4.2775092124938965,
915
+ "learning_rate": 7.1052631578947375e-06,
916
+ "loss": 0.2219,
917
+ "step": 1270
918
+ },
919
+ {
920
+ "epoch": 0.28668416447944006,
921
+ "grad_norm": 2.1673076152801514,
922
+ "learning_rate": 7.161254199328108e-06,
923
+ "loss": 0.2374,
924
+ "step": 1280
925
+ },
926
+ {
927
+ "epoch": 0.2889238845144357,
928
+ "grad_norm": 5.312763214111328,
929
+ "learning_rate": 7.217245240761479e-06,
930
+ "loss": 0.2256,
931
+ "step": 1290
932
+ },
933
+ {
934
+ "epoch": 0.2911636045494313,
935
+ "grad_norm": 4.370241165161133,
936
+ "learning_rate": 7.27323628219485e-06,
937
+ "loss": 0.2357,
938
+ "step": 1300
939
+ },
940
+ {
941
+ "epoch": 0.29340332458442697,
942
+ "grad_norm": 3.1714890003204346,
943
+ "learning_rate": 7.32922732362822e-06,
944
+ "loss": 0.2557,
945
+ "step": 1310
946
+ },
947
+ {
948
+ "epoch": 0.2956430446194226,
949
+ "grad_norm": 2.53678297996521,
950
+ "learning_rate": 7.385218365061591e-06,
951
+ "loss": 0.2331,
952
+ "step": 1320
953
+ },
954
+ {
955
+ "epoch": 0.2978827646544182,
956
+ "grad_norm": 3.696383476257324,
957
+ "learning_rate": 7.441209406494962e-06,
958
+ "loss": 0.232,
959
+ "step": 1330
960
+ },
961
+ {
962
+ "epoch": 0.3001224846894138,
963
+ "grad_norm": 4.8781232833862305,
964
+ "learning_rate": 7.4972004479283325e-06,
965
+ "loss": 0.2306,
966
+ "step": 1340
967
+ },
968
+ {
969
+ "epoch": 0.30236220472440944,
970
+ "grad_norm": 1.83755362033844,
971
+ "learning_rate": 7.553191489361703e-06,
972
+ "loss": 0.2337,
973
+ "step": 1350
974
+ },
975
+ {
976
+ "epoch": 0.30460192475940506,
977
+ "grad_norm": 11.356858253479004,
978
+ "learning_rate": 7.609182530795074e-06,
979
+ "loss": 0.2459,
980
+ "step": 1360
981
+ },
982
+ {
983
+ "epoch": 0.3068416447944007,
984
+ "grad_norm": 8.49129581451416,
985
+ "learning_rate": 7.665173572228444e-06,
986
+ "loss": 0.2293,
987
+ "step": 1370
988
+ },
989
+ {
990
+ "epoch": 0.3090813648293963,
991
+ "grad_norm": 2.2894465923309326,
992
+ "learning_rate": 7.721164613661814e-06,
993
+ "loss": 0.2383,
994
+ "step": 1380
995
+ },
996
+ {
997
+ "epoch": 0.311321084864392,
998
+ "grad_norm": 1.8182672262191772,
999
+ "learning_rate": 7.777155655095186e-06,
1000
+ "loss": 0.241,
1001
+ "step": 1390
1002
+ },
1003
+ {
1004
+ "epoch": 0.3135608048993876,
1005
+ "grad_norm": 3.236206293106079,
1006
+ "learning_rate": 7.833146696528556e-06,
1007
+ "loss": 0.2285,
1008
+ "step": 1400
1009
+ },
1010
+ {
1011
+ "epoch": 0.3158005249343832,
1012
+ "grad_norm": 1.631281852722168,
1013
+ "learning_rate": 7.889137737961927e-06,
1014
+ "loss": 0.2571,
1015
+ "step": 1410
1016
+ },
1017
+ {
1018
+ "epoch": 0.3180402449693788,
1019
+ "grad_norm": 8.107253074645996,
1020
+ "learning_rate": 7.945128779395297e-06,
1021
+ "loss": 0.2209,
1022
+ "step": 1420
1023
+ },
1024
+ {
1025
+ "epoch": 0.32027996500437445,
1026
+ "grad_norm": 1.1233727931976318,
1027
+ "learning_rate": 8.001119820828667e-06,
1028
+ "loss": 0.2427,
1029
+ "step": 1430
1030
+ },
1031
+ {
1032
+ "epoch": 0.32251968503937006,
1033
+ "grad_norm": 8.097697257995605,
1034
+ "learning_rate": 8.057110862262039e-06,
1035
+ "loss": 0.2452,
1036
+ "step": 1440
1037
+ },
1038
+ {
1039
+ "epoch": 0.3247594050743657,
1040
+ "grad_norm": 2.3983142375946045,
1041
+ "learning_rate": 8.113101903695409e-06,
1042
+ "loss": 0.2186,
1043
+ "step": 1450
1044
+ },
1045
+ {
1046
+ "epoch": 0.32699912510936135,
1047
+ "grad_norm": 12.040475845336914,
1048
+ "learning_rate": 8.16909294512878e-06,
1049
+ "loss": 0.2438,
1050
+ "step": 1460
1051
+ },
1052
+ {
1053
+ "epoch": 0.329238845144357,
1054
+ "grad_norm": 1.604379653930664,
1055
+ "learning_rate": 8.22508398656215e-06,
1056
+ "loss": 0.2395,
1057
+ "step": 1470
1058
+ },
1059
+ {
1060
+ "epoch": 0.3314785651793526,
1061
+ "grad_norm": 3.6802425384521484,
1062
+ "learning_rate": 8.28107502799552e-06,
1063
+ "loss": 0.2417,
1064
+ "step": 1480
1065
+ },
1066
+ {
1067
+ "epoch": 0.3337182852143482,
1068
+ "grad_norm": 1.3274339437484741,
1069
+ "learning_rate": 8.337066069428892e-06,
1070
+ "loss": 0.2333,
1071
+ "step": 1490
1072
+ },
1073
+ {
1074
+ "epoch": 0.3359580052493438,
1075
+ "grad_norm": 2.7458624839782715,
1076
+ "learning_rate": 8.393057110862262e-06,
1077
+ "loss": 0.2099,
1078
+ "step": 1500
1079
+ },
1080
+ {
1081
+ "epoch": 0.33819772528433945,
1082
+ "grad_norm": 2.586228847503662,
1083
+ "learning_rate": 8.449048152295634e-06,
1084
+ "loss": 0.2487,
1085
+ "step": 1510
1086
+ },
1087
+ {
1088
+ "epoch": 0.34043744531933506,
1089
+ "grad_norm": 4.21591329574585,
1090
+ "learning_rate": 8.505039193729004e-06,
1091
+ "loss": 0.2232,
1092
+ "step": 1520
1093
+ },
1094
+ {
1095
+ "epoch": 0.3426771653543307,
1096
+ "grad_norm": 10.471442222595215,
1097
+ "learning_rate": 8.561030235162374e-06,
1098
+ "loss": 0.243,
1099
+ "step": 1530
1100
+ },
1101
+ {
1102
+ "epoch": 0.34491688538932636,
1103
+ "grad_norm": 2.1316909790039062,
1104
+ "learning_rate": 8.617021276595746e-06,
1105
+ "loss": 0.2245,
1106
+ "step": 1540
1107
+ },
1108
+ {
1109
+ "epoch": 0.347156605424322,
1110
+ "grad_norm": 2.768832206726074,
1111
+ "learning_rate": 8.673012318029116e-06,
1112
+ "loss": 0.231,
1113
+ "step": 1550
1114
+ },
1115
+ {
1116
+ "epoch": 0.3493963254593176,
1117
+ "grad_norm": 1.221541404724121,
1118
+ "learning_rate": 8.729003359462487e-06,
1119
+ "loss": 0.227,
1120
+ "step": 1560
1121
+ },
1122
+ {
1123
+ "epoch": 0.3516360454943132,
1124
+ "grad_norm": 7.449015140533447,
1125
+ "learning_rate": 8.784994400895857e-06,
1126
+ "loss": 0.2445,
1127
+ "step": 1570
1128
+ },
1129
+ {
1130
+ "epoch": 0.35387576552930883,
1131
+ "grad_norm": 3.9796910285949707,
1132
+ "learning_rate": 8.840985442329229e-06,
1133
+ "loss": 0.2312,
1134
+ "step": 1580
1135
+ },
1136
+ {
1137
+ "epoch": 0.35611548556430445,
1138
+ "grad_norm": 4.768671989440918,
1139
+ "learning_rate": 8.896976483762599e-06,
1140
+ "loss": 0.2371,
1141
+ "step": 1590
1142
+ },
1143
+ {
1144
+ "epoch": 0.35835520559930006,
1145
+ "grad_norm": 2.461826801300049,
1146
+ "learning_rate": 8.952967525195969e-06,
1147
+ "loss": 0.2342,
1148
+ "step": 1600
1149
+ },
1150
+ {
1151
+ "epoch": 0.36059492563429574,
1152
+ "grad_norm": 15.604554176330566,
1153
+ "learning_rate": 9.00895856662934e-06,
1154
+ "loss": 0.2639,
1155
+ "step": 1610
1156
+ },
1157
+ {
1158
+ "epoch": 0.36283464566929136,
1159
+ "grad_norm": 10.057650566101074,
1160
+ "learning_rate": 9.06494960806271e-06,
1161
+ "loss": 0.2364,
1162
+ "step": 1620
1163
+ },
1164
+ {
1165
+ "epoch": 0.365074365704287,
1166
+ "grad_norm": 1.363266110420227,
1167
+ "learning_rate": 9.120940649496082e-06,
1168
+ "loss": 0.2284,
1169
+ "step": 1630
1170
+ },
1171
+ {
1172
+ "epoch": 0.3673140857392826,
1173
+ "grad_norm": 6.2388763427734375,
1174
+ "learning_rate": 9.176931690929452e-06,
1175
+ "loss": 0.2363,
1176
+ "step": 1640
1177
+ },
1178
+ {
1179
+ "epoch": 0.3695538057742782,
1180
+ "grad_norm": 5.490090370178223,
1181
+ "learning_rate": 9.232922732362822e-06,
1182
+ "loss": 0.2274,
1183
+ "step": 1650
1184
+ },
1185
+ {
1186
+ "epoch": 0.37179352580927383,
1187
+ "grad_norm": 4.038967132568359,
1188
+ "learning_rate": 9.288913773796194e-06,
1189
+ "loss": 0.2431,
1190
+ "step": 1660
1191
+ },
1192
+ {
1193
+ "epoch": 0.37403324584426945,
1194
+ "grad_norm": 2.4451630115509033,
1195
+ "learning_rate": 9.344904815229564e-06,
1196
+ "loss": 0.2269,
1197
+ "step": 1670
1198
+ },
1199
+ {
1200
+ "epoch": 0.37627296587926506,
1201
+ "grad_norm": 7.310824871063232,
1202
+ "learning_rate": 9.400895856662936e-06,
1203
+ "loss": 0.2338,
1204
+ "step": 1680
1205
+ },
1206
+ {
1207
+ "epoch": 0.37851268591426074,
1208
+ "grad_norm": 3.421902656555176,
1209
+ "learning_rate": 9.456886898096306e-06,
1210
+ "loss": 0.225,
1211
+ "step": 1690
1212
+ },
1213
+ {
1214
+ "epoch": 0.38075240594925636,
1215
+ "grad_norm": 2.0587453842163086,
1216
+ "learning_rate": 9.512877939529676e-06,
1217
+ "loss": 0.2229,
1218
+ "step": 1700
1219
+ },
1220
+ {
1221
+ "epoch": 0.382992125984252,
1222
+ "grad_norm": 7.844244956970215,
1223
+ "learning_rate": 9.568868980963046e-06,
1224
+ "loss": 0.2471,
1225
+ "step": 1710
1226
+ },
1227
+ {
1228
+ "epoch": 0.3852318460192476,
1229
+ "grad_norm": 2.0231168270111084,
1230
+ "learning_rate": 9.624860022396417e-06,
1231
+ "loss": 0.2265,
1232
+ "step": 1720
1233
+ },
1234
+ {
1235
+ "epoch": 0.3874715660542432,
1236
+ "grad_norm": 4.695873737335205,
1237
+ "learning_rate": 9.680851063829787e-06,
1238
+ "loss": 0.2319,
1239
+ "step": 1730
1240
+ },
1241
+ {
1242
+ "epoch": 0.38971128608923883,
1243
+ "grad_norm": 4.3562703132629395,
1244
+ "learning_rate": 9.736842105263159e-06,
1245
+ "loss": 0.2402,
1246
+ "step": 1740
1247
+ },
1248
+ {
1249
+ "epoch": 0.39195100612423445,
1250
+ "grad_norm": 3.8503875732421875,
1251
+ "learning_rate": 9.79283314669653e-06,
1252
+ "loss": 0.2172,
1253
+ "step": 1750
1254
+ },
1255
+ {
1256
+ "epoch": 0.3941907261592301,
1257
+ "grad_norm": 1.6659972667694092,
1258
+ "learning_rate": 9.848824188129899e-06,
1259
+ "loss": 0.2556,
1260
+ "step": 1760
1261
+ },
1262
+ {
1263
+ "epoch": 0.39643044619422574,
1264
+ "grad_norm": 8.133085250854492,
1265
+ "learning_rate": 9.90481522956327e-06,
1266
+ "loss": 0.2404,
1267
+ "step": 1770
1268
+ },
1269
+ {
1270
+ "epoch": 0.39867016622922136,
1271
+ "grad_norm": 1.0106691122055054,
1272
+ "learning_rate": 9.96080627099664e-06,
1273
+ "loss": 0.2221,
1274
+ "step": 1780
1275
+ },
1276
+ {
1277
+ "epoch": 0.400909886264217,
1278
+ "grad_norm": 4.416316986083984,
1279
+ "learning_rate": 9.999999140094955e-06,
1280
+ "loss": 0.2338,
1281
+ "step": 1790
1282
+ },
1283
+ {
1284
+ "epoch": 0.4031496062992126,
1285
+ "grad_norm": 2.253812074661255,
1286
+ "learning_rate": 9.999983852902361e-06,
1287
+ "loss": 0.224,
1288
+ "step": 1800
1289
+ },
1290
+ {
1291
+ "epoch": 0.4053893263342082,
1292
+ "grad_norm": 3.477606773376465,
1293
+ "learning_rate": 9.999949456775993e-06,
1294
+ "loss": 0.2328,
1295
+ "step": 1810
1296
+ },
1297
+ {
1298
+ "epoch": 0.40762904636920383,
1299
+ "grad_norm": 4.672702789306641,
1300
+ "learning_rate": 9.9998959518473e-06,
1301
+ "loss": 0.2336,
1302
+ "step": 1820
1303
+ },
1304
+ {
1305
+ "epoch": 0.4098687664041995,
1306
+ "grad_norm": 1.8212214708328247,
1307
+ "learning_rate": 9.999823338320772e-06,
1308
+ "loss": 0.2283,
1309
+ "step": 1830
1310
+ },
1311
+ {
1312
+ "epoch": 0.4121084864391951,
1313
+ "grad_norm": 2.800435781478882,
1314
+ "learning_rate": 9.99973161647392e-06,
1315
+ "loss": 0.2379,
1316
+ "step": 1840
1317
+ },
1318
+ {
1319
+ "epoch": 0.41434820647419074,
1320
+ "grad_norm": 7.563694477081299,
1321
+ "learning_rate": 9.999620786657289e-06,
1322
+ "loss": 0.2188,
1323
+ "step": 1850
1324
+ },
1325
+ {
1326
+ "epoch": 0.41658792650918636,
1327
+ "grad_norm": 13.088976860046387,
1328
+ "learning_rate": 9.999490849294448e-06,
1329
+ "loss": 0.2567,
1330
+ "step": 1860
1331
+ },
1332
+ {
1333
+ "epoch": 0.418827646544182,
1334
+ "grad_norm": 1.8773629665374756,
1335
+ "learning_rate": 9.99934180488199e-06,
1336
+ "loss": 0.2316,
1337
+ "step": 1870
1338
+ },
1339
+ {
1340
+ "epoch": 0.4210673665791776,
1341
+ "grad_norm": 3.4965360164642334,
1342
+ "learning_rate": 9.999173653989533e-06,
1343
+ "loss": 0.2355,
1344
+ "step": 1880
1345
+ },
1346
+ {
1347
+ "epoch": 0.4233070866141732,
1348
+ "grad_norm": 3.4962074756622314,
1349
+ "learning_rate": 9.998986397259716e-06,
1350
+ "loss": 0.229,
1351
+ "step": 1890
1352
+ },
1353
+ {
1354
+ "epoch": 0.42554680664916883,
1355
+ "grad_norm": 1.0358535051345825,
1356
+ "learning_rate": 9.998780035408198e-06,
1357
+ "loss": 0.2235,
1358
+ "step": 1900
1359
+ },
1360
+ {
1361
+ "epoch": 0.4277865266841645,
1362
+ "grad_norm": 15.394458770751953,
1363
+ "learning_rate": 9.998554569223652e-06,
1364
+ "loss": 0.2484,
1365
+ "step": 1910
1366
+ },
1367
+ {
1368
+ "epoch": 0.4300262467191601,
1369
+ "grad_norm": 2.1868062019348145,
1370
+ "learning_rate": 9.998309999567764e-06,
1371
+ "loss": 0.2382,
1372
+ "step": 1920
1373
+ },
1374
+ {
1375
+ "epoch": 0.43226596675415574,
1376
+ "grad_norm": 3.1081955432891846,
1377
+ "learning_rate": 9.99804632737523e-06,
1378
+ "loss": 0.229,
1379
+ "step": 1930
1380
+ },
1381
+ {
1382
+ "epoch": 0.43450568678915136,
1383
+ "grad_norm": 1.4906216859817505,
1384
+ "learning_rate": 9.997763553653752e-06,
1385
+ "loss": 0.2319,
1386
+ "step": 1940
1387
+ },
1388
+ {
1389
+ "epoch": 0.436745406824147,
1390
+ "grad_norm": 3.580657482147217,
1391
+ "learning_rate": 9.997461679484034e-06,
1392
+ "loss": 0.2156,
1393
+ "step": 1950
1394
+ },
1395
+ {
1396
+ "epoch": 0.4389851268591426,
1397
+ "grad_norm": 9.583480834960938,
1398
+ "learning_rate": 9.997140706019779e-06,
1399
+ "loss": 0.237,
1400
+ "step": 1960
1401
+ },
1402
+ {
1403
+ "epoch": 0.4412248468941382,
1404
+ "grad_norm": 5.272135257720947,
1405
+ "learning_rate": 9.996800634487685e-06,
1406
+ "loss": 0.227,
1407
+ "step": 1970
1408
+ },
1409
+ {
1410
+ "epoch": 0.4434645669291339,
1411
+ "grad_norm": 2.0222976207733154,
1412
+ "learning_rate": 9.996441466187434e-06,
1413
+ "loss": 0.2317,
1414
+ "step": 1980
1415
+ },
1416
+ {
1417
+ "epoch": 0.4457042869641295,
1418
+ "grad_norm": 1.1668893098831177,
1419
+ "learning_rate": 9.996063202491698e-06,
1420
+ "loss": 0.232,
1421
+ "step": 1990
1422
+ },
1423
+ {
1424
+ "epoch": 0.4479440069991251,
1425
+ "grad_norm": 10.037023544311523,
1426
+ "learning_rate": 9.995665844846119e-06,
1427
+ "loss": 0.2207,
1428
+ "step": 2000
1429
+ },
1430
+ {
1431
+ "epoch": 0.4479440069991251,
1432
+ "eval_loss": 0.23211318254470825,
1433
+ "eval_runtime": 506.8766,
1434
+ "eval_samples_per_second": 237.367,
1435
+ "eval_steps_per_second": 14.836,
1436
+ "eval_token_accuracy": 0.6974161164822881,
1437
+ "step": 2000
1438
+ },
1439
+ {
1440
+ "epoch": 0.45018372703412074,
1441
+ "grad_norm": 2.2029612064361572,
1442
+ "learning_rate": 9.995249394769327e-06,
1443
+ "loss": 0.2383,
1444
+ "step": 2010
1445
+ },
1446
+ {
1447
+ "epoch": 0.45242344706911636,
1448
+ "grad_norm": 9.830987930297852,
1449
+ "learning_rate": 9.994813853852903e-06,
1450
+ "loss": 0.2207,
1451
+ "step": 2020
1452
+ },
1453
+ {
1454
+ "epoch": 0.454663167104112,
1455
+ "grad_norm": 1.8869295120239258,
1456
+ "learning_rate": 9.9943592237614e-06,
1457
+ "loss": 0.2264,
1458
+ "step": 2030
1459
+ },
1460
+ {
1461
+ "epoch": 0.4569028871391076,
1462
+ "grad_norm": 1.761500597000122,
1463
+ "learning_rate": 9.993885506232324e-06,
1464
+ "loss": 0.2285,
1465
+ "step": 2040
1466
+ },
1467
+ {
1468
+ "epoch": 0.4591426071741032,
1469
+ "grad_norm": 4.521030426025391,
1470
+ "learning_rate": 9.993392703076126e-06,
1471
+ "loss": 0.2236,
1472
+ "step": 2050
1473
+ },
1474
+ {
1475
+ "epoch": 0.4613823272090989,
1476
+ "grad_norm": 1.2697912454605103,
1477
+ "learning_rate": 9.9928808161762e-06,
1478
+ "loss": 0.2341,
1479
+ "step": 2060
1480
+ },
1481
+ {
1482
+ "epoch": 0.4636220472440945,
1483
+ "grad_norm": 3.4168875217437744,
1484
+ "learning_rate": 9.992349847488878e-06,
1485
+ "loss": 0.2126,
1486
+ "step": 2070
1487
+ },
1488
+ {
1489
+ "epoch": 0.4658617672790901,
1490
+ "grad_norm": 5.974389553070068,
1491
+ "learning_rate": 9.991799799043413e-06,
1492
+ "loss": 0.2372,
1493
+ "step": 2080
1494
+ },
1495
+ {
1496
+ "epoch": 0.46810148731408574,
1497
+ "grad_norm": 1.9752360582351685,
1498
+ "learning_rate": 9.991230672941982e-06,
1499
+ "loss": 0.2288,
1500
+ "step": 2090
1501
+ },
1502
+ {
1503
+ "epoch": 0.47034120734908136,
1504
+ "grad_norm": 1.7691487073898315,
1505
+ "learning_rate": 9.990642471359668e-06,
1506
+ "loss": 0.2142,
1507
+ "step": 2100
1508
+ },
1509
+ {
1510
+ "epoch": 0.472580927384077,
1511
+ "grad_norm": 4.6645731925964355,
1512
+ "learning_rate": 9.990035196544461e-06,
1513
+ "loss": 0.2403,
1514
+ "step": 2110
1515
+ },
1516
+ {
1517
+ "epoch": 0.4748206474190726,
1518
+ "grad_norm": 2.952207565307617,
1519
+ "learning_rate": 9.989408850817243e-06,
1520
+ "loss": 0.2269,
1521
+ "step": 2120
1522
+ },
1523
+ {
1524
+ "epoch": 0.47706036745406827,
1525
+ "grad_norm": 1.3406009674072266,
1526
+ "learning_rate": 9.988763436571783e-06,
1527
+ "loss": 0.2302,
1528
+ "step": 2130
1529
+ },
1530
+ {
1531
+ "epoch": 0.4793000874890639,
1532
+ "grad_norm": 1.8972488641738892,
1533
+ "learning_rate": 9.98809895627472e-06,
1534
+ "loss": 0.2271,
1535
+ "step": 2140
1536
+ },
1537
+ {
1538
+ "epoch": 0.4815398075240595,
1539
+ "grad_norm": 2.660292863845825,
1540
+ "learning_rate": 9.987415412465568e-06,
1541
+ "loss": 0.2278,
1542
+ "step": 2150
1543
+ },
1544
+ {
1545
+ "epoch": 0.4837795275590551,
1546
+ "grad_norm": 2.4427502155303955,
1547
+ "learning_rate": 9.986712807756695e-06,
1548
+ "loss": 0.2633,
1549
+ "step": 2160
1550
+ },
1551
+ {
1552
+ "epoch": 0.48601924759405074,
1553
+ "grad_norm": 5.147864818572998,
1554
+ "learning_rate": 9.98599114483331e-06,
1555
+ "loss": 0.23,
1556
+ "step": 2170
1557
+ },
1558
+ {
1559
+ "epoch": 0.48825896762904636,
1560
+ "grad_norm": 5.563263893127441,
1561
+ "learning_rate": 9.98525042645347e-06,
1562
+ "loss": 0.2378,
1563
+ "step": 2180
1564
+ },
1565
+ {
1566
+ "epoch": 0.490498687664042,
1567
+ "grad_norm": 6.136834621429443,
1568
+ "learning_rate": 9.984490655448049e-06,
1569
+ "loss": 0.2303,
1570
+ "step": 2190
1571
+ },
1572
+ {
1573
+ "epoch": 0.4927384076990376,
1574
+ "grad_norm": 2.446849822998047,
1575
+ "learning_rate": 9.983711834720738e-06,
1576
+ "loss": 0.2234,
1577
+ "step": 2200
1578
+ },
1579
+ {
1580
+ "epoch": 0.49497812773403327,
1581
+ "grad_norm": 4.507411956787109,
1582
+ "learning_rate": 9.982913967248035e-06,
1583
+ "loss": 0.2462,
1584
+ "step": 2210
1585
+ },
1586
+ {
1587
+ "epoch": 0.4972178477690289,
1588
+ "grad_norm": 1.4177577495574951,
1589
+ "learning_rate": 9.982097056079228e-06,
1590
+ "loss": 0.2254,
1591
+ "step": 2220
1592
+ },
1593
+ {
1594
+ "epoch": 0.4994575678040245,
1595
+ "grad_norm": 7.209784030914307,
1596
+ "learning_rate": 9.981261104336389e-06,
1597
+ "loss": 0.2264,
1598
+ "step": 2230
1599
+ },
1600
+ {
1601
+ "epoch": 0.5016972878390201,
1602
+ "grad_norm": 18.120464324951172,
1603
+ "learning_rate": 9.980406115214353e-06,
1604
+ "loss": 0.2385,
1605
+ "step": 2240
1606
+ },
1607
+ {
1608
+ "epoch": 0.5039370078740157,
1609
+ "grad_norm": 1.6847277879714966,
1610
+ "learning_rate": 9.979532091980723e-06,
1611
+ "loss": 0.2201,
1612
+ "step": 2250
1613
+ },
1614
+ {
1615
+ "epoch": 0.5061767279090114,
1616
+ "grad_norm": 2.8450379371643066,
1617
+ "learning_rate": 9.97863903797584e-06,
1618
+ "loss": 0.2419,
1619
+ "step": 2260
1620
+ },
1621
+ {
1622
+ "epoch": 0.508416447944007,
1623
+ "grad_norm": 4.046864032745361,
1624
+ "learning_rate": 9.97772695661277e-06,
1625
+ "loss": 0.2259,
1626
+ "step": 2270
1627
+ },
1628
+ {
1629
+ "epoch": 0.5106561679790026,
1630
+ "grad_norm": 1.2429368495941162,
1631
+ "learning_rate": 9.976795851377312e-06,
1632
+ "loss": 0.2272,
1633
+ "step": 2280
1634
+ },
1635
+ {
1636
+ "epoch": 0.5128958880139982,
1637
+ "grad_norm": 3.9099016189575195,
1638
+ "learning_rate": 9.975845725827959e-06,
1639
+ "loss": 0.232,
1640
+ "step": 2290
1641
+ },
1642
+ {
1643
+ "epoch": 0.5151356080489938,
1644
+ "grad_norm": 1.1243308782577515,
1645
+ "learning_rate": 9.9748765835959e-06,
1646
+ "loss": 0.2198,
1647
+ "step": 2300
1648
+ },
1649
+ {
1650
+ "epoch": 0.5173753280839894,
1651
+ "grad_norm": 5.873220443725586,
1652
+ "learning_rate": 9.973888428385006e-06,
1653
+ "loss": 0.2435,
1654
+ "step": 2310
1655
+ },
1656
+ {
1657
+ "epoch": 0.5196150481189852,
1658
+ "grad_norm": 1.5939319133758545,
1659
+ "learning_rate": 9.972881263971803e-06,
1660
+ "loss": 0.2246,
1661
+ "step": 2320
1662
+ },
1663
+ {
1664
+ "epoch": 0.5218547681539808,
1665
+ "grad_norm": 4.427680969238281,
1666
+ "learning_rate": 9.971855094205473e-06,
1667
+ "loss": 0.2324,
1668
+ "step": 2330
1669
+ },
1670
+ {
1671
+ "epoch": 0.5240944881889764,
1672
+ "grad_norm": 2.819718360900879,
1673
+ "learning_rate": 9.97080992300783e-06,
1674
+ "loss": 0.2497,
1675
+ "step": 2340
1676
+ },
1677
+ {
1678
+ "epoch": 0.526334208223972,
1679
+ "grad_norm": 2.0332233905792236,
1680
+ "learning_rate": 9.969745754373311e-06,
1681
+ "loss": 0.2225,
1682
+ "step": 2350
1683
+ },
1684
+ {
1685
+ "epoch": 0.5285739282589677,
1686
+ "grad_norm": 12.013405799865723,
1687
+ "learning_rate": 9.968662592368952e-06,
1688
+ "loss": 0.2445,
1689
+ "step": 2360
1690
+ },
1691
+ {
1692
+ "epoch": 0.5308136482939633,
1693
+ "grad_norm": 10.702028274536133,
1694
+ "learning_rate": 9.967560441134381e-06,
1695
+ "loss": 0.2164,
1696
+ "step": 2370
1697
+ },
1698
+ {
1699
+ "epoch": 0.5330533683289589,
1700
+ "grad_norm": 5.7824907302856445,
1701
+ "learning_rate": 9.966439304881798e-06,
1702
+ "loss": 0.2213,
1703
+ "step": 2380
1704
+ },
1705
+ {
1706
+ "epoch": 0.5352930883639545,
1707
+ "grad_norm": 5.64496374130249,
1708
+ "learning_rate": 9.965299187895962e-06,
1709
+ "loss": 0.2288,
1710
+ "step": 2390
1711
+ },
1712
+ {
1713
+ "epoch": 0.5375328083989501,
1714
+ "grad_norm": 1.3699864149093628,
1715
+ "learning_rate": 9.964140094534169e-06,
1716
+ "loss": 0.2322,
1717
+ "step": 2400
1718
+ },
1719
+ {
1720
+ "epoch": 0.5397725284339457,
1721
+ "grad_norm": 4.486180305480957,
1722
+ "learning_rate": 9.962962029226244e-06,
1723
+ "loss": 0.2296,
1724
+ "step": 2410
1725
+ },
1726
+ {
1727
+ "epoch": 0.5420122484689414,
1728
+ "grad_norm": 2.6139583587646484,
1729
+ "learning_rate": 9.961764996474514e-06,
1730
+ "loss": 0.2161,
1731
+ "step": 2420
1732
+ },
1733
+ {
1734
+ "epoch": 0.544251968503937,
1735
+ "grad_norm": 1.0381577014923096,
1736
+ "learning_rate": 9.960549000853799e-06,
1737
+ "loss": 0.2275,
1738
+ "step": 2430
1739
+ },
1740
+ {
1741
+ "epoch": 0.5464916885389326,
1742
+ "grad_norm": 2.3962111473083496,
1743
+ "learning_rate": 9.959314047011389e-06,
1744
+ "loss": 0.23,
1745
+ "step": 2440
1746
+ },
1747
+ {
1748
+ "epoch": 0.5487314085739282,
1749
+ "grad_norm": 1.2226966619491577,
1750
+ "learning_rate": 9.958060139667027e-06,
1751
+ "loss": 0.2163,
1752
+ "step": 2450
1753
+ },
1754
+ {
1755
+ "epoch": 0.5509711286089238,
1756
+ "grad_norm": 6.7834062576293945,
1757
+ "learning_rate": 9.9567872836129e-06,
1758
+ "loss": 0.2324,
1759
+ "step": 2460
1760
+ },
1761
+ {
1762
+ "epoch": 0.5532108486439196,
1763
+ "grad_norm": 7.556985378265381,
1764
+ "learning_rate": 9.955495483713604e-06,
1765
+ "loss": 0.2186,
1766
+ "step": 2470
1767
+ },
1768
+ {
1769
+ "epoch": 0.5554505686789152,
1770
+ "grad_norm": 6.278934955596924,
1771
+ "learning_rate": 9.954184744906139e-06,
1772
+ "loss": 0.232,
1773
+ "step": 2480
1774
+ },
1775
+ {
1776
+ "epoch": 0.5576902887139108,
1777
+ "grad_norm": 1.6271597146987915,
1778
+ "learning_rate": 9.95285507219989e-06,
1779
+ "loss": 0.2324,
1780
+ "step": 2490
1781
+ },
1782
+ {
1783
+ "epoch": 0.5599300087489064,
1784
+ "grad_norm": 3.6434125900268555,
1785
+ "learning_rate": 9.951506470676592e-06,
1786
+ "loss": 0.2182,
1787
+ "step": 2500
1788
+ },
1789
+ {
1790
+ "epoch": 0.562169728783902,
1791
+ "grad_norm": 1.8215718269348145,
1792
+ "learning_rate": 9.950138945490335e-06,
1793
+ "loss": 0.2353,
1794
+ "step": 2510
1795
+ },
1796
+ {
1797
+ "epoch": 0.5644094488188977,
1798
+ "grad_norm": 3.433577299118042,
1799
+ "learning_rate": 9.948752501867522e-06,
1800
+ "loss": 0.2257,
1801
+ "step": 2520
1802
+ },
1803
+ {
1804
+ "epoch": 0.5666491688538933,
1805
+ "grad_norm": 4.02660608291626,
1806
+ "learning_rate": 9.947347145106865e-06,
1807
+ "loss": 0.2192,
1808
+ "step": 2530
1809
+ },
1810
+ {
1811
+ "epoch": 0.5688888888888889,
1812
+ "grad_norm": 1.360437273979187,
1813
+ "learning_rate": 9.945922880579351e-06,
1814
+ "loss": 0.2199,
1815
+ "step": 2540
1816
+ },
1817
+ {
1818
+ "epoch": 0.5711286089238845,
1819
+ "grad_norm": 1.155464768409729,
1820
+ "learning_rate": 9.944479713728237e-06,
1821
+ "loss": 0.1921,
1822
+ "step": 2550
1823
+ },
1824
+ {
1825
+ "epoch": 0.5733683289588801,
1826
+ "grad_norm": 7.086646556854248,
1827
+ "learning_rate": 9.943017650069013e-06,
1828
+ "loss": 0.242,
1829
+ "step": 2560
1830
+ },
1831
+ {
1832
+ "epoch": 0.5756080489938757,
1833
+ "grad_norm": 4.299362659454346,
1834
+ "learning_rate": 9.941536695189396e-06,
1835
+ "loss": 0.228,
1836
+ "step": 2570
1837
+ },
1838
+ {
1839
+ "epoch": 0.5778477690288714,
1840
+ "grad_norm": 8.915771484375,
1841
+ "learning_rate": 9.940036854749297e-06,
1842
+ "loss": 0.2308,
1843
+ "step": 2580
1844
+ },
1845
+ {
1846
+ "epoch": 0.580087489063867,
1847
+ "grad_norm": 2.6741573810577393,
1848
+ "learning_rate": 9.938518134480803e-06,
1849
+ "loss": 0.2398,
1850
+ "step": 2590
1851
+ },
1852
+ {
1853
+ "epoch": 0.5823272090988626,
1854
+ "grad_norm": 2.5663797855377197,
1855
+ "learning_rate": 9.93698054018816e-06,
1856
+ "loss": 0.2077,
1857
+ "step": 2600
1858
+ },
1859
+ {
1860
+ "epoch": 0.5845669291338582,
1861
+ "grad_norm": 2.6795785427093506,
1862
+ "learning_rate": 9.935424077747744e-06,
1863
+ "loss": 0.2367,
1864
+ "step": 2610
1865
+ },
1866
+ {
1867
+ "epoch": 0.5868066491688539,
1868
+ "grad_norm": 3.2071633338928223,
1869
+ "learning_rate": 9.933848753108041e-06,
1870
+ "loss": 0.2106,
1871
+ "step": 2620
1872
+ },
1873
+ {
1874
+ "epoch": 0.5890463692038496,
1875
+ "grad_norm": 3.1179144382476807,
1876
+ "learning_rate": 9.932254572289626e-06,
1877
+ "loss": 0.2446,
1878
+ "step": 2630
1879
+ },
1880
+ {
1881
+ "epoch": 0.5912860892388452,
1882
+ "grad_norm": 1.6278126239776611,
1883
+ "learning_rate": 9.930641541385138e-06,
1884
+ "loss": 0.2211,
1885
+ "step": 2640
1886
+ },
1887
+ {
1888
+ "epoch": 0.5935258092738408,
1889
+ "grad_norm": 5.3661885261535645,
1890
+ "learning_rate": 9.929009666559255e-06,
1891
+ "loss": 0.2107,
1892
+ "step": 2650
1893
+ },
1894
+ {
1895
+ "epoch": 0.5957655293088364,
1896
+ "grad_norm": 6.771117210388184,
1897
+ "learning_rate": 9.927358954048676e-06,
1898
+ "loss": 0.2399,
1899
+ "step": 2660
1900
+ },
1901
+ {
1902
+ "epoch": 0.598005249343832,
1903
+ "grad_norm": 2.178788185119629,
1904
+ "learning_rate": 9.925689410162095e-06,
1905
+ "loss": 0.21,
1906
+ "step": 2670
1907
+ },
1908
+ {
1909
+ "epoch": 0.6002449693788277,
1910
+ "grad_norm": 8.165093421936035,
1911
+ "learning_rate": 9.92400104128017e-06,
1912
+ "loss": 0.2246,
1913
+ "step": 2680
1914
+ },
1915
+ {
1916
+ "epoch": 0.6024846894138233,
1917
+ "grad_norm": 3.471679449081421,
1918
+ "learning_rate": 9.922293853855509e-06,
1919
+ "loss": 0.2131,
1920
+ "step": 2690
1921
+ },
1922
+ {
1923
+ "epoch": 0.6047244094488189,
1924
+ "grad_norm": 2.9488112926483154,
1925
+ "learning_rate": 9.92056785441264e-06,
1926
+ "loss": 0.2245,
1927
+ "step": 2700
1928
+ },
1929
+ {
1930
+ "epoch": 0.6069641294838145,
1931
+ "grad_norm": 5.4626145362854,
1932
+ "learning_rate": 9.918823049547984e-06,
1933
+ "loss": 0.2444,
1934
+ "step": 2710
1935
+ },
1936
+ {
1937
+ "epoch": 0.6092038495188101,
1938
+ "grad_norm": 1.8659770488739014,
1939
+ "learning_rate": 9.917059445929838e-06,
1940
+ "loss": 0.2226,
1941
+ "step": 2720
1942
+ },
1943
+ {
1944
+ "epoch": 0.6114435695538057,
1945
+ "grad_norm": 2.041576385498047,
1946
+ "learning_rate": 9.915277050298336e-06,
1947
+ "loss": 0.2401,
1948
+ "step": 2730
1949
+ },
1950
+ {
1951
+ "epoch": 0.6136832895888014,
1952
+ "grad_norm": 7.602120399475098,
1953
+ "learning_rate": 9.913475869465442e-06,
1954
+ "loss": 0.2105,
1955
+ "step": 2740
1956
+ },
1957
+ {
1958
+ "epoch": 0.615923009623797,
1959
+ "grad_norm": 1.5145801305770874,
1960
+ "learning_rate": 9.911655910314901e-06,
1961
+ "loss": 0.2168,
1962
+ "step": 2750
1963
+ },
1964
+ {
1965
+ "epoch": 0.6181627296587926,
1966
+ "grad_norm": 2.4858598709106445,
1967
+ "learning_rate": 9.909817179802234e-06,
1968
+ "loss": 0.2363,
1969
+ "step": 2760
1970
+ },
1971
+ {
1972
+ "epoch": 0.6204024496937883,
1973
+ "grad_norm": 2.1684751510620117,
1974
+ "learning_rate": 9.907959684954702e-06,
1975
+ "loss": 0.2234,
1976
+ "step": 2770
1977
+ },
1978
+ {
1979
+ "epoch": 0.622642169728784,
1980
+ "grad_norm": 1.7832976579666138,
1981
+ "learning_rate": 9.906083432871273e-06,
1982
+ "loss": 0.2282,
1983
+ "step": 2780
1984
+ },
1985
+ {
1986
+ "epoch": 0.6248818897637796,
1987
+ "grad_norm": 6.974465370178223,
1988
+ "learning_rate": 9.90418843072261e-06,
1989
+ "loss": 0.2264,
1990
+ "step": 2790
1991
+ },
1992
+ {
1993
+ "epoch": 0.6271216097987752,
1994
+ "grad_norm": 4.383815765380859,
1995
+ "learning_rate": 9.902274685751027e-06,
1996
+ "loss": 0.2145,
1997
+ "step": 2800
1998
+ },
1999
+ {
2000
+ "epoch": 0.6293613298337708,
2001
+ "grad_norm": 1.6938287019729614,
2002
+ "learning_rate": 9.900342205270475e-06,
2003
+ "loss": 0.2388,
2004
+ "step": 2810
2005
+ },
2006
+ {
2007
+ "epoch": 0.6316010498687664,
2008
+ "grad_norm": 14.980836868286133,
2009
+ "learning_rate": 9.898390996666502e-06,
2010
+ "loss": 0.2289,
2011
+ "step": 2820
2012
+ },
2013
+ {
2014
+ "epoch": 0.633840769903762,
2015
+ "grad_norm": 6.246406555175781,
2016
+ "learning_rate": 9.89642106739624e-06,
2017
+ "loss": 0.2319,
2018
+ "step": 2830
2019
+ },
2020
+ {
2021
+ "epoch": 0.6360804899387577,
2022
+ "grad_norm": 7.479663848876953,
2023
+ "learning_rate": 9.894432424988363e-06,
2024
+ "loss": 0.2224,
2025
+ "step": 2840
2026
+ },
2027
+ {
2028
+ "epoch": 0.6383202099737533,
2029
+ "grad_norm": 6.564876556396484,
2030
+ "learning_rate": 9.892425077043058e-06,
2031
+ "loss": 0.2185,
2032
+ "step": 2850
2033
+ },
2034
+ {
2035
+ "epoch": 0.6405599300087489,
2036
+ "grad_norm": 5.287945747375488,
2037
+ "learning_rate": 9.89039903123201e-06,
2038
+ "loss": 0.2352,
2039
+ "step": 2860
2040
+ },
2041
+ {
2042
+ "epoch": 0.6427996500437445,
2043
+ "grad_norm": 1.6189258098602295,
2044
+ "learning_rate": 9.888354295298356e-06,
2045
+ "loss": 0.2107,
2046
+ "step": 2870
2047
+ },
2048
+ {
2049
+ "epoch": 0.6450393700787401,
2050
+ "grad_norm": 1.7470353841781616,
2051
+ "learning_rate": 9.88629087705667e-06,
2052
+ "loss": 0.2284,
2053
+ "step": 2880
2054
+ },
2055
+ {
2056
+ "epoch": 0.6472790901137357,
2057
+ "grad_norm": 6.374598503112793,
2058
+ "learning_rate": 9.884208784392917e-06,
2059
+ "loss": 0.2295,
2060
+ "step": 2890
2061
+ },
2062
+ {
2063
+ "epoch": 0.6495188101487314,
2064
+ "grad_norm": 3.07970929145813,
2065
+ "learning_rate": 9.882108025264442e-06,
2066
+ "loss": 0.2199,
2067
+ "step": 2900
2068
+ },
2069
+ {
2070
+ "epoch": 0.651758530183727,
2071
+ "grad_norm": 5.308441638946533,
2072
+ "learning_rate": 9.87998860769992e-06,
2073
+ "loss": 0.2246,
2074
+ "step": 2910
2075
+ },
2076
+ {
2077
+ "epoch": 0.6539982502187227,
2078
+ "grad_norm": 8.279508590698242,
2079
+ "learning_rate": 9.877850539799341e-06,
2080
+ "loss": 0.2222,
2081
+ "step": 2920
2082
+ },
2083
+ {
2084
+ "epoch": 0.6562379702537183,
2085
+ "grad_norm": 6.9987006187438965,
2086
+ "learning_rate": 9.87569382973397e-06,
2087
+ "loss": 0.2319,
2088
+ "step": 2930
2089
+ },
2090
+ {
2091
+ "epoch": 0.658477690288714,
2092
+ "grad_norm": 4.786025524139404,
2093
+ "learning_rate": 9.873518485746321e-06,
2094
+ "loss": 0.2257,
2095
+ "step": 2940
2096
+ },
2097
+ {
2098
+ "epoch": 0.6607174103237096,
2099
+ "grad_norm": 3.0393166542053223,
2100
+ "learning_rate": 9.871324516150123e-06,
2101
+ "loss": 0.2153,
2102
+ "step": 2950
2103
+ },
2104
+ {
2105
+ "epoch": 0.6629571303587052,
2106
+ "grad_norm": 3.177739143371582,
2107
+ "learning_rate": 9.869111929330282e-06,
2108
+ "loss": 0.2232,
2109
+ "step": 2960
2110
+ },
2111
+ {
2112
+ "epoch": 0.6651968503937008,
2113
+ "grad_norm": 2.291867971420288,
2114
+ "learning_rate": 9.866880733742865e-06,
2115
+ "loss": 0.212,
2116
+ "step": 2970
2117
+ },
2118
+ {
2119
+ "epoch": 0.6674365704286964,
2120
+ "grad_norm": 3.9312686920166016,
2121
+ "learning_rate": 9.864630937915052e-06,
2122
+ "loss": 0.2262,
2123
+ "step": 2980
2124
+ },
2125
+ {
2126
+ "epoch": 0.669676290463692,
2127
+ "grad_norm": 2.110924482345581,
2128
+ "learning_rate": 9.862362550445108e-06,
2129
+ "loss": 0.2274,
2130
+ "step": 2990
2131
+ },
2132
+ {
2133
+ "epoch": 0.6719160104986877,
2134
+ "grad_norm": 1.4342049360275269,
2135
+ "learning_rate": 9.860075580002359e-06,
2136
+ "loss": 0.2171,
2137
+ "step": 3000
2138
+ },
2139
+ {
2140
+ "epoch": 0.6719160104986877,
2141
+ "eval_loss": 0.21779567003250122,
2142
+ "eval_runtime": 508.7876,
2143
+ "eval_samples_per_second": 236.476,
2144
+ "eval_steps_per_second": 14.78,
2145
+ "eval_token_accuracy": 0.7062596501325423,
2146
+ "step": 3000
2147
+ },
2148
+ {
2149
+ "epoch": 0.6741557305336833,
2150
+ "grad_norm": 1.5321989059448242,
2151
+ "learning_rate": 9.857770035327142e-06,
2152
+ "loss": 0.2255,
2153
+ "step": 3010
2154
+ },
2155
+ {
2156
+ "epoch": 0.6763954505686789,
2157
+ "grad_norm": 9.719202041625977,
2158
+ "learning_rate": 9.85544592523079e-06,
2159
+ "loss": 0.2185,
2160
+ "step": 3020
2161
+ },
2162
+ {
2163
+ "epoch": 0.6786351706036745,
2164
+ "grad_norm": 8.409933090209961,
2165
+ "learning_rate": 9.853103258595581e-06,
2166
+ "loss": 0.2295,
2167
+ "step": 3030
2168
+ },
2169
+ {
2170
+ "epoch": 0.6808748906386701,
2171
+ "grad_norm": 1.9369258880615234,
2172
+ "learning_rate": 9.85074204437472e-06,
2173
+ "loss": 0.2283,
2174
+ "step": 3040
2175
+ },
2176
+ {
2177
+ "epoch": 0.6831146106736657,
2178
+ "grad_norm": 2.4668948650360107,
2179
+ "learning_rate": 9.848362291592288e-06,
2180
+ "loss": 0.2123,
2181
+ "step": 3050
2182
+ },
2183
+ {
2184
+ "epoch": 0.6853543307086614,
2185
+ "grad_norm": 2.983069658279419,
2186
+ "learning_rate": 9.845964009343228e-06,
2187
+ "loss": 0.2315,
2188
+ "step": 3060
2189
+ },
2190
+ {
2191
+ "epoch": 0.6875940507436571,
2192
+ "grad_norm": 3.6539225578308105,
2193
+ "learning_rate": 9.843547206793289e-06,
2194
+ "loss": 0.2202,
2195
+ "step": 3070
2196
+ },
2197
+ {
2198
+ "epoch": 0.6898337707786527,
2199
+ "grad_norm": 9.64335823059082,
2200
+ "learning_rate": 9.841111893179006e-06,
2201
+ "loss": 0.2149,
2202
+ "step": 3080
2203
+ },
2204
+ {
2205
+ "epoch": 0.6920734908136483,
2206
+ "grad_norm": 11.048383712768555,
2207
+ "learning_rate": 9.838658077807657e-06,
2208
+ "loss": 0.2334,
2209
+ "step": 3090
2210
+ },
2211
+ {
2212
+ "epoch": 0.694313210848644,
2213
+ "grad_norm": 7.029510021209717,
2214
+ "learning_rate": 9.836185770057234e-06,
2215
+ "loss": 0.2093,
2216
+ "step": 3100
2217
+ },
2218
+ {
2219
+ "epoch": 0.6965529308836396,
2220
+ "grad_norm": 2.615210771560669,
2221
+ "learning_rate": 9.833694979376398e-06,
2222
+ "loss": 0.2294,
2223
+ "step": 3110
2224
+ },
2225
+ {
2226
+ "epoch": 0.6987926509186352,
2227
+ "grad_norm": 9.237405776977539,
2228
+ "learning_rate": 9.831185715284452e-06,
2229
+ "loss": 0.2225,
2230
+ "step": 3120
2231
+ },
2232
+ {
2233
+ "epoch": 0.7010323709536308,
2234
+ "grad_norm": 2.880690097808838,
2235
+ "learning_rate": 9.828657987371301e-06,
2236
+ "loss": 0.2291,
2237
+ "step": 3130
2238
+ },
2239
+ {
2240
+ "epoch": 0.7032720909886264,
2241
+ "grad_norm": 4.929702281951904,
2242
+ "learning_rate": 9.826111805297409e-06,
2243
+ "loss": 0.2338,
2244
+ "step": 3140
2245
+ },
2246
+ {
2247
+ "epoch": 0.705511811023622,
2248
+ "grad_norm": 1.8025418519973755,
2249
+ "learning_rate": 9.823547178793775e-06,
2250
+ "loss": 0.2087,
2251
+ "step": 3150
2252
+ },
2253
+ {
2254
+ "epoch": 0.7077515310586177,
2255
+ "grad_norm": 10.105000495910645,
2256
+ "learning_rate": 9.820964117661888e-06,
2257
+ "loss": 0.2376,
2258
+ "step": 3160
2259
+ },
2260
+ {
2261
+ "epoch": 0.7099912510936133,
2262
+ "grad_norm": 10.464491844177246,
2263
+ "learning_rate": 9.818362631773685e-06,
2264
+ "loss": 0.2162,
2265
+ "step": 3170
2266
+ },
2267
+ {
2268
+ "epoch": 0.7122309711286089,
2269
+ "grad_norm": 1.2578132152557373,
2270
+ "learning_rate": 9.815742731071524e-06,
2271
+ "loss": 0.2297,
2272
+ "step": 3180
2273
+ },
2274
+ {
2275
+ "epoch": 0.7144706911636045,
2276
+ "grad_norm": 7.663752555847168,
2277
+ "learning_rate": 9.813104425568138e-06,
2278
+ "loss": 0.2151,
2279
+ "step": 3190
2280
+ },
2281
+ {
2282
+ "epoch": 0.7167104111986001,
2283
+ "grad_norm": 3.7501728534698486,
2284
+ "learning_rate": 9.810447725346604e-06,
2285
+ "loss": 0.2226,
2286
+ "step": 3200
2287
+ },
2288
+ {
2289
+ "epoch": 0.7189501312335957,
2290
+ "grad_norm": 7.99606466293335,
2291
+ "learning_rate": 9.807772640560292e-06,
2292
+ "loss": 0.2327,
2293
+ "step": 3210
2294
+ },
2295
+ {
2296
+ "epoch": 0.7211898512685915,
2297
+ "grad_norm": 4.222293376922607,
2298
+ "learning_rate": 9.805079181432842e-06,
2299
+ "loss": 0.2206,
2300
+ "step": 3220
2301
+ },
2302
+ {
2303
+ "epoch": 0.7234295713035871,
2304
+ "grad_norm": 6.806760311126709,
2305
+ "learning_rate": 9.802367358258114e-06,
2306
+ "loss": 0.2147,
2307
+ "step": 3230
2308
+ },
2309
+ {
2310
+ "epoch": 0.7256692913385827,
2311
+ "grad_norm": 4.147646427154541,
2312
+ "learning_rate": 9.799637181400147e-06,
2313
+ "loss": 0.2097,
2314
+ "step": 3240
2315
+ },
2316
+ {
2317
+ "epoch": 0.7279090113735783,
2318
+ "grad_norm": 1.1662570238113403,
2319
+ "learning_rate": 9.796888661293133e-06,
2320
+ "loss": 0.2096,
2321
+ "step": 3250
2322
+ },
2323
+ {
2324
+ "epoch": 0.730148731408574,
2325
+ "grad_norm": 6.412913799285889,
2326
+ "learning_rate": 9.794121808441361e-06,
2327
+ "loss": 0.2361,
2328
+ "step": 3260
2329
+ },
2330
+ {
2331
+ "epoch": 0.7323884514435696,
2332
+ "grad_norm": 1.2370624542236328,
2333
+ "learning_rate": 9.791336633419189e-06,
2334
+ "loss": 0.2041,
2335
+ "step": 3270
2336
+ },
2337
+ {
2338
+ "epoch": 0.7346281714785652,
2339
+ "grad_norm": 0.9865597486495972,
2340
+ "learning_rate": 9.788533146870995e-06,
2341
+ "loss": 0.2199,
2342
+ "step": 3280
2343
+ },
2344
+ {
2345
+ "epoch": 0.7368678915135608,
2346
+ "grad_norm": 4.455222129821777,
2347
+ "learning_rate": 9.78571135951114e-06,
2348
+ "loss": 0.22,
2349
+ "step": 3290
2350
+ },
2351
+ {
2352
+ "epoch": 0.7391076115485564,
2353
+ "grad_norm": 4.656614780426025,
2354
+ "learning_rate": 9.782871282123932e-06,
2355
+ "loss": 0.2182,
2356
+ "step": 3300
2357
+ },
2358
+ {
2359
+ "epoch": 0.741347331583552,
2360
+ "grad_norm": 3.1240029335021973,
2361
+ "learning_rate": 9.780012925563573e-06,
2362
+ "loss": 0.2307,
2363
+ "step": 3310
2364
+ },
2365
+ {
2366
+ "epoch": 0.7435870516185477,
2367
+ "grad_norm": 3.156013011932373,
2368
+ "learning_rate": 9.777136300754129e-06,
2369
+ "loss": 0.2141,
2370
+ "step": 3320
2371
+ },
2372
+ {
2373
+ "epoch": 0.7458267716535433,
2374
+ "grad_norm": 4.594078540802002,
2375
+ "learning_rate": 9.774241418689481e-06,
2376
+ "loss": 0.2338,
2377
+ "step": 3330
2378
+ },
2379
+ {
2380
+ "epoch": 0.7480664916885389,
2381
+ "grad_norm": 6.030776023864746,
2382
+ "learning_rate": 9.771328290433287e-06,
2383
+ "loss": 0.222,
2384
+ "step": 3340
2385
+ },
2386
+ {
2387
+ "epoch": 0.7503062117235345,
2388
+ "grad_norm": 3.0741419792175293,
2389
+ "learning_rate": 9.768396927118939e-06,
2390
+ "loss": 0.2051,
2391
+ "step": 3350
2392
+ },
2393
+ {
2394
+ "epoch": 0.7525459317585301,
2395
+ "grad_norm": 1.915945053100586,
2396
+ "learning_rate": 9.76544733994952e-06,
2397
+ "loss": 0.2241,
2398
+ "step": 3360
2399
+ },
2400
+ {
2401
+ "epoch": 0.7547856517935259,
2402
+ "grad_norm": 6.906998634338379,
2403
+ "learning_rate": 9.762479540197753e-06,
2404
+ "loss": 0.2111,
2405
+ "step": 3370
2406
+ },
2407
+ {
2408
+ "epoch": 0.7570253718285215,
2409
+ "grad_norm": 1.6207234859466553,
2410
+ "learning_rate": 9.75949353920598e-06,
2411
+ "loss": 0.2209,
2412
+ "step": 3380
2413
+ },
2414
+ {
2415
+ "epoch": 0.7592650918635171,
2416
+ "grad_norm": 1.919848084449768,
2417
+ "learning_rate": 9.756489348386093e-06,
2418
+ "loss": 0.2238,
2419
+ "step": 3390
2420
+ },
2421
+ {
2422
+ "epoch": 0.7615048118985127,
2423
+ "grad_norm": 1.796086072921753,
2424
+ "learning_rate": 9.753466979219507e-06,
2425
+ "loss": 0.2103,
2426
+ "step": 3400
2427
+ },
2428
+ {
2429
+ "epoch": 0.7637445319335083,
2430
+ "grad_norm": 1.6984485387802124,
2431
+ "learning_rate": 9.75042644325711e-06,
2432
+ "loss": 0.2229,
2433
+ "step": 3410
2434
+ },
2435
+ {
2436
+ "epoch": 0.765984251968504,
2437
+ "grad_norm": 4.693706035614014,
2438
+ "learning_rate": 9.747367752119216e-06,
2439
+ "loss": 0.222,
2440
+ "step": 3420
2441
+ },
2442
+ {
2443
+ "epoch": 0.7682239720034996,
2444
+ "grad_norm": 3.6621434688568115,
2445
+ "learning_rate": 9.744290917495535e-06,
2446
+ "loss": 0.2253,
2447
+ "step": 3430
2448
+ },
2449
+ {
2450
+ "epoch": 0.7704636920384952,
2451
+ "grad_norm": 4.475337505340576,
2452
+ "learning_rate": 9.741195951145105e-06,
2453
+ "loss": 0.2198,
2454
+ "step": 3440
2455
+ },
2456
+ {
2457
+ "epoch": 0.7727034120734908,
2458
+ "grad_norm": 6.554020404815674,
2459
+ "learning_rate": 9.738082864896267e-06,
2460
+ "loss": 0.2082,
2461
+ "step": 3450
2462
+ },
2463
+ {
2464
+ "epoch": 0.7749431321084864,
2465
+ "grad_norm": 1.775171160697937,
2466
+ "learning_rate": 9.734951670646612e-06,
2467
+ "loss": 0.2345,
2468
+ "step": 3460
2469
+ },
2470
+ {
2471
+ "epoch": 0.777182852143482,
2472
+ "grad_norm": 10.973596572875977,
2473
+ "learning_rate": 9.731802380362936e-06,
2474
+ "loss": 0.2201,
2475
+ "step": 3470
2476
+ },
2477
+ {
2478
+ "epoch": 0.7794225721784777,
2479
+ "grad_norm": 3.4094762802124023,
2480
+ "learning_rate": 9.728635006081191e-06,
2481
+ "loss": 0.2299,
2482
+ "step": 3480
2483
+ },
2484
+ {
2485
+ "epoch": 0.7816622922134733,
2486
+ "grad_norm": 3.6424269676208496,
2487
+ "learning_rate": 9.725449559906448e-06,
2488
+ "loss": 0.2235,
2489
+ "step": 3490
2490
+ },
2491
+ {
2492
+ "epoch": 0.7839020122484689,
2493
+ "grad_norm": 4.4308319091796875,
2494
+ "learning_rate": 9.72224605401284e-06,
2495
+ "loss": 0.2117,
2496
+ "step": 3500
2497
+ },
2498
+ {
2499
+ "epoch": 0.7861417322834645,
2500
+ "grad_norm": 5.900296688079834,
2501
+ "learning_rate": 9.719024500643526e-06,
2502
+ "loss": 0.2298,
2503
+ "step": 3510
2504
+ },
2505
+ {
2506
+ "epoch": 0.7883814523184602,
2507
+ "grad_norm": 4.9092020988464355,
2508
+ "learning_rate": 9.715784912110632e-06,
2509
+ "loss": 0.2181,
2510
+ "step": 3520
2511
+ },
2512
+ {
2513
+ "epoch": 0.7906211723534559,
2514
+ "grad_norm": 3.5412979125976562,
2515
+ "learning_rate": 9.712527300795218e-06,
2516
+ "loss": 0.2213,
2517
+ "step": 3530
2518
+ },
2519
+ {
2520
+ "epoch": 0.7928608923884515,
2521
+ "grad_norm": 2.001018524169922,
2522
+ "learning_rate": 9.709251679147217e-06,
2523
+ "loss": 0.2176,
2524
+ "step": 3540
2525
+ },
2526
+ {
2527
+ "epoch": 0.7951006124234471,
2528
+ "grad_norm": 1.459351897239685,
2529
+ "learning_rate": 9.705958059685399e-06,
2530
+ "loss": 0.212,
2531
+ "step": 3550
2532
+ },
2533
+ {
2534
+ "epoch": 0.7973403324584427,
2535
+ "grad_norm": 2.8420157432556152,
2536
+ "learning_rate": 9.702646454997315e-06,
2537
+ "loss": 0.2299,
2538
+ "step": 3560
2539
+ },
2540
+ {
2541
+ "epoch": 0.7995800524934383,
2542
+ "grad_norm": 3.9574944972991943,
2543
+ "learning_rate": 9.699316877739253e-06,
2544
+ "loss": 0.204,
2545
+ "step": 3570
2546
+ },
2547
+ {
2548
+ "epoch": 0.801819772528434,
2549
+ "grad_norm": 1.4476044178009033,
2550
+ "learning_rate": 9.695969340636189e-06,
2551
+ "loss": 0.2102,
2552
+ "step": 3580
2553
+ },
2554
+ {
2555
+ "epoch": 0.8040594925634296,
2556
+ "grad_norm": 2.721195936203003,
2557
+ "learning_rate": 9.692603856481736e-06,
2558
+ "loss": 0.2274,
2559
+ "step": 3590
2560
+ },
2561
+ {
2562
+ "epoch": 0.8062992125984252,
2563
+ "grad_norm": 3.3109710216522217,
2564
+ "learning_rate": 9.689220438138099e-06,
2565
+ "loss": 0.2086,
2566
+ "step": 3600
2567
+ },
2568
+ {
2569
+ "epoch": 0.8085389326334208,
2570
+ "grad_norm": 6.931108474731445,
2571
+ "learning_rate": 9.685819098536024e-06,
2572
+ "loss": 0.223,
2573
+ "step": 3610
2574
+ },
2575
+ {
2576
+ "epoch": 0.8107786526684164,
2577
+ "grad_norm": 5.233217239379883,
2578
+ "learning_rate": 9.682399850674745e-06,
2579
+ "loss": 0.2303,
2580
+ "step": 3620
2581
+ },
2582
+ {
2583
+ "epoch": 0.813018372703412,
2584
+ "grad_norm": 1.7679535150527954,
2585
+ "learning_rate": 9.678962707621944e-06,
2586
+ "loss": 0.2211,
2587
+ "step": 3630
2588
+ },
2589
+ {
2590
+ "epoch": 0.8152580927384077,
2591
+ "grad_norm": 2.661332368850708,
2592
+ "learning_rate": 9.675507682513687e-06,
2593
+ "loss": 0.2066,
2594
+ "step": 3640
2595
+ },
2596
+ {
2597
+ "epoch": 0.8174978127734033,
2598
+ "grad_norm": 0.9035616517066956,
2599
+ "learning_rate": 9.67203478855439e-06,
2600
+ "loss": 0.2112,
2601
+ "step": 3650
2602
+ },
2603
+ {
2604
+ "epoch": 0.819737532808399,
2605
+ "grad_norm": 4.500796318054199,
2606
+ "learning_rate": 9.668544039016754e-06,
2607
+ "loss": 0.2215,
2608
+ "step": 3660
2609
+ },
2610
+ {
2611
+ "epoch": 0.8219772528433946,
2612
+ "grad_norm": 1.4607346057891846,
2613
+ "learning_rate": 9.665035447241721e-06,
2614
+ "loss": 0.2043,
2615
+ "step": 3670
2616
+ },
2617
+ {
2618
+ "epoch": 0.8242169728783902,
2619
+ "grad_norm": 1.0367200374603271,
2620
+ "learning_rate": 9.661509026638427e-06,
2621
+ "loss": 0.2284,
2622
+ "step": 3680
2623
+ },
2624
+ {
2625
+ "epoch": 0.8264566929133859,
2626
+ "grad_norm": 7.388817310333252,
2627
+ "learning_rate": 9.657964790684143e-06,
2628
+ "loss": 0.2177,
2629
+ "step": 3690
2630
+ },
2631
+ {
2632
+ "epoch": 0.8286964129483815,
2633
+ "grad_norm": 6.648113250732422,
2634
+ "learning_rate": 9.654402752924223e-06,
2635
+ "loss": 0.2108,
2636
+ "step": 3700
2637
+ },
2638
+ {
2639
+ "epoch": 0.8309361329833771,
2640
+ "grad_norm": 3.2974541187286377,
2641
+ "learning_rate": 9.650822926972064e-06,
2642
+ "loss": 0.2163,
2643
+ "step": 3710
2644
+ },
2645
+ {
2646
+ "epoch": 0.8331758530183727,
2647
+ "grad_norm": 1.5554214715957642,
2648
+ "learning_rate": 9.647225326509037e-06,
2649
+ "loss": 0.2127,
2650
+ "step": 3720
2651
+ },
2652
+ {
2653
+ "epoch": 0.8354155730533683,
2654
+ "grad_norm": 1.1680930852890015,
2655
+ "learning_rate": 9.643609965284452e-06,
2656
+ "loss": 0.21,
2657
+ "step": 3730
2658
+ },
2659
+ {
2660
+ "epoch": 0.837655293088364,
2661
+ "grad_norm": 3.262380599975586,
2662
+ "learning_rate": 9.639976857115492e-06,
2663
+ "loss": 0.2234,
2664
+ "step": 3740
2665
+ },
2666
+ {
2667
+ "epoch": 0.8398950131233596,
2668
+ "grad_norm": 1.6106623411178589,
2669
+ "learning_rate": 9.636326015887167e-06,
2670
+ "loss": 0.2102,
2671
+ "step": 3750
2672
+ },
2673
+ {
2674
+ "epoch": 0.8421347331583552,
2675
+ "grad_norm": 3.9737660884857178,
2676
+ "learning_rate": 9.632657455552258e-06,
2677
+ "loss": 0.214,
2678
+ "step": 3760
2679
+ },
2680
+ {
2681
+ "epoch": 0.8443744531933508,
2682
+ "grad_norm": 5.535833358764648,
2683
+ "learning_rate": 9.628971190131266e-06,
2684
+ "loss": 0.2171,
2685
+ "step": 3770
2686
+ },
2687
+ {
2688
+ "epoch": 0.8466141732283464,
2689
+ "grad_norm": 1.5001612901687622,
2690
+ "learning_rate": 9.625267233712357e-06,
2691
+ "loss": 0.2084,
2692
+ "step": 3780
2693
+ },
2694
+ {
2695
+ "epoch": 0.848853893263342,
2696
+ "grad_norm": 3.5858755111694336,
2697
+ "learning_rate": 9.621545600451308e-06,
2698
+ "loss": 0.2122,
2699
+ "step": 3790
2700
+ },
2701
+ {
2702
+ "epoch": 0.8510936132983377,
2703
+ "grad_norm": 2.433242082595825,
2704
+ "learning_rate": 9.617806304571455e-06,
2705
+ "loss": 0.203,
2706
+ "step": 3800
2707
+ },
2708
+ {
2709
+ "epoch": 0.8533333333333334,
2710
+ "grad_norm": 1.7443480491638184,
2711
+ "learning_rate": 9.614049360363632e-06,
2712
+ "loss": 0.2245,
2713
+ "step": 3810
2714
+ },
2715
+ {
2716
+ "epoch": 0.855573053368329,
2717
+ "grad_norm": 10.19050407409668,
2718
+ "learning_rate": 9.610274782186128e-06,
2719
+ "loss": 0.2178,
2720
+ "step": 3820
2721
+ },
2722
+ {
2723
+ "epoch": 0.8578127734033246,
2724
+ "grad_norm": 4.910433769226074,
2725
+ "learning_rate": 9.606482584464622e-06,
2726
+ "loss": 0.219,
2727
+ "step": 3830
2728
+ },
2729
+ {
2730
+ "epoch": 0.8600524934383202,
2731
+ "grad_norm": 4.34425687789917,
2732
+ "learning_rate": 9.602672781692133e-06,
2733
+ "loss": 0.2109,
2734
+ "step": 3840
2735
+ },
2736
+ {
2737
+ "epoch": 0.8622922134733159,
2738
+ "grad_norm": 1.3024218082427979,
2739
+ "learning_rate": 9.598845388428957e-06,
2740
+ "loss": 0.1945,
2741
+ "step": 3850
2742
+ },
2743
+ {
2744
+ "epoch": 0.8645319335083115,
2745
+ "grad_norm": 3.875014066696167,
2746
+ "learning_rate": 9.59500041930263e-06,
2747
+ "loss": 0.2177,
2748
+ "step": 3860
2749
+ },
2750
+ {
2751
+ "epoch": 0.8667716535433071,
2752
+ "grad_norm": 6.025304317474365,
2753
+ "learning_rate": 9.591137889007843e-06,
2754
+ "loss": 0.2087,
2755
+ "step": 3870
2756
+ },
2757
+ {
2758
+ "epoch": 0.8690113735783027,
2759
+ "grad_norm": 1.9392833709716797,
2760
+ "learning_rate": 9.587257812306417e-06,
2761
+ "loss": 0.2315,
2762
+ "step": 3880
2763
+ },
2764
+ {
2765
+ "epoch": 0.8712510936132983,
2766
+ "grad_norm": 1.3517054319381714,
2767
+ "learning_rate": 9.583360204027224e-06,
2768
+ "loss": 0.2164,
2769
+ "step": 3890
2770
+ },
2771
+ {
2772
+ "epoch": 0.873490813648294,
2773
+ "grad_norm": 2.300626039505005,
2774
+ "learning_rate": 9.579445079066136e-06,
2775
+ "loss": 0.2034,
2776
+ "step": 3900
2777
+ },
2778
+ {
2779
+ "epoch": 0.8757305336832896,
2780
+ "grad_norm": 6.352007865905762,
2781
+ "learning_rate": 9.57551245238598e-06,
2782
+ "loss": 0.2157,
2783
+ "step": 3910
2784
+ },
2785
+ {
2786
+ "epoch": 0.8779702537182852,
2787
+ "grad_norm": 8.510796546936035,
2788
+ "learning_rate": 9.571562339016463e-06,
2789
+ "loss": 0.2137,
2790
+ "step": 3920
2791
+ },
2792
+ {
2793
+ "epoch": 0.8802099737532808,
2794
+ "grad_norm": 1.9818949699401855,
2795
+ "learning_rate": 9.567594754054122e-06,
2796
+ "loss": 0.2188,
2797
+ "step": 3930
2798
+ },
2799
+ {
2800
+ "epoch": 0.8824496937882764,
2801
+ "grad_norm": 5.979051113128662,
2802
+ "learning_rate": 9.563609712662274e-06,
2803
+ "loss": 0.2103,
2804
+ "step": 3940
2805
+ },
2806
+ {
2807
+ "epoch": 0.884689413823272,
2808
+ "grad_norm": 4.557352066040039,
2809
+ "learning_rate": 9.559607230070943e-06,
2810
+ "loss": 0.2065,
2811
+ "step": 3950
2812
+ },
2813
+ {
2814
+ "epoch": 0.8869291338582678,
2815
+ "grad_norm": 1.7298095226287842,
2816
+ "learning_rate": 9.555587321576816e-06,
2817
+ "loss": 0.2199,
2818
+ "step": 3960
2819
+ },
2820
+ {
2821
+ "epoch": 0.8891688538932634,
2822
+ "grad_norm": 7.868046760559082,
2823
+ "learning_rate": 9.551550002543172e-06,
2824
+ "loss": 0.2195,
2825
+ "step": 3970
2826
+ },
2827
+ {
2828
+ "epoch": 0.891408573928259,
2829
+ "grad_norm": 4.001582145690918,
2830
+ "learning_rate": 9.547495288399837e-06,
2831
+ "loss": 0.2244,
2832
+ "step": 3980
2833
+ },
2834
+ {
2835
+ "epoch": 0.8936482939632546,
2836
+ "grad_norm": 5.262408256530762,
2837
+ "learning_rate": 9.543423194643113e-06,
2838
+ "loss": 0.21,
2839
+ "step": 3990
2840
+ },
2841
+ {
2842
+ "epoch": 0.8958880139982502,
2843
+ "grad_norm": 1.7287728786468506,
2844
+ "learning_rate": 9.539333736835723e-06,
2845
+ "loss": 0.2081,
2846
+ "step": 4000
2847
+ },
2848
+ {
2849
+ "epoch": 0.8958880139982502,
2850
+ "eval_loss": 0.21423038840293884,
2851
+ "eval_runtime": 508.214,
2852
+ "eval_samples_per_second": 236.743,
2853
+ "eval_steps_per_second": 14.797,
2854
+ "eval_token_accuracy": 0.7443897795757539,
2855
+ "step": 4000
2856
+ }
2857
+ ],
2858
+ "logging_steps": 10,
2859
+ "max_steps": 17856,
2860
+ "num_input_tokens_seen": 0,
2861
+ "num_train_epochs": 4,
2862
+ "save_steps": 1000,
2863
+ "stateful_callbacks": {
2864
+ "TrainerControl": {
2865
+ "args": {
2866
+ "should_epoch_stop": false,
2867
+ "should_evaluate": false,
2868
+ "should_log": false,
2869
+ "should_save": true,
2870
+ "should_training_stop": false
2871
+ },
2872
+ "attributes": {}
2873
+ }
2874
+ },
2875
+ "total_flos": 2.8984694308677353e+19,
2876
+ "train_batch_size": 2,
2877
+ "trial_name": null,
2878
+ "trial_params": null
2879
+ }
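Note for readers inspecting this checkpoint: trainer_state.json is the running log the Hugging Face Trainer writes alongside each save. Its log_history holds one entry per logging_steps (10) with loss, learning_rate, and grad_norm, plus an evaluation entry at each eval point (here step 4000, eval_loss 0.2142). A minimal sketch for pulling those curves back out of the file, assuming it has been downloaded locally (the file path is an assumption, not part of this commit):

import json

# Load the Trainer's checkpoint log (local path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"last logged step: {train_logs[-1]['step']}, loss: {train_logs[-1]['loss']}")
for e in eval_logs:
    print(f"step {e['step']}: eval_loss={e['eval_loss']}, "
          f"token_accuracy={e.get('eval_token_accuracy')}")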
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22f9ce7a88c83e56706349177602a9159f64407953ba60193107c05d465599cd
+ size 6712
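Unlike trainer_state.json, training_args.bin is not JSON: it is the Trainer's TrainingArguments object serialized with torch.save, stored here as a Git LFS pointer. A hedged sketch for restoring it after download; this is a pickle, so only load it from repositories you trust, and on recent PyTorch releases (where weights_only defaults to True) the flag below is required:

import torch

# Unpickle the saved TrainingArguments (trusted source assumed;
# the local path is an assumption).
args = torch.load("training_args.bin", weights_only=False)
print(args.num_train_epochs, args.save_steps, args.logging_steps)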
vocab.json ADDED
The diff for this file is too large to render. See raw diff