masani committed on
Commit
54ddbb0
·
verified ·
1 Parent(s): 22cc27f

End of training

Browse files
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  base_model: openai-community/gpt2-xl
3
  library_name: transformers
4
- model_name: 'gpt2-xl-gsm8k-epoch6-acc0-1. Always '
5
  tags:
6
  - generated_from_trainer
7
  - trl
@@ -9,7 +9,7 @@ tags:
9
  licence: license
10
  ---
11
 
12
- # Model Card for gpt2-xl-gsm8k-epoch6-acc0-1. Always
13
 
14
  This model is a fine-tuned version of [openai-community/gpt2-xl](https://huggingface.co/openai-community/gpt2-xl).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
 
1
  ---
2
  base_model: openai-community/gpt2-xl
3
  library_name: transformers
4
+ model_name: 'gpt2-xl-gsm8k-epoch7-acc0-1. Always '
5
  tags:
6
  - generated_from_trainer
7
  - trl
 
9
  licence: license
10
  ---
11
 
12
+ # Model Card for gpt2-xl-gsm8k-epoch7-acc0-1. Always
13
 
14
  This model is a fine-tuned version of [openai-community/gpt2-xl](https://huggingface.co/openai-community/gpt2-xl).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
epoch7/config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "openai-community/gpt2-xl",
3
+ "activation_function": "gelu_new",
4
+ "architectures": [
5
+ "GPT2LMHeadModel"
6
+ ],
7
+ "attn_pdrop": 0.1,
8
+ "bos_token_id": 50256,
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 1600,
16
+ "n_head": 25,
17
+ "n_inner": null,
18
+ "n_layer": 48,
19
+ "n_positions": 1024,
20
+ "output_past": true,
21
+ "reorder_and_upcast_attn": false,
22
+ "resid_pdrop": 0.1,
23
+ "scale_attn_by_inverse_layer_idx": false,
24
+ "scale_attn_weights": true,
25
+ "summary_activation": null,
26
+ "summary_first_dropout": 0.1,
27
+ "summary_proj_to_labels": true,
28
+ "summary_type": "cls_index",
29
+ "summary_use_proj": true,
30
+ "task_specific_params": {
31
+ "text-generation": {
32
+ "do_sample": true,
33
+ "max_length": 50
34
+ }
35
+ },
36
+ "torch_dtype": "float32",
37
+ "transformers_version": "4.49.0",
38
+ "use_cache": true,
39
+ "vocab_size": 50257
40
+ }
epoch7/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.49.0"
6
+ }
epoch7/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
epoch7/model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:589c8bccadace8f954fb735e4e807a86ece90b299cce3bf320443cb98537f35b
3
+ size 4959881464
epoch7/model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a790ae2383014a9516b95d5c9fd5ca3b95cd87d76a6ae661bab82e4d2fcabf98
3
+ size 1270624096
epoch7/model.safetensors.index.json ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 6230444800
4
+ },
5
+ "weight_map": {
6
+ "transformer.h.0.attn.c_attn.bias": "model-00001-of-00002.safetensors",
7
+ "transformer.h.0.attn.c_attn.weight": "model-00001-of-00002.safetensors",
8
+ "transformer.h.0.attn.c_proj.bias": "model-00001-of-00002.safetensors",
9
+ "transformer.h.0.attn.c_proj.weight": "model-00001-of-00002.safetensors",
10
+ "transformer.h.0.ln_1.bias": "model-00001-of-00002.safetensors",
11
+ "transformer.h.0.ln_1.weight": "model-00001-of-00002.safetensors",
12
+ "transformer.h.0.ln_2.bias": "model-00001-of-00002.safetensors",
13
+ "transformer.h.0.ln_2.weight": "model-00001-of-00002.safetensors",
14
+ "transformer.h.0.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
15
+ "transformer.h.0.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
16
+ "transformer.h.0.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
17
+ "transformer.h.0.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
18
+ "transformer.h.1.attn.c_attn.bias": "model-00001-of-00002.safetensors",
19
+ "transformer.h.1.attn.c_attn.weight": "model-00001-of-00002.safetensors",
20
+ "transformer.h.1.attn.c_proj.bias": "model-00001-of-00002.safetensors",
21
+ "transformer.h.1.attn.c_proj.weight": "model-00001-of-00002.safetensors",
22
+ "transformer.h.1.ln_1.bias": "model-00001-of-00002.safetensors",
23
+ "transformer.h.1.ln_1.weight": "model-00001-of-00002.safetensors",
24
+ "transformer.h.1.ln_2.bias": "model-00001-of-00002.safetensors",
25
+ "transformer.h.1.ln_2.weight": "model-00001-of-00002.safetensors",
26
+ "transformer.h.1.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
27
+ "transformer.h.1.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
28
+ "transformer.h.1.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
29
+ "transformer.h.1.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
30
+ "transformer.h.10.attn.c_attn.bias": "model-00001-of-00002.safetensors",
31
+ "transformer.h.10.attn.c_attn.weight": "model-00001-of-00002.safetensors",
32
+ "transformer.h.10.attn.c_proj.bias": "model-00001-of-00002.safetensors",
33
+ "transformer.h.10.attn.c_proj.weight": "model-00001-of-00002.safetensors",
34
+ "transformer.h.10.ln_1.bias": "model-00001-of-00002.safetensors",
35
+ "transformer.h.10.ln_1.weight": "model-00001-of-00002.safetensors",
36
+ "transformer.h.10.ln_2.bias": "model-00001-of-00002.safetensors",
37
+ "transformer.h.10.ln_2.weight": "model-00001-of-00002.safetensors",
38
+ "transformer.h.10.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
39
+ "transformer.h.10.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
40
+ "transformer.h.10.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
41
+ "transformer.h.10.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
42
+ "transformer.h.11.attn.c_attn.bias": "model-00001-of-00002.safetensors",
43
+ "transformer.h.11.attn.c_attn.weight": "model-00001-of-00002.safetensors",
44
+ "transformer.h.11.attn.c_proj.bias": "model-00001-of-00002.safetensors",
45
+ "transformer.h.11.attn.c_proj.weight": "model-00001-of-00002.safetensors",
46
+ "transformer.h.11.ln_1.bias": "model-00001-of-00002.safetensors",
47
+ "transformer.h.11.ln_1.weight": "model-00001-of-00002.safetensors",
48
+ "transformer.h.11.ln_2.bias": "model-00001-of-00002.safetensors",
49
+ "transformer.h.11.ln_2.weight": "model-00001-of-00002.safetensors",
50
+ "transformer.h.11.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
51
+ "transformer.h.11.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
52
+ "transformer.h.11.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
53
+ "transformer.h.11.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
54
+ "transformer.h.12.attn.c_attn.bias": "model-00001-of-00002.safetensors",
55
+ "transformer.h.12.attn.c_attn.weight": "model-00001-of-00002.safetensors",
56
+ "transformer.h.12.attn.c_proj.bias": "model-00001-of-00002.safetensors",
57
+ "transformer.h.12.attn.c_proj.weight": "model-00001-of-00002.safetensors",
58
+ "transformer.h.12.ln_1.bias": "model-00001-of-00002.safetensors",
59
+ "transformer.h.12.ln_1.weight": "model-00001-of-00002.safetensors",
60
+ "transformer.h.12.ln_2.bias": "model-00001-of-00002.safetensors",
61
+ "transformer.h.12.ln_2.weight": "model-00001-of-00002.safetensors",
62
+ "transformer.h.12.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
63
+ "transformer.h.12.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
64
+ "transformer.h.12.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
65
+ "transformer.h.12.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
66
+ "transformer.h.13.attn.c_attn.bias": "model-00001-of-00002.safetensors",
67
+ "transformer.h.13.attn.c_attn.weight": "model-00001-of-00002.safetensors",
68
+ "transformer.h.13.attn.c_proj.bias": "model-00001-of-00002.safetensors",
69
+ "transformer.h.13.attn.c_proj.weight": "model-00001-of-00002.safetensors",
70
+ "transformer.h.13.ln_1.bias": "model-00001-of-00002.safetensors",
71
+ "transformer.h.13.ln_1.weight": "model-00001-of-00002.safetensors",
72
+ "transformer.h.13.ln_2.bias": "model-00001-of-00002.safetensors",
73
+ "transformer.h.13.ln_2.weight": "model-00001-of-00002.safetensors",
74
+ "transformer.h.13.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
75
+ "transformer.h.13.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
76
+ "transformer.h.13.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
77
+ "transformer.h.13.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
78
+ "transformer.h.14.attn.c_attn.bias": "model-00001-of-00002.safetensors",
79
+ "transformer.h.14.attn.c_attn.weight": "model-00001-of-00002.safetensors",
80
+ "transformer.h.14.attn.c_proj.bias": "model-00001-of-00002.safetensors",
81
+ "transformer.h.14.attn.c_proj.weight": "model-00001-of-00002.safetensors",
82
+ "transformer.h.14.ln_1.bias": "model-00001-of-00002.safetensors",
83
+ "transformer.h.14.ln_1.weight": "model-00001-of-00002.safetensors",
84
+ "transformer.h.14.ln_2.bias": "model-00001-of-00002.safetensors",
85
+ "transformer.h.14.ln_2.weight": "model-00001-of-00002.safetensors",
86
+ "transformer.h.14.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
87
+ "transformer.h.14.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
88
+ "transformer.h.14.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
89
+ "transformer.h.14.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
90
+ "transformer.h.15.attn.c_attn.bias": "model-00001-of-00002.safetensors",
91
+ "transformer.h.15.attn.c_attn.weight": "model-00001-of-00002.safetensors",
92
+ "transformer.h.15.attn.c_proj.bias": "model-00001-of-00002.safetensors",
93
+ "transformer.h.15.attn.c_proj.weight": "model-00001-of-00002.safetensors",
94
+ "transformer.h.15.ln_1.bias": "model-00001-of-00002.safetensors",
95
+ "transformer.h.15.ln_1.weight": "model-00001-of-00002.safetensors",
96
+ "transformer.h.15.ln_2.bias": "model-00001-of-00002.safetensors",
97
+ "transformer.h.15.ln_2.weight": "model-00001-of-00002.safetensors",
98
+ "transformer.h.15.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
99
+ "transformer.h.15.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
100
+ "transformer.h.15.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
101
+ "transformer.h.15.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
102
+ "transformer.h.16.attn.c_attn.bias": "model-00001-of-00002.safetensors",
103
+ "transformer.h.16.attn.c_attn.weight": "model-00001-of-00002.safetensors",
104
+ "transformer.h.16.attn.c_proj.bias": "model-00001-of-00002.safetensors",
105
+ "transformer.h.16.attn.c_proj.weight": "model-00001-of-00002.safetensors",
106
+ "transformer.h.16.ln_1.bias": "model-00001-of-00002.safetensors",
107
+ "transformer.h.16.ln_1.weight": "model-00001-of-00002.safetensors",
108
+ "transformer.h.16.ln_2.bias": "model-00001-of-00002.safetensors",
109
+ "transformer.h.16.ln_2.weight": "model-00001-of-00002.safetensors",
110
+ "transformer.h.16.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
111
+ "transformer.h.16.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
112
+ "transformer.h.16.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
113
+ "transformer.h.16.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
114
+ "transformer.h.17.attn.c_attn.bias": "model-00001-of-00002.safetensors",
115
+ "transformer.h.17.attn.c_attn.weight": "model-00001-of-00002.safetensors",
116
+ "transformer.h.17.attn.c_proj.bias": "model-00001-of-00002.safetensors",
117
+ "transformer.h.17.attn.c_proj.weight": "model-00001-of-00002.safetensors",
118
+ "transformer.h.17.ln_1.bias": "model-00001-of-00002.safetensors",
119
+ "transformer.h.17.ln_1.weight": "model-00001-of-00002.safetensors",
120
+ "transformer.h.17.ln_2.bias": "model-00001-of-00002.safetensors",
121
+ "transformer.h.17.ln_2.weight": "model-00001-of-00002.safetensors",
122
+ "transformer.h.17.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
123
+ "transformer.h.17.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
124
+ "transformer.h.17.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
125
+ "transformer.h.17.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
126
+ "transformer.h.18.attn.c_attn.bias": "model-00001-of-00002.safetensors",
127
+ "transformer.h.18.attn.c_attn.weight": "model-00001-of-00002.safetensors",
128
+ "transformer.h.18.attn.c_proj.bias": "model-00001-of-00002.safetensors",
129
+ "transformer.h.18.attn.c_proj.weight": "model-00001-of-00002.safetensors",
130
+ "transformer.h.18.ln_1.bias": "model-00001-of-00002.safetensors",
131
+ "transformer.h.18.ln_1.weight": "model-00001-of-00002.safetensors",
132
+ "transformer.h.18.ln_2.bias": "model-00001-of-00002.safetensors",
133
+ "transformer.h.18.ln_2.weight": "model-00001-of-00002.safetensors",
134
+ "transformer.h.18.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
135
+ "transformer.h.18.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
136
+ "transformer.h.18.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
137
+ "transformer.h.18.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
138
+ "transformer.h.19.attn.c_attn.bias": "model-00001-of-00002.safetensors",
139
+ "transformer.h.19.attn.c_attn.weight": "model-00001-of-00002.safetensors",
140
+ "transformer.h.19.attn.c_proj.bias": "model-00001-of-00002.safetensors",
141
+ "transformer.h.19.attn.c_proj.weight": "model-00001-of-00002.safetensors",
142
+ "transformer.h.19.ln_1.bias": "model-00001-of-00002.safetensors",
143
+ "transformer.h.19.ln_1.weight": "model-00001-of-00002.safetensors",
144
+ "transformer.h.19.ln_2.bias": "model-00001-of-00002.safetensors",
145
+ "transformer.h.19.ln_2.weight": "model-00001-of-00002.safetensors",
146
+ "transformer.h.19.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
147
+ "transformer.h.19.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
148
+ "transformer.h.19.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
149
+ "transformer.h.19.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
150
+ "transformer.h.2.attn.c_attn.bias": "model-00001-of-00002.safetensors",
151
+ "transformer.h.2.attn.c_attn.weight": "model-00001-of-00002.safetensors",
152
+ "transformer.h.2.attn.c_proj.bias": "model-00001-of-00002.safetensors",
153
+ "transformer.h.2.attn.c_proj.weight": "model-00001-of-00002.safetensors",
154
+ "transformer.h.2.ln_1.bias": "model-00001-of-00002.safetensors",
155
+ "transformer.h.2.ln_1.weight": "model-00001-of-00002.safetensors",
156
+ "transformer.h.2.ln_2.bias": "model-00001-of-00002.safetensors",
157
+ "transformer.h.2.ln_2.weight": "model-00001-of-00002.safetensors",
158
+ "transformer.h.2.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
159
+ "transformer.h.2.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
160
+ "transformer.h.2.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
161
+ "transformer.h.2.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
162
+ "transformer.h.20.attn.c_attn.bias": "model-00001-of-00002.safetensors",
163
+ "transformer.h.20.attn.c_attn.weight": "model-00001-of-00002.safetensors",
164
+ "transformer.h.20.attn.c_proj.bias": "model-00001-of-00002.safetensors",
165
+ "transformer.h.20.attn.c_proj.weight": "model-00001-of-00002.safetensors",
166
+ "transformer.h.20.ln_1.bias": "model-00001-of-00002.safetensors",
167
+ "transformer.h.20.ln_1.weight": "model-00001-of-00002.safetensors",
168
+ "transformer.h.20.ln_2.bias": "model-00001-of-00002.safetensors",
169
+ "transformer.h.20.ln_2.weight": "model-00001-of-00002.safetensors",
170
+ "transformer.h.20.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
171
+ "transformer.h.20.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
172
+ "transformer.h.20.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
173
+ "transformer.h.20.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
174
+ "transformer.h.21.attn.c_attn.bias": "model-00001-of-00002.safetensors",
175
+ "transformer.h.21.attn.c_attn.weight": "model-00001-of-00002.safetensors",
176
+ "transformer.h.21.attn.c_proj.bias": "model-00001-of-00002.safetensors",
177
+ "transformer.h.21.attn.c_proj.weight": "model-00001-of-00002.safetensors",
178
+ "transformer.h.21.ln_1.bias": "model-00001-of-00002.safetensors",
179
+ "transformer.h.21.ln_1.weight": "model-00001-of-00002.safetensors",
180
+ "transformer.h.21.ln_2.bias": "model-00001-of-00002.safetensors",
181
+ "transformer.h.21.ln_2.weight": "model-00001-of-00002.safetensors",
182
+ "transformer.h.21.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
183
+ "transformer.h.21.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
184
+ "transformer.h.21.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
185
+ "transformer.h.21.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
186
+ "transformer.h.22.attn.c_attn.bias": "model-00001-of-00002.safetensors",
187
+ "transformer.h.22.attn.c_attn.weight": "model-00001-of-00002.safetensors",
188
+ "transformer.h.22.attn.c_proj.bias": "model-00001-of-00002.safetensors",
189
+ "transformer.h.22.attn.c_proj.weight": "model-00001-of-00002.safetensors",
190
+ "transformer.h.22.ln_1.bias": "model-00001-of-00002.safetensors",
191
+ "transformer.h.22.ln_1.weight": "model-00001-of-00002.safetensors",
192
+ "transformer.h.22.ln_2.bias": "model-00001-of-00002.safetensors",
193
+ "transformer.h.22.ln_2.weight": "model-00001-of-00002.safetensors",
194
+ "transformer.h.22.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
195
+ "transformer.h.22.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
196
+ "transformer.h.22.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
197
+ "transformer.h.22.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
198
+ "transformer.h.23.attn.c_attn.bias": "model-00001-of-00002.safetensors",
199
+ "transformer.h.23.attn.c_attn.weight": "model-00001-of-00002.safetensors",
200
+ "transformer.h.23.attn.c_proj.bias": "model-00001-of-00002.safetensors",
201
+ "transformer.h.23.attn.c_proj.weight": "model-00001-of-00002.safetensors",
202
+ "transformer.h.23.ln_1.bias": "model-00001-of-00002.safetensors",
203
+ "transformer.h.23.ln_1.weight": "model-00001-of-00002.safetensors",
204
+ "transformer.h.23.ln_2.bias": "model-00001-of-00002.safetensors",
205
+ "transformer.h.23.ln_2.weight": "model-00001-of-00002.safetensors",
206
+ "transformer.h.23.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
207
+ "transformer.h.23.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
208
+ "transformer.h.23.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
209
+ "transformer.h.23.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
210
+ "transformer.h.24.attn.c_attn.bias": "model-00001-of-00002.safetensors",
211
+ "transformer.h.24.attn.c_attn.weight": "model-00001-of-00002.safetensors",
212
+ "transformer.h.24.attn.c_proj.bias": "model-00001-of-00002.safetensors",
213
+ "transformer.h.24.attn.c_proj.weight": "model-00001-of-00002.safetensors",
214
+ "transformer.h.24.ln_1.bias": "model-00001-of-00002.safetensors",
215
+ "transformer.h.24.ln_1.weight": "model-00001-of-00002.safetensors",
216
+ "transformer.h.24.ln_2.bias": "model-00001-of-00002.safetensors",
217
+ "transformer.h.24.ln_2.weight": "model-00001-of-00002.safetensors",
218
+ "transformer.h.24.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
219
+ "transformer.h.24.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
220
+ "transformer.h.24.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
221
+ "transformer.h.24.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
222
+ "transformer.h.25.attn.c_attn.bias": "model-00001-of-00002.safetensors",
223
+ "transformer.h.25.attn.c_attn.weight": "model-00001-of-00002.safetensors",
224
+ "transformer.h.25.attn.c_proj.bias": "model-00001-of-00002.safetensors",
225
+ "transformer.h.25.attn.c_proj.weight": "model-00001-of-00002.safetensors",
226
+ "transformer.h.25.ln_1.bias": "model-00001-of-00002.safetensors",
227
+ "transformer.h.25.ln_1.weight": "model-00001-of-00002.safetensors",
228
+ "transformer.h.25.ln_2.bias": "model-00001-of-00002.safetensors",
229
+ "transformer.h.25.ln_2.weight": "model-00001-of-00002.safetensors",
230
+ "transformer.h.25.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
231
+ "transformer.h.25.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
232
+ "transformer.h.25.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
233
+ "transformer.h.25.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
234
+ "transformer.h.26.attn.c_attn.bias": "model-00001-of-00002.safetensors",
235
+ "transformer.h.26.attn.c_attn.weight": "model-00001-of-00002.safetensors",
236
+ "transformer.h.26.attn.c_proj.bias": "model-00001-of-00002.safetensors",
237
+ "transformer.h.26.attn.c_proj.weight": "model-00001-of-00002.safetensors",
238
+ "transformer.h.26.ln_1.bias": "model-00001-of-00002.safetensors",
239
+ "transformer.h.26.ln_1.weight": "model-00001-of-00002.safetensors",
240
+ "transformer.h.26.ln_2.bias": "model-00001-of-00002.safetensors",
241
+ "transformer.h.26.ln_2.weight": "model-00001-of-00002.safetensors",
242
+ "transformer.h.26.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
243
+ "transformer.h.26.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
244
+ "transformer.h.26.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
245
+ "transformer.h.26.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
246
+ "transformer.h.27.attn.c_attn.bias": "model-00001-of-00002.safetensors",
247
+ "transformer.h.27.attn.c_attn.weight": "model-00001-of-00002.safetensors",
248
+ "transformer.h.27.attn.c_proj.bias": "model-00001-of-00002.safetensors",
249
+ "transformer.h.27.attn.c_proj.weight": "model-00001-of-00002.safetensors",
250
+ "transformer.h.27.ln_1.bias": "model-00001-of-00002.safetensors",
251
+ "transformer.h.27.ln_1.weight": "model-00001-of-00002.safetensors",
252
+ "transformer.h.27.ln_2.bias": "model-00001-of-00002.safetensors",
253
+ "transformer.h.27.ln_2.weight": "model-00001-of-00002.safetensors",
254
+ "transformer.h.27.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
255
+ "transformer.h.27.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
256
+ "transformer.h.27.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
257
+ "transformer.h.27.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
258
+ "transformer.h.28.attn.c_attn.bias": "model-00001-of-00002.safetensors",
259
+ "transformer.h.28.attn.c_attn.weight": "model-00001-of-00002.safetensors",
260
+ "transformer.h.28.attn.c_proj.bias": "model-00001-of-00002.safetensors",
261
+ "transformer.h.28.attn.c_proj.weight": "model-00001-of-00002.safetensors",
262
+ "transformer.h.28.ln_1.bias": "model-00001-of-00002.safetensors",
263
+ "transformer.h.28.ln_1.weight": "model-00001-of-00002.safetensors",
264
+ "transformer.h.28.ln_2.bias": "model-00001-of-00002.safetensors",
265
+ "transformer.h.28.ln_2.weight": "model-00001-of-00002.safetensors",
266
+ "transformer.h.28.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
267
+ "transformer.h.28.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
268
+ "transformer.h.28.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
269
+ "transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
270
+ "transformer.h.29.attn.c_attn.bias": "model-00001-of-00002.safetensors",
271
+ "transformer.h.29.attn.c_attn.weight": "model-00001-of-00002.safetensors",
272
+ "transformer.h.29.attn.c_proj.bias": "model-00001-of-00002.safetensors",
273
+ "transformer.h.29.attn.c_proj.weight": "model-00001-of-00002.safetensors",
274
+ "transformer.h.29.ln_1.bias": "model-00001-of-00002.safetensors",
275
+ "transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
276
+ "transformer.h.29.ln_2.bias": "model-00001-of-00002.safetensors",
277
+ "transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
278
+ "transformer.h.29.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
279
+ "transformer.h.29.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
280
+ "transformer.h.29.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
281
+ "transformer.h.29.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
282
+ "transformer.h.3.attn.c_attn.bias": "model-00001-of-00002.safetensors",
283
+ "transformer.h.3.attn.c_attn.weight": "model-00001-of-00002.safetensors",
284
+ "transformer.h.3.attn.c_proj.bias": "model-00001-of-00002.safetensors",
285
+ "transformer.h.3.attn.c_proj.weight": "model-00001-of-00002.safetensors",
286
+ "transformer.h.3.ln_1.bias": "model-00001-of-00002.safetensors",
287
+ "transformer.h.3.ln_1.weight": "model-00001-of-00002.safetensors",
288
+ "transformer.h.3.ln_2.bias": "model-00001-of-00002.safetensors",
289
+ "transformer.h.3.ln_2.weight": "model-00001-of-00002.safetensors",
290
+ "transformer.h.3.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
291
+ "transformer.h.3.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
292
+ "transformer.h.3.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
293
+ "transformer.h.3.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
294
+ "transformer.h.30.attn.c_attn.bias": "model-00001-of-00002.safetensors",
295
+ "transformer.h.30.attn.c_attn.weight": "model-00001-of-00002.safetensors",
296
+ "transformer.h.30.attn.c_proj.bias": "model-00001-of-00002.safetensors",
297
+ "transformer.h.30.attn.c_proj.weight": "model-00001-of-00002.safetensors",
298
+ "transformer.h.30.ln_1.bias": "model-00001-of-00002.safetensors",
299
+ "transformer.h.30.ln_1.weight": "model-00001-of-00002.safetensors",
300
+ "transformer.h.30.ln_2.bias": "model-00001-of-00002.safetensors",
301
+ "transformer.h.30.ln_2.weight": "model-00001-of-00002.safetensors",
302
+ "transformer.h.30.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
303
+ "transformer.h.30.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
304
+ "transformer.h.30.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
305
+ "transformer.h.30.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
306
+ "transformer.h.31.attn.c_attn.bias": "model-00001-of-00002.safetensors",
307
+ "transformer.h.31.attn.c_attn.weight": "model-00001-of-00002.safetensors",
308
+ "transformer.h.31.attn.c_proj.bias": "model-00001-of-00002.safetensors",
309
+ "transformer.h.31.attn.c_proj.weight": "model-00001-of-00002.safetensors",
310
+ "transformer.h.31.ln_1.bias": "model-00001-of-00002.safetensors",
311
+ "transformer.h.31.ln_1.weight": "model-00001-of-00002.safetensors",
312
+ "transformer.h.31.ln_2.bias": "model-00001-of-00002.safetensors",
313
+ "transformer.h.31.ln_2.weight": "model-00001-of-00002.safetensors",
314
+ "transformer.h.31.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
315
+ "transformer.h.31.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
316
+ "transformer.h.31.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
317
+ "transformer.h.31.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
318
+ "transformer.h.32.attn.c_attn.bias": "model-00001-of-00002.safetensors",
319
+ "transformer.h.32.attn.c_attn.weight": "model-00001-of-00002.safetensors",
320
+ "transformer.h.32.attn.c_proj.bias": "model-00001-of-00002.safetensors",
321
+ "transformer.h.32.attn.c_proj.weight": "model-00001-of-00002.safetensors",
322
+ "transformer.h.32.ln_1.bias": "model-00001-of-00002.safetensors",
323
+ "transformer.h.32.ln_1.weight": "model-00001-of-00002.safetensors",
324
+ "transformer.h.32.ln_2.bias": "model-00001-of-00002.safetensors",
325
+ "transformer.h.32.ln_2.weight": "model-00001-of-00002.safetensors",
326
+ "transformer.h.32.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
327
+ "transformer.h.32.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
328
+ "transformer.h.32.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
329
+ "transformer.h.32.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
330
+ "transformer.h.33.attn.c_attn.bias": "model-00001-of-00002.safetensors",
331
+ "transformer.h.33.attn.c_attn.weight": "model-00001-of-00002.safetensors",
332
+ "transformer.h.33.attn.c_proj.bias": "model-00001-of-00002.safetensors",
333
+ "transformer.h.33.attn.c_proj.weight": "model-00001-of-00002.safetensors",
334
+ "transformer.h.33.ln_1.bias": "model-00001-of-00002.safetensors",
335
+ "transformer.h.33.ln_1.weight": "model-00001-of-00002.safetensors",
336
+ "transformer.h.33.ln_2.bias": "model-00001-of-00002.safetensors",
337
+ "transformer.h.33.ln_2.weight": "model-00001-of-00002.safetensors",
338
+ "transformer.h.33.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
339
+ "transformer.h.33.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
340
+ "transformer.h.33.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
341
+ "transformer.h.33.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
342
+ "transformer.h.34.attn.c_attn.bias": "model-00001-of-00002.safetensors",
343
+ "transformer.h.34.attn.c_attn.weight": "model-00001-of-00002.safetensors",
344
+ "transformer.h.34.attn.c_proj.bias": "model-00001-of-00002.safetensors",
345
+ "transformer.h.34.attn.c_proj.weight": "model-00001-of-00002.safetensors",
346
+ "transformer.h.34.ln_1.bias": "model-00001-of-00002.safetensors",
347
+ "transformer.h.34.ln_1.weight": "model-00001-of-00002.safetensors",
348
+ "transformer.h.34.ln_2.bias": "model-00001-of-00002.safetensors",
349
+ "transformer.h.34.ln_2.weight": "model-00001-of-00002.safetensors",
350
+ "transformer.h.34.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
351
+ "transformer.h.34.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
352
+ "transformer.h.34.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
353
+ "transformer.h.34.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
354
+ "transformer.h.35.attn.c_attn.bias": "model-00001-of-00002.safetensors",
355
+ "transformer.h.35.attn.c_attn.weight": "model-00001-of-00002.safetensors",
356
+ "transformer.h.35.attn.c_proj.bias": "model-00001-of-00002.safetensors",
357
+ "transformer.h.35.attn.c_proj.weight": "model-00001-of-00002.safetensors",
358
+ "transformer.h.35.ln_1.bias": "model-00001-of-00002.safetensors",
359
+ "transformer.h.35.ln_1.weight": "model-00001-of-00002.safetensors",
360
+ "transformer.h.35.ln_2.bias": "model-00001-of-00002.safetensors",
361
+ "transformer.h.35.ln_2.weight": "model-00001-of-00002.safetensors",
362
+ "transformer.h.35.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
363
+ "transformer.h.35.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
364
+ "transformer.h.35.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
365
+ "transformer.h.35.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
366
+ "transformer.h.36.attn.c_attn.bias": "model-00001-of-00002.safetensors",
367
+ "transformer.h.36.attn.c_attn.weight": "model-00001-of-00002.safetensors",
368
+ "transformer.h.36.attn.c_proj.bias": "model-00001-of-00002.safetensors",
369
+ "transformer.h.36.attn.c_proj.weight": "model-00001-of-00002.safetensors",
370
+ "transformer.h.36.ln_1.bias": "model-00001-of-00002.safetensors",
371
+ "transformer.h.36.ln_1.weight": "model-00001-of-00002.safetensors",
372
+ "transformer.h.36.ln_2.bias": "model-00001-of-00002.safetensors",
373
+ "transformer.h.36.ln_2.weight": "model-00001-of-00002.safetensors",
374
+ "transformer.h.36.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
375
+ "transformer.h.36.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
376
+ "transformer.h.36.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
377
+ "transformer.h.36.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
378
+ "transformer.h.37.attn.c_attn.bias": "model-00001-of-00002.safetensors",
379
+ "transformer.h.37.attn.c_attn.weight": "model-00001-of-00002.safetensors",
380
+ "transformer.h.37.attn.c_proj.bias": "model-00001-of-00002.safetensors",
381
+ "transformer.h.37.attn.c_proj.weight": "model-00001-of-00002.safetensors",
382
+ "transformer.h.37.ln_1.bias": "model-00001-of-00002.safetensors",
383
+ "transformer.h.37.ln_1.weight": "model-00001-of-00002.safetensors",
384
+ "transformer.h.37.ln_2.bias": "model-00001-of-00002.safetensors",
385
+ "transformer.h.37.ln_2.weight": "model-00001-of-00002.safetensors",
386
+ "transformer.h.37.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
387
+ "transformer.h.37.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
388
+ "transformer.h.37.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
389
+ "transformer.h.37.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
390
+ "transformer.h.38.attn.c_attn.bias": "model-00002-of-00002.safetensors",
391
+ "transformer.h.38.attn.c_attn.weight": "model-00002-of-00002.safetensors",
392
+ "transformer.h.38.attn.c_proj.bias": "model-00002-of-00002.safetensors",
393
+ "transformer.h.38.attn.c_proj.weight": "model-00002-of-00002.safetensors",
394
+ "transformer.h.38.ln_1.bias": "model-00002-of-00002.safetensors",
395
+ "transformer.h.38.ln_1.weight": "model-00002-of-00002.safetensors",
396
+ "transformer.h.38.ln_2.bias": "model-00002-of-00002.safetensors",
397
+ "transformer.h.38.ln_2.weight": "model-00002-of-00002.safetensors",
398
+ "transformer.h.38.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
399
+ "transformer.h.38.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
400
+ "transformer.h.38.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
401
+ "transformer.h.38.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
402
+ "transformer.h.39.attn.c_attn.bias": "model-00002-of-00002.safetensors",
403
+ "transformer.h.39.attn.c_attn.weight": "model-00002-of-00002.safetensors",
404
+ "transformer.h.39.attn.c_proj.bias": "model-00002-of-00002.safetensors",
405
+ "transformer.h.39.attn.c_proj.weight": "model-00002-of-00002.safetensors",
406
+ "transformer.h.39.ln_1.bias": "model-00002-of-00002.safetensors",
407
+ "transformer.h.39.ln_1.weight": "model-00002-of-00002.safetensors",
408
+ "transformer.h.39.ln_2.bias": "model-00002-of-00002.safetensors",
409
+ "transformer.h.39.ln_2.weight": "model-00002-of-00002.safetensors",
410
+ "transformer.h.39.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
411
+ "transformer.h.39.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
412
+ "transformer.h.39.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
413
+ "transformer.h.39.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
414
+ "transformer.h.4.attn.c_attn.bias": "model-00001-of-00002.safetensors",
415
+ "transformer.h.4.attn.c_attn.weight": "model-00001-of-00002.safetensors",
416
+ "transformer.h.4.attn.c_proj.bias": "model-00001-of-00002.safetensors",
417
+ "transformer.h.4.attn.c_proj.weight": "model-00001-of-00002.safetensors",
418
+ "transformer.h.4.ln_1.bias": "model-00001-of-00002.safetensors",
419
+ "transformer.h.4.ln_1.weight": "model-00001-of-00002.safetensors",
420
+ "transformer.h.4.ln_2.bias": "model-00001-of-00002.safetensors",
421
+ "transformer.h.4.ln_2.weight": "model-00001-of-00002.safetensors",
422
+ "transformer.h.4.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
423
+ "transformer.h.4.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
424
+ "transformer.h.4.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
425
+ "transformer.h.4.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
426
+ "transformer.h.40.attn.c_attn.bias": "model-00002-of-00002.safetensors",
427
+ "transformer.h.40.attn.c_attn.weight": "model-00002-of-00002.safetensors",
428
+ "transformer.h.40.attn.c_proj.bias": "model-00002-of-00002.safetensors",
429
+ "transformer.h.40.attn.c_proj.weight": "model-00002-of-00002.safetensors",
430
+ "transformer.h.40.ln_1.bias": "model-00002-of-00002.safetensors",
431
+ "transformer.h.40.ln_1.weight": "model-00002-of-00002.safetensors",
432
+ "transformer.h.40.ln_2.bias": "model-00002-of-00002.safetensors",
433
+ "transformer.h.40.ln_2.weight": "model-00002-of-00002.safetensors",
434
+ "transformer.h.40.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
435
+ "transformer.h.40.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
436
+ "transformer.h.40.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
437
+ "transformer.h.40.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
438
+ "transformer.h.41.attn.c_attn.bias": "model-00002-of-00002.safetensors",
439
+ "transformer.h.41.attn.c_attn.weight": "model-00002-of-00002.safetensors",
440
+ "transformer.h.41.attn.c_proj.bias": "model-00002-of-00002.safetensors",
441
+ "transformer.h.41.attn.c_proj.weight": "model-00002-of-00002.safetensors",
442
+ "transformer.h.41.ln_1.bias": "model-00002-of-00002.safetensors",
443
+ "transformer.h.41.ln_1.weight": "model-00002-of-00002.safetensors",
444
+ "transformer.h.41.ln_2.bias": "model-00002-of-00002.safetensors",
445
+ "transformer.h.41.ln_2.weight": "model-00002-of-00002.safetensors",
446
+ "transformer.h.41.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
447
+ "transformer.h.41.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
448
+ "transformer.h.41.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
449
+ "transformer.h.41.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
450
+ "transformer.h.42.attn.c_attn.bias": "model-00002-of-00002.safetensors",
451
+ "transformer.h.42.attn.c_attn.weight": "model-00002-of-00002.safetensors",
452
+ "transformer.h.42.attn.c_proj.bias": "model-00002-of-00002.safetensors",
453
+ "transformer.h.42.attn.c_proj.weight": "model-00002-of-00002.safetensors",
454
+ "transformer.h.42.ln_1.bias": "model-00002-of-00002.safetensors",
455
+ "transformer.h.42.ln_1.weight": "model-00002-of-00002.safetensors",
456
+ "transformer.h.42.ln_2.bias": "model-00002-of-00002.safetensors",
457
+ "transformer.h.42.ln_2.weight": "model-00002-of-00002.safetensors",
458
+ "transformer.h.42.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
459
+ "transformer.h.42.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
460
+ "transformer.h.42.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
461
+ "transformer.h.42.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
462
+ "transformer.h.43.attn.c_attn.bias": "model-00002-of-00002.safetensors",
463
+ "transformer.h.43.attn.c_attn.weight": "model-00002-of-00002.safetensors",
464
+ "transformer.h.43.attn.c_proj.bias": "model-00002-of-00002.safetensors",
465
+ "transformer.h.43.attn.c_proj.weight": "model-00002-of-00002.safetensors",
466
+ "transformer.h.43.ln_1.bias": "model-00002-of-00002.safetensors",
467
+ "transformer.h.43.ln_1.weight": "model-00002-of-00002.safetensors",
468
+ "transformer.h.43.ln_2.bias": "model-00002-of-00002.safetensors",
469
+ "transformer.h.43.ln_2.weight": "model-00002-of-00002.safetensors",
470
+ "transformer.h.43.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
471
+ "transformer.h.43.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
472
+ "transformer.h.43.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
473
+ "transformer.h.43.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
474
+ "transformer.h.44.attn.c_attn.bias": "model-00002-of-00002.safetensors",
475
+ "transformer.h.44.attn.c_attn.weight": "model-00002-of-00002.safetensors",
476
+ "transformer.h.44.attn.c_proj.bias": "model-00002-of-00002.safetensors",
477
+ "transformer.h.44.attn.c_proj.weight": "model-00002-of-00002.safetensors",
478
+ "transformer.h.44.ln_1.bias": "model-00002-of-00002.safetensors",
479
+ "transformer.h.44.ln_1.weight": "model-00002-of-00002.safetensors",
480
+ "transformer.h.44.ln_2.bias": "model-00002-of-00002.safetensors",
481
+ "transformer.h.44.ln_2.weight": "model-00002-of-00002.safetensors",
482
+ "transformer.h.44.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
483
+ "transformer.h.44.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
484
+ "transformer.h.44.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
485
+ "transformer.h.44.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
486
+ "transformer.h.45.attn.c_attn.bias": "model-00002-of-00002.safetensors",
487
+ "transformer.h.45.attn.c_attn.weight": "model-00002-of-00002.safetensors",
488
+ "transformer.h.45.attn.c_proj.bias": "model-00002-of-00002.safetensors",
489
+ "transformer.h.45.attn.c_proj.weight": "model-00002-of-00002.safetensors",
490
+ "transformer.h.45.ln_1.bias": "model-00002-of-00002.safetensors",
491
+ "transformer.h.45.ln_1.weight": "model-00002-of-00002.safetensors",
492
+ "transformer.h.45.ln_2.bias": "model-00002-of-00002.safetensors",
493
+ "transformer.h.45.ln_2.weight": "model-00002-of-00002.safetensors",
494
+ "transformer.h.45.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
495
+ "transformer.h.45.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
496
+ "transformer.h.45.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
497
+ "transformer.h.45.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
498
+ "transformer.h.46.attn.c_attn.bias": "model-00002-of-00002.safetensors",
499
+ "transformer.h.46.attn.c_attn.weight": "model-00002-of-00002.safetensors",
500
+ "transformer.h.46.attn.c_proj.bias": "model-00002-of-00002.safetensors",
501
+ "transformer.h.46.attn.c_proj.weight": "model-00002-of-00002.safetensors",
502
+ "transformer.h.46.ln_1.bias": "model-00002-of-00002.safetensors",
503
+ "transformer.h.46.ln_1.weight": "model-00002-of-00002.safetensors",
504
+ "transformer.h.46.ln_2.bias": "model-00002-of-00002.safetensors",
505
+ "transformer.h.46.ln_2.weight": "model-00002-of-00002.safetensors",
506
+ "transformer.h.46.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
507
+ "transformer.h.46.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
508
+ "transformer.h.46.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
509
+ "transformer.h.46.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
510
+ "transformer.h.47.attn.c_attn.bias": "model-00002-of-00002.safetensors",
511
+ "transformer.h.47.attn.c_attn.weight": "model-00002-of-00002.safetensors",
512
+ "transformer.h.47.attn.c_proj.bias": "model-00002-of-00002.safetensors",
513
+ "transformer.h.47.attn.c_proj.weight": "model-00002-of-00002.safetensors",
514
+ "transformer.h.47.ln_1.bias": "model-00002-of-00002.safetensors",
515
+ "transformer.h.47.ln_1.weight": "model-00002-of-00002.safetensors",
516
+ "transformer.h.47.ln_2.bias": "model-00002-of-00002.safetensors",
517
+ "transformer.h.47.ln_2.weight": "model-00002-of-00002.safetensors",
518
+ "transformer.h.47.mlp.c_fc.bias": "model-00002-of-00002.safetensors",
519
+ "transformer.h.47.mlp.c_fc.weight": "model-00002-of-00002.safetensors",
520
+ "transformer.h.47.mlp.c_proj.bias": "model-00002-of-00002.safetensors",
521
+ "transformer.h.47.mlp.c_proj.weight": "model-00002-of-00002.safetensors",
522
+ "transformer.h.5.attn.c_attn.bias": "model-00001-of-00002.safetensors",
523
+ "transformer.h.5.attn.c_attn.weight": "model-00001-of-00002.safetensors",
524
+ "transformer.h.5.attn.c_proj.bias": "model-00001-of-00002.safetensors",
525
+ "transformer.h.5.attn.c_proj.weight": "model-00001-of-00002.safetensors",
526
+ "transformer.h.5.ln_1.bias": "model-00001-of-00002.safetensors",
527
+ "transformer.h.5.ln_1.weight": "model-00001-of-00002.safetensors",
528
+ "transformer.h.5.ln_2.bias": "model-00001-of-00002.safetensors",
529
+ "transformer.h.5.ln_2.weight": "model-00001-of-00002.safetensors",
530
+ "transformer.h.5.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
531
+ "transformer.h.5.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
532
+ "transformer.h.5.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
533
+ "transformer.h.5.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
534
+ "transformer.h.6.attn.c_attn.bias": "model-00001-of-00002.safetensors",
535
+ "transformer.h.6.attn.c_attn.weight": "model-00001-of-00002.safetensors",
536
+ "transformer.h.6.attn.c_proj.bias": "model-00001-of-00002.safetensors",
537
+ "transformer.h.6.attn.c_proj.weight": "model-00001-of-00002.safetensors",
538
+ "transformer.h.6.ln_1.bias": "model-00001-of-00002.safetensors",
539
+ "transformer.h.6.ln_1.weight": "model-00001-of-00002.safetensors",
540
+ "transformer.h.6.ln_2.bias": "model-00001-of-00002.safetensors",
541
+ "transformer.h.6.ln_2.weight": "model-00001-of-00002.safetensors",
542
+ "transformer.h.6.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
543
+ "transformer.h.6.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
544
+ "transformer.h.6.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
545
+ "transformer.h.6.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
546
+ "transformer.h.7.attn.c_attn.bias": "model-00001-of-00002.safetensors",
547
+ "transformer.h.7.attn.c_attn.weight": "model-00001-of-00002.safetensors",
548
+ "transformer.h.7.attn.c_proj.bias": "model-00001-of-00002.safetensors",
549
+ "transformer.h.7.attn.c_proj.weight": "model-00001-of-00002.safetensors",
550
+ "transformer.h.7.ln_1.bias": "model-00001-of-00002.safetensors",
551
+ "transformer.h.7.ln_1.weight": "model-00001-of-00002.safetensors",
552
+ "transformer.h.7.ln_2.bias": "model-00001-of-00002.safetensors",
553
+ "transformer.h.7.ln_2.weight": "model-00001-of-00002.safetensors",
554
+ "transformer.h.7.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
555
+ "transformer.h.7.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
556
+ "transformer.h.7.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
557
+ "transformer.h.7.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
558
+ "transformer.h.8.attn.c_attn.bias": "model-00001-of-00002.safetensors",
559
+ "transformer.h.8.attn.c_attn.weight": "model-00001-of-00002.safetensors",
560
+ "transformer.h.8.attn.c_proj.bias": "model-00001-of-00002.safetensors",
561
+ "transformer.h.8.attn.c_proj.weight": "model-00001-of-00002.safetensors",
562
+ "transformer.h.8.ln_1.bias": "model-00001-of-00002.safetensors",
563
+ "transformer.h.8.ln_1.weight": "model-00001-of-00002.safetensors",
564
+ "transformer.h.8.ln_2.bias": "model-00001-of-00002.safetensors",
565
+ "transformer.h.8.ln_2.weight": "model-00001-of-00002.safetensors",
566
+ "transformer.h.8.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
567
+ "transformer.h.8.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
568
+ "transformer.h.8.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
569
+ "transformer.h.8.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
570
+ "transformer.h.9.attn.c_attn.bias": "model-00001-of-00002.safetensors",
571
+ "transformer.h.9.attn.c_attn.weight": "model-00001-of-00002.safetensors",
572
+ "transformer.h.9.attn.c_proj.bias": "model-00001-of-00002.safetensors",
573
+ "transformer.h.9.attn.c_proj.weight": "model-00001-of-00002.safetensors",
574
+ "transformer.h.9.ln_1.bias": "model-00001-of-00002.safetensors",
575
+ "transformer.h.9.ln_1.weight": "model-00001-of-00002.safetensors",
576
+ "transformer.h.9.ln_2.bias": "model-00001-of-00002.safetensors",
577
+ "transformer.h.9.ln_2.weight": "model-00001-of-00002.safetensors",
578
+ "transformer.h.9.mlp.c_fc.bias": "model-00001-of-00002.safetensors",
579
+ "transformer.h.9.mlp.c_fc.weight": "model-00001-of-00002.safetensors",
580
+ "transformer.h.9.mlp.c_proj.bias": "model-00001-of-00002.safetensors",
581
+ "transformer.h.9.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
582
+ "transformer.ln_f.bias": "model-00002-of-00002.safetensors",
583
+ "transformer.ln_f.weight": "model-00002-of-00002.safetensors",
584
+ "transformer.wpe.weight": "model-00001-of-00002.safetensors",
585
+ "transformer.wte.weight": "model-00001-of-00002.safetensors"
586
+ }
587
+ }
epoch7/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "pad_token": "<|endoftext|>",
5
+ "unk_token": "<|endoftext|>"
6
+ }
epoch7/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
epoch7/tokenizer_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "50256": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": true,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ }
12
+ },
13
+ "bos_token": "<|endoftext|>",
14
+ "clean_up_tokenization_spaces": false,
15
+ "eos_token": "<|endoftext|>",
16
+ "extra_special_tokens": {},
17
+ "model_max_length": 1024,
18
+ "pad_token": "<|endoftext|>",
19
+ "padding_side": "left",
20
+ "tokenizer_class": "GPT2Tokenizer",
21
+ "unk_token": "<|endoftext|>"
22
+ }
epoch7/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f14f3c6d299bfb369a2106aab54c59a032e03c366e1d1fecdcf02f954b66a25b
3
+ size 5624
epoch7/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:605b76884d2cfefba809452bd2fd82d636ea92f32dd088283d8d98310914040a
3
  size 4959881464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:589c8bccadace8f954fb735e4e807a86ece90b299cce3bf320443cb98537f35b
3
  size 4959881464
model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb78572d997fffd032abcb5b2395190b1879797adf5a8258fa7fe963e4f8fa52
3
  size 1270624096
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a790ae2383014a9516b95d5c9fd5ca3b95cd87d76a6ae661bab82e4d2fcabf98
3
  size 1270624096
wandb/run-20250402_145246-e1n3xkh6/files/output.log CHANGED
@@ -619,8 +619,107 @@ Upload 5 LFS files: 60%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ | 3/5 [03:10<01:50, 55.26s/it]
619
  {'loss': 0.1045, 'grad_norm': 3.081059217453003, 'learning_rate': 8.000000000000001e-06, 'mean_token_accuracy': 0.9669223129749298, 'epoch': 6.0}
620
  {'eval_loss': 1.0028502941131592, 'eval_runtime': 97.3663, 'eval_samples_per_second': 13.547, 'eval_steps_per_second': 0.852, 'eval_mean_token_accuracy': 0.8052722014576555, 'epoch': 6.0}
621
  Upload 5 LFS files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [04:04<00:00, 48.97s/it] :04<00:00, 27.1MB/s]
622
- model-00001-of-00002.safetensors: 28%|β–ˆβ–ˆβ–Š | 1.37G/4.96G [00:40<01:12, 49.1MB/s]
623
  model-00001-of-00002.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 4.96G/4.96G [02:22<00:00, 37.5MB/s]
624
 
625
 
626
  Upload 5 LFS files: 20%|β–ˆβ–ˆ | 1/5 [04:04<16:19, 244.85s/it]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
619
  {'loss': 0.1045, 'grad_norm': 3.081059217453003, 'learning_rate': 8.000000000000001e-06, 'mean_token_accuracy': 0.9669223129749298, 'epoch': 6.0}
620
  {'eval_loss': 1.0028502941131592, 'eval_runtime': 97.3663, 'eval_samples_per_second': 13.547, 'eval_steps_per_second': 0.852, 'eval_mean_token_accuracy': 0.8052722014576555, 'epoch': 6.0}
621
  Upload 5 LFS files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [04:04<00:00, 48.97s/it] :04<00:00, 27.1MB/s]
622
+ .96G [00:40<01:12, 49.1MB/s]
623
  model-00001-of-00002.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 4.96G/4.96G [02:22<00:00, 37.5MB/s]
624
 
625
 
626
  Upload 5 LFS files: 20%|β–ˆβ–ˆ | 1/5 [04:04<16:19, 244.85s/it]
627
+ {'loss': 0.0751, 'grad_norm': 1.2614115476608276, 'learning_rate': 7.97860962566845e-06, 'mean_token_accuracy': 0.9784071505069732, 'epoch': 6.01}
628
+ {'loss': 0.0769, 'grad_norm': 1.5462486743927002, 'learning_rate': 7.9572192513369e-06, 'mean_token_accuracy': 0.9812693119049072, 'epoch': 6.02}
629
+ {'loss': 0.0765, 'grad_norm': 1.133501410484314, 'learning_rate': 7.935828877005348e-06, 'mean_token_accuracy': 0.9798434019088745, 'epoch': 6.03}
630
+ {'loss': 0.0762, 'grad_norm': 1.4378265142440796, 'learning_rate': 7.914438502673799e-06, 'mean_token_accuracy': 0.9798154890537262, 'epoch': 6.04}
631
+ {'loss': 0.0821, 'grad_norm': 1.6636074781417847, 'learning_rate': 7.893048128342246e-06, 'mean_token_accuracy': 0.9762451410293579, 'epoch': 6.05}
632
+ {'loss': 0.0601, 'grad_norm': 1.3429431915283203, 'learning_rate': 7.871657754010695e-06, 'mean_token_accuracy': 0.983588719367981, 'epoch': 6.06}
633
+ {'loss': 0.0703, 'grad_norm': 1.3611433506011963, 'learning_rate': 7.850267379679145e-06, 'mean_token_accuracy': 0.9797813415527343, 'epoch': 6.07}
634
+ {'loss': 0.0674, 'grad_norm': 1.416050672531128, 'learning_rate': 7.828877005347594e-06, 'mean_token_accuracy': 0.9822567105293274, 'epoch': 6.09}
635
+ {'loss': 0.0774, 'grad_norm': 1.6981250047683716, 'learning_rate': 7.807486631016043e-06, 'mean_token_accuracy': 0.9763238668441773, 'epoch': 6.1}
636
+ {'loss': 0.0882, 'grad_norm': 1.4732884168624878, 'learning_rate': 7.786096256684492e-06, 'mean_token_accuracy': 0.9750996351242065, 'epoch': 6.11}
637
+ {'loss': 0.0656, 'grad_norm': 1.4530909061431885, 'learning_rate': 7.764705882352941e-06, 'mean_token_accuracy': 0.9819111764431, 'epoch': 6.12}
638
+ {'loss': 0.0684, 'grad_norm': 1.1294395923614502, 'learning_rate': 7.74331550802139e-06, 'mean_token_accuracy': 0.9803543865680695, 'epoch': 6.13}
639
+ {'loss': 0.07, 'grad_norm': 1.2950372695922852, 'learning_rate': 7.72192513368984e-06, 'mean_token_accuracy': 0.9795331358909607, 'epoch': 6.14}
640
+ {'loss': 0.0751, 'grad_norm': 1.6320174932479858, 'learning_rate': 7.70053475935829e-06, 'mean_token_accuracy': 0.9776553273200989, 'epoch': 6.15}
641
+ {'loss': 0.0789, 'grad_norm': 1.2903655767440796, 'learning_rate': 7.679144385026739e-06, 'mean_token_accuracy': 0.9776297032833099, 'epoch': 6.16}
642
+ {'loss': 0.076, 'grad_norm': 1.4791460037231445, 'learning_rate': 7.657754010695187e-06, 'mean_token_accuracy': 0.9785918891429901, 'epoch': 6.17}
643
+ {'loss': 0.0684, 'grad_norm': 1.3365899324417114, 'learning_rate': 7.636363636363638e-06, 'mean_token_accuracy': 0.9793748497962952, 'epoch': 6.18}
644
+ {'loss': 0.0784, 'grad_norm': 1.31467866897583, 'learning_rate': 7.614973262032086e-06, 'mean_token_accuracy': 0.9782681226730346, 'epoch': 6.19}
645
+ {'loss': 0.0764, 'grad_norm': 1.5984809398651123, 'learning_rate': 7.593582887700536e-06, 'mean_token_accuracy': 0.9779844224452973, 'epoch': 6.2}
646
+ {'loss': 0.0707, 'grad_norm': 1.3290109634399414, 'learning_rate': 7.572192513368984e-06, 'mean_token_accuracy': 0.979961884021759, 'epoch': 6.21}
647
+ {'loss': 0.09, 'grad_norm': 1.6741454601287842, 'learning_rate': 7.550802139037434e-06, 'mean_token_accuracy': 0.9754992425441742, 'epoch': 6.22}
648
+ {'loss': 0.0688, 'grad_norm': 1.1926649808883667, 'learning_rate': 7.529411764705883e-06, 'mean_token_accuracy': 0.980828070640564, 'epoch': 6.24}
649
+ {'loss': 0.07, 'grad_norm': 1.1557010412216187, 'learning_rate': 7.5080213903743325e-06, 'mean_token_accuracy': 0.9807213306427002, 'epoch': 6.25}
650
+ {'loss': 0.0795, 'grad_norm': 1.1448363065719604, 'learning_rate': 7.486631016042781e-06, 'mean_token_accuracy': 0.978685611486435, 'epoch': 6.26}
651
+ {'loss': 0.0736, 'grad_norm': 1.1292977333068848, 'learning_rate': 7.465240641711231e-06, 'mean_token_accuracy': 0.978838461637497, 'epoch': 6.27}
652
+ {'loss': 0.0622, 'grad_norm': 1.191189169883728, 'learning_rate': 7.44385026737968e-06, 'mean_token_accuracy': 0.9828511118888855, 'epoch': 6.28}
653
+ {'loss': 0.0706, 'grad_norm': 1.605857014656067, 'learning_rate': 7.422459893048128e-06, 'mean_token_accuracy': 0.9807406783103942, 'epoch': 6.29}
654
+ {'loss': 0.0908, 'grad_norm': 1.519134283065796, 'learning_rate': 7.401069518716578e-06, 'mean_token_accuracy': 0.97318195104599, 'epoch': 6.3}
655
+ {'loss': 0.0676, 'grad_norm': 1.4005078077316284, 'learning_rate': 7.379679144385027e-06, 'mean_token_accuracy': 0.9825226604938507, 'epoch': 6.31}
656
+ {'loss': 0.0827, 'grad_norm': 1.2273274660110474, 'learning_rate': 7.358288770053477e-06, 'mean_token_accuracy': 0.9774100840091705, 'epoch': 6.32}
657
+ {'loss': 0.0736, 'grad_norm': 1.1349341869354248, 'learning_rate': 7.3368983957219256e-06, 'mean_token_accuracy': 0.9788651049137116, 'epoch': 6.33}
658
+ {'loss': 0.0774, 'grad_norm': 1.269400954246521, 'learning_rate': 7.315508021390375e-06, 'mean_token_accuracy': 0.977211731672287, 'epoch': 6.34}
659
+ {'loss': 0.0676, 'grad_norm': 0.9149906039237976, 'learning_rate': 7.294117647058823e-06, 'mean_token_accuracy': 0.9831490278244018, 'epoch': 6.35}
660
+ {'loss': 0.0743, 'grad_norm': 1.531225562095642, 'learning_rate': 7.272727272727273e-06, 'mean_token_accuracy': 0.9783931136131286, 'epoch': 6.36}
661
+ {'loss': 0.0794, 'grad_norm': 1.5063401460647583, 'learning_rate': 7.251336898395722e-06, 'mean_token_accuracy': 0.9784937739372254, 'epoch': 6.37}
662
+ {'loss': 0.0662, 'grad_norm': 1.1360453367233276, 'learning_rate': 7.229946524064172e-06, 'mean_token_accuracy': 0.9812704741954803, 'epoch': 6.39}
663
+ {'loss': 0.0708, 'grad_norm': 1.5988572835922241, 'learning_rate': 7.208556149732621e-06, 'mean_token_accuracy': 0.981311148405075, 'epoch': 6.4}
664
+ {'loss': 0.0803, 'grad_norm': 1.5322248935699463, 'learning_rate': 7.1871657754010706e-06, 'mean_token_accuracy': 0.9763558447360993, 'epoch': 6.41}
665
+ {'loss': 0.0729, 'grad_norm': 1.542902946472168, 'learning_rate': 7.1657754010695195e-06, 'mean_token_accuracy': 0.9798608005046845, 'epoch': 6.42}
666
+ {'loss': 0.0794, 'grad_norm': 1.1680364608764648, 'learning_rate': 7.144385026737969e-06, 'mean_token_accuracy': 0.9773648381233215, 'epoch': 6.43}
667
+ {'loss': 0.0787, 'grad_norm': 1.7390414476394653, 'learning_rate': 7.122994652406417e-06, 'mean_token_accuracy': 0.9769323229789734, 'epoch': 6.44}
668
+ {'loss': 0.0715, 'grad_norm': 1.1585503816604614, 'learning_rate': 7.101604278074867e-06, 'mean_token_accuracy': 0.9798881709575653, 'epoch': 6.45}
669
+ {'loss': 0.0709, 'grad_norm': 1.2926788330078125, 'learning_rate': 7.080213903743316e-06, 'mean_token_accuracy': 0.9788601398468018, 'epoch': 6.46}
670
+ {'loss': 0.0703, 'grad_norm': 1.3362752199172974, 'learning_rate': 7.058823529411766e-06, 'mean_token_accuracy': 0.9818453311920166, 'epoch': 6.47}
671
+ {'loss': 0.0757, 'grad_norm': 1.5291205644607544, 'learning_rate': 7.037433155080215e-06, 'mean_token_accuracy': 0.9791202425956727, 'epoch': 6.48}
672
+ {'loss': 0.077, 'grad_norm': 1.100151777267456, 'learning_rate': 7.0160427807486645e-06, 'mean_token_accuracy': 0.9778393864631653, 'epoch': 6.49}
673
+ {'loss': 0.0716, 'grad_norm': 1.2845083475112915, 'learning_rate': 6.9946524064171125e-06, 'mean_token_accuracy': 0.9807174503803253, 'epoch': 6.5}
674
+ {'loss': 0.0776, 'grad_norm': 1.8058385848999023, 'learning_rate': 6.9732620320855615e-06, 'mean_token_accuracy': 0.9790221691131592, 'epoch': 6.51}
675
+ {'loss': 0.0713, 'grad_norm': 1.4914551973342896, 'learning_rate': 6.951871657754011e-06, 'mean_token_accuracy': 0.9818285644054413, 'epoch': 6.52}
676
+ {'loss': 0.0756, 'grad_norm': 1.3771636486053467, 'learning_rate': 6.93048128342246e-06, 'mean_token_accuracy': 0.9768440783023834, 'epoch': 6.53}
677
+ {'loss': 0.0752, 'grad_norm': 1.0714784860610962, 'learning_rate': 6.90909090909091e-06, 'mean_token_accuracy': 0.9785768926143646, 'epoch': 6.55}
678
+ {'loss': 0.0724, 'grad_norm': 1.1797113418579102, 'learning_rate': 6.887700534759358e-06, 'mean_token_accuracy': 0.9799900412559509, 'epoch': 6.56}
679
+ {'loss': 0.0778, 'grad_norm': 1.8257405757904053, 'learning_rate': 6.866310160427808e-06, 'mean_token_accuracy': 0.9764438331127167, 'epoch': 6.57}
680
+ {'loss': 0.0798, 'grad_norm': 2.3359692096710205, 'learning_rate': 6.844919786096257e-06, 'mean_token_accuracy': 0.9771210074424743, 'epoch': 6.58}
681
+ {'loss': 0.0704, 'grad_norm': 1.2211873531341553, 'learning_rate': 6.8235294117647065e-06, 'mean_token_accuracy': 0.9808654069900513, 'epoch': 6.59}
682
+ {'loss': 0.0833, 'grad_norm': 1.1809934377670288, 'learning_rate': 6.802139037433155e-06, 'mean_token_accuracy': 0.9761099457740784, 'epoch': 6.6}
683
+ {'loss': 0.0683, 'grad_norm': 1.1736034154891968, 'learning_rate': 6.780748663101605e-06, 'mean_token_accuracy': 0.9787311553955078, 'epoch': 6.61}
684
+ {'loss': 0.0743, 'grad_norm': 1.209066390991211, 'learning_rate': 6.759358288770054e-06, 'mean_token_accuracy': 0.9774844884872437, 'epoch': 6.62}
685
+ {'loss': 0.0744, 'grad_norm': 1.9650102853775024, 'learning_rate': 6.737967914438504e-06, 'mean_token_accuracy': 0.9783697545528411, 'epoch': 6.63}
686
+ {'loss': 0.0746, 'grad_norm': 1.22538423538208, 'learning_rate': 6.716577540106952e-06, 'mean_token_accuracy': 0.9794271349906921, 'epoch': 6.64}
687
+ {'loss': 0.0769, 'grad_norm': 1.6272855997085571, 'learning_rate': 6.695187165775402e-06, 'mean_token_accuracy': 0.9774741470813751, 'epoch': 6.65}
688
+ {'loss': 0.0692, 'grad_norm': 1.2272307872772217, 'learning_rate': 6.673796791443851e-06, 'mean_token_accuracy': 0.9774088799953461, 'epoch': 6.66}
689
+ {'loss': 0.0709, 'grad_norm': 1.611185908317566, 'learning_rate': 6.6524064171123e-06, 'mean_token_accuracy': 0.9799948036670685, 'epoch': 6.67}
690
+ {'loss': 0.0829, 'grad_norm': 1.1995888948440552, 'learning_rate': 6.631016042780749e-06, 'mean_token_accuracy': 0.9759232044219971, 'epoch': 6.68}
691
+ {'loss': 0.0803, 'grad_norm': 1.3638901710510254, 'learning_rate': 6.609625668449199e-06, 'mean_token_accuracy': 0.9769881784915924, 'epoch': 6.7}
692
+ {'loss': 0.0721, 'grad_norm': 1.2634999752044678, 'learning_rate': 6.588235294117647e-06, 'mean_token_accuracy': 0.9790867626667022, 'epoch': 6.71}
693
+ {'loss': 0.092, 'grad_norm': 1.9220893383026123, 'learning_rate': 6.566844919786097e-06, 'mean_token_accuracy': 0.9740871965885163, 'epoch': 6.72}
694
+ {'loss': 0.0852, 'grad_norm': 1.561281681060791, 'learning_rate': 6.545454545454546e-06, 'mean_token_accuracy': 0.9774147212505341, 'epoch': 6.73}
695
+ {'loss': 0.0717, 'grad_norm': 1.455639362335205, 'learning_rate': 6.524064171122996e-06, 'mean_token_accuracy': 0.9791371405124665, 'epoch': 6.74}
696
+ {'loss': 0.0708, 'grad_norm': 1.3064255714416504, 'learning_rate': 6.5026737967914445e-06, 'mean_token_accuracy': 0.9805355131626129, 'epoch': 6.75}
697
+ {'loss': 0.08, 'grad_norm': 1.8594157695770264, 'learning_rate': 6.4812834224598935e-06, 'mean_token_accuracy': 0.9780042350292206, 'epoch': 6.76}
698
+ {'loss': 0.0738, 'grad_norm': 2.009476900100708, 'learning_rate': 6.459893048128343e-06, 'mean_token_accuracy': 0.9791532456874847, 'epoch': 6.77}
699
+ {'loss': 0.0721, 'grad_norm': 1.392528772354126, 'learning_rate': 6.438502673796791e-06, 'mean_token_accuracy': 0.9799114346504212, 'epoch': 6.78}
700
+ {'loss': 0.0769, 'grad_norm': 1.2471076250076294, 'learning_rate': 6.417112299465241e-06, 'mean_token_accuracy': 0.9788577139377594, 'epoch': 6.79}
701
+ {'loss': 0.0714, 'grad_norm': 1.102941632270813, 'learning_rate': 6.39572192513369e-06, 'mean_token_accuracy': 0.9777648031711579, 'epoch': 6.8}
702
+ {'loss': 0.0702, 'grad_norm': 1.3659664392471313, 'learning_rate': 6.37433155080214e-06, 'mean_token_accuracy': 0.9804715156555176, 'epoch': 6.81}
703
+ {'loss': 0.0785, 'grad_norm': 1.4983594417572021, 'learning_rate': 6.352941176470589e-06, 'mean_token_accuracy': 0.9782240986824036, 'epoch': 6.82}
704
+ {'loss': 0.0755, 'grad_norm': 1.8034380674362183, 'learning_rate': 6.3315508021390385e-06, 'mean_token_accuracy': 0.9771873950958252, 'epoch': 6.83}
705
+ {'loss': 0.0676, 'grad_norm': 1.3483493328094482, 'learning_rate': 6.3101604278074865e-06, 'mean_token_accuracy': 0.979723185300827, 'epoch': 6.84}
706
+ {'loss': 0.0742, 'grad_norm': 1.5553311109542847, 'learning_rate': 6.288770053475936e-06, 'mean_token_accuracy': 0.9785968244075776, 'epoch': 6.86}
707
+ {'loss': 0.0696, 'grad_norm': 1.6261348724365234, 'learning_rate': 6.267379679144385e-06, 'mean_token_accuracy': 0.9814589977264404, 'epoch': 6.87}
708
+ {'loss': 0.0737, 'grad_norm': 1.3198540210723877, 'learning_rate': 6.245989304812835e-06, 'mean_token_accuracy': 0.9793163061141967, 'epoch': 6.88}
709
+ {'loss': 0.0729, 'grad_norm': 1.303162693977356, 'learning_rate': 6.224598930481284e-06, 'mean_token_accuracy': 0.9804379165172576, 'epoch': 6.89}
710
+ {'loss': 0.0818, 'grad_norm': 1.7500677108764648, 'learning_rate': 6.203208556149734e-06, 'mean_token_accuracy': 0.976429671049118, 'epoch': 6.9}
711
+ {'loss': 0.0792, 'grad_norm': 1.299370527267456, 'learning_rate': 6.181818181818182e-06, 'mean_token_accuracy': 0.9785773515701294, 'epoch': 6.91}
712
+ {'loss': 0.0612, 'grad_norm': 1.605732798576355, 'learning_rate': 6.1604278074866315e-06, 'mean_token_accuracy': 0.9837532937526703, 'epoch': 6.92}
713
+ {'loss': 0.072, 'grad_norm': 1.311928153038025, 'learning_rate': 6.1390374331550805e-06, 'mean_token_accuracy': 0.9810349762439727, 'epoch': 6.93}
714
+ {'loss': 0.0715, 'grad_norm': 1.3814005851745605, 'learning_rate': 6.11764705882353e-06, 'mean_token_accuracy': 0.9787093818187713, 'epoch': 6.94}
715
+ {'loss': 0.0762, 'grad_norm': 1.477414608001709, 'learning_rate': 6.096256684491979e-06, 'mean_token_accuracy': 0.9765677690505982, 'epoch': 6.95}
716
+ {'loss': 0.078, 'grad_norm': 1.0910104513168335, 'learning_rate': 6.074866310160429e-06, 'mean_token_accuracy': 0.9769729673862457, 'epoch': 6.96}
717
+ {'loss': 0.0647, 'grad_norm': 1.2030363082885742, 'learning_rate': 6.053475935828878e-06, 'mean_token_accuracy': 0.9822547733783722, 'epoch': 6.97}
718
+ {'loss': 0.0751, 'grad_norm': 1.7072525024414062, 'learning_rate': 6.032085561497326e-06, 'mean_token_accuracy': 0.9792082965373993, 'epoch': 6.98}
719
+ {'loss': 0.0657, 'grad_norm': 1.899641513824463, 'learning_rate': 6.010695187165776e-06, 'mean_token_accuracy': 0.981491768360138, 'epoch': 6.99}
720
+ {'eval_loss': 1.0816563367843628, 'eval_runtime': 97.3267, 'eval_samples_per_second': 13.552, 'eval_steps_per_second': 0.853, 'eval_mean_token_accuracy': 0.8144699301231991, 'epoch': 7.0}
721
+ Upload 5 LFS files: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 5/5 [05:22<00:00, 64.54s/it] :42<00:38, 15.6MB/s]
722
+ model-00001-of-00002.safetensors: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–‰| 4.96G/4.96G [05:21<00:00, 24.2MB/s]
723
+
724
+
725
+ Upload 5 LFS files: 20%|β–ˆβ–ˆ | 1/5 [05:22<21:30, 322.70s/it]
wandb/run-20250402_145246-e1n3xkh6/run-e1n3xkh6.wandb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7d5e63f2d8c41c9e7d69f18258219e0e4c9986a77722c53b4ead3db8bf1f279
3
- size 4489216
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0401b7c289100c853999761d085d818b6fedc5e89eef641e6ef61f369dfa9426
3
+ size 5537792