nileshmalpeddi commited on
Commit
799ec3c
·
verified ·
1 Parent(s): be1203d

End of training

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  base_model: LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct
3
- datasets: testing_layer_29_Matrix_mlp.c_proj
4
  library_name: transformers
5
  model_name: output_exaone_tucker_rank16_newdata_b1_g2_gradient1_e1_b512
6
  tags:
@@ -12,7 +12,7 @@ licence: license
12
 
13
  # Model Card for output_exaone_tucker_rank16_newdata_b1_g2_gradient1_e1_b512
14
 
15
- This model is a fine-tuned version of [LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct) on the [testing_layer_29_Matrix_mlp.c_proj](https://huggingface.co/datasets/testing_layer_29_Matrix_mlp.c_proj) dataset.
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
 
1
  ---
2
  base_model: LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct
3
+ datasets: testing_layer_29_Matrix_attn.attention.v_proj
4
  library_name: transformers
5
  model_name: output_exaone_tucker_rank16_newdata_b1_g2_gradient1_e1_b512
6
  tags:
 
12
 
13
  # Model Card for output_exaone_tucker_rank16_newdata_b1_g2_gradient1_e1_b512
14
 
15
+ This model is a fine-tuned version of [LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct) on the [testing_layer_29_Matrix_attn.attention.v_proj](https://huggingface.co/datasets/testing_layer_29_Matrix_attn.attention.v_proj) dataset.
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
all_results.json CHANGED
@@ -1,15 +1,15 @@
1
  {
2
- "eval_accuracy": 0.8084207240251016,
3
- "eval_loss": 1.6845703125,
4
- "eval_runtime": 933.2948,
5
  "eval_samples": 25971,
6
- "eval_samples_per_second": 27.827,
7
- "eval_steps_per_second": 13.914,
8
- "perplexity": 5.390134361325962,
9
- "total_flos": 9.396165357758054e+17,
10
- "train_loss": 1.722812063241433,
11
- "train_runtime": 12082.0087,
12
  "train_samples": 294393,
13
- "train_samples_per_second": 24.366,
14
- "train_steps_per_second": 12.183
15
  }
 
1
  {
2
+ "eval_accuracy": 0.8090923506567009,
3
+ "eval_loss": 1.6630859375,
4
+ "eval_runtime": 949.3026,
5
  "eval_samples": 25971,
6
+ "eval_samples_per_second": 27.358,
7
+ "eval_steps_per_second": 13.68,
8
+ "perplexity": 5.2755658170209605,
9
+ "total_flos": 9.323659620390011e+17,
10
+ "train_loss": 1.669835741941364,
11
+ "train_runtime": 14265.6788,
12
  "train_samples": 294393,
13
+ "train_samples_per_second": 20.636,
14
+ "train_steps_per_second": 10.318
15
  }
eval_layer_index_29_mat_attn.attention.v_proj_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eval_accuracy": 0.8090923506567009,
3
+ "eval_loss": 1.6630859375,
4
+ "eval_runtime": 949.3026,
5
+ "eval_samples": 25971,
6
+ "eval_samples_per_second": 27.358,
7
+ "eval_steps_per_second": 13.68,
8
+ "perplexity": 5.2755658170209605
9
+ }
model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3d7f8ac55165156d24fd68aee4367a9732cf51e950a3f1e760a2a14a26fae8da
3
- size 4701521288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dcf78f6714bde10266e3f9400e22a6573d0a01dbdef65f5e53d6b29dd7e1453
3
+ size 4669288872
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 5225777664
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00002-of-00002.safetensors",
@@ -202,10 +202,18 @@
202
  "transformer.h.28.mlp.c_fc_0.weight": "model-00001-of-00002.safetensors",
203
  "transformer.h.28.mlp.c_fc_1.weight": "model-00001-of-00002.safetensors",
204
  "transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
205
- "transformer.h.29.attn.attention.k_proj.weight": "model-00001-of-00002.safetensors",
206
- "transformer.h.29.attn.attention.out_proj.weight": "model-00001-of-00002.safetensors",
207
- "transformer.h.29.attn.attention.q_proj.weight": "model-00001-of-00002.safetensors",
208
- "transformer.h.29.attn.attention.v_proj.weight": "model-00001-of-00002.safetensors",
 
 
 
 
 
 
 
 
209
  "transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
210
  "transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
211
  "transformer.h.29.mlp.c_fc_0.tucker_core": "model-00001-of-00002.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 5193544192
4
  },
5
  "weight_map": {
6
  "lm_head.weight": "model-00002-of-00002.safetensors",
 
202
  "transformer.h.28.mlp.c_fc_0.weight": "model-00001-of-00002.safetensors",
203
  "transformer.h.28.mlp.c_fc_1.weight": "model-00001-of-00002.safetensors",
204
  "transformer.h.28.mlp.c_proj.weight": "model-00001-of-00002.safetensors",
205
+ "transformer.h.29.attn.attention.k_proj.tucker_core": "model-00001-of-00002.safetensors",
206
+ "transformer.h.29.attn.attention.k_proj.tucker_factors.0": "model-00001-of-00002.safetensors",
207
+ "transformer.h.29.attn.attention.k_proj.tucker_factors.1": "model-00001-of-00002.safetensors",
208
+ "transformer.h.29.attn.attention.out_proj.tucker_core": "model-00001-of-00002.safetensors",
209
+ "transformer.h.29.attn.attention.out_proj.tucker_factors.0": "model-00001-of-00002.safetensors",
210
+ "transformer.h.29.attn.attention.out_proj.tucker_factors.1": "model-00001-of-00002.safetensors",
211
+ "transformer.h.29.attn.attention.q_proj.tucker_core": "model-00001-of-00002.safetensors",
212
+ "transformer.h.29.attn.attention.q_proj.tucker_factors.0": "model-00001-of-00002.safetensors",
213
+ "transformer.h.29.attn.attention.q_proj.tucker_factors.1": "model-00001-of-00002.safetensors",
214
+ "transformer.h.29.attn.attention.v_proj.tucker_core": "model-00001-of-00002.safetensors",
215
+ "transformer.h.29.attn.attention.v_proj.tucker_factors.0": "model-00001-of-00002.safetensors",
216
+ "transformer.h.29.attn.attention.v_proj.tucker_factors.1": "model-00001-of-00002.safetensors",
217
  "transformer.h.29.ln_1.weight": "model-00001-of-00002.safetensors",
218
  "transformer.h.29.ln_2.weight": "model-00001-of-00002.safetensors",
219
  "transformer.h.29.mlp.c_fc_0.tucker_core": "model-00001-of-00002.safetensors",
runs/Mar15_01-18-11_demo2/events.out.tfevents.1742042106.demo2.212607.2 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31ecb633773d843b8760393649d5731486bb64a0998cdfe978cc4247e9f5b9c4
3
+ size 88588
runs/Mar15_01-18-11_demo2/events.out.tfevents.1742057348.demo2.212607.3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f33d18e169ebf196c3b52402913694d14174b4910c32ff5d56541b722c948ad2
3
+ size 481
train_layer_index_29_mat_attn.attention.v_proj_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 9.323659620390011e+17,
3
+ "train_loss": 1.669835741941364,
4
+ "train_runtime": 14265.6788,
5
+ "train_samples": 294393,
6
+ "train_samples_per_second": 20.636,
7
+ "train_steps_per_second": 10.318
8
+ }
trainer_state.json CHANGED
The diff for this file is too large to render. See raw diff