rbelanec committed
Commit 942b608 (verified)
1 Parent(s): 24ae4b4

Model save

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ library_name: peft
+ license: gemma
+ base_model: google/gemma-3-1b-it
+ tags:
+ - llama-factory
+ - lora
+ - generated_from_trainer
+ model-index:
+ - name: train_2025-04-09-14-52-53
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # train_2025-04-09-14-52-53
+
+ This model is a fine-tuned version of [google/gemma-3-1b-it](https://huggingface.co/google/gemma-3-1b-it) on an unknown dataset.
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 5e-05
+ - train_batch_size: 32
+ - eval_batch_size: 8
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 256
+ - optimizer: adamw_torch (betas=(0.9, 0.999), epsilon=1e-08, no additional optimizer arguments)
+ - lr_scheduler_type: cosine
+ - num_epochs: 3.0
+
+ ### Training results
+
+
+
+ ### Framework versions
+
+ - PEFT 0.15.0
+ - Transformers 4.50.0
+ - Pytorch 2.6.0+cu124
+ - Datasets 3.4.1
+ - Tokenizers 0.21.0
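
Since the card's usage sections are still empty, here is a minimal loading sketch based only on the metadata above (a PEFT LoRA adapter for google/gemma-3-1b-it). The adapter path is a placeholder, not a confirmed repository name.

```python
# Minimal sketch: load this LoRA adapter on the stated base model with the
# framework versions listed above (PEFT 0.15.0, Transformers 4.50.0).
# "path/to/this-repo" is a placeholder for wherever this repository lives
# locally or on the Hub.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")
model = PeftModel.from_pretrained(base, "path/to/this-repo")  # placeholder path
tokenizer = AutoTokenizer.from_pretrained("path/to/this-repo")

inputs = tokenizer("Hello", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```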
adapter_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "google/gemma-3-1b-it",
+   "bias": "none",
+   "corda_config": null,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 16,
+   "lora_bias": false,
+   "lora_dropout": 0.0,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 8,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "up_proj",
+     "o_proj",
+     "k_proj",
+     "down_proj",
+     "gate_proj",
+     "v_proj",
+     "q_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
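
For reference, a sketch of the peft.LoraConfig that corresponds to the JSON above; this is a reconstruction (only the fields that deviate from PEFT defaults are spelled out), not a file from the commit.

```python
# Sketch: LoraConfig equivalent to adapter_config.json above (reconstruction).
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,            # LoRA rank
    lora_alpha=16,  # effective scaling = lora_alpha / r = 2
    lora_dropout=0.0,
    bias="none",
    target_modules=[  # every attention and MLP projection in the decoder
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    task_type="CAUSAL_LM",
)
```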
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ba242fb6fb06c848ee6a6a39d94cb3de39fe4b910af75f17648b9f4b923a30a
+ size 26139264
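
The three lines above are a Git LFS pointer: `oid` is the SHA-256 of the actual file and `size` its byte count. A small standard-library sketch for checking a downloaded copy against the pointer:

```python
# Sketch: verify a downloaded LFS object against its pointer (oid = SHA-256).
import hashlib
from pathlib import Path

def matches_pointer(path: str, oid: str, size: int) -> bool:
    data = Path(path).read_bytes()
    return len(data) == size and hashlib.sha256(data).hexdigest() == oid

print(matches_pointer(
    "adapter_model.safetensors",
    "7ba242fb6fb06c848ee6a6a39d94cb3de39fe4b910af75f17648b9f4b923a30a",
    26139264,
))
```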
added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "<image_soft_token>": 262144
+ }
all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "epoch": 2.992,
+   "eval_glue_mnli_eval_loss": 0.15313956141471863,
+   "eval_glue_mnli_eval_runtime": 9.6591,
+   "eval_glue_mnli_eval_samples_per_second": 103.529,
+   "eval_glue_mnli_eval_steps_per_second": 12.941,
+   "num_input_tokens_seen": 194480,
+   "total_flos": 821972377374720.0,
+   "train_loss": 0.07222598022030245,
+   "train_runtime": 189.9362,
+   "train_samples_per_second": 15.795,
+   "train_steps_per_second": 0.979
+ }
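
The runtime and throughput fields above are internally consistent; a quick sketch that derives the implied sample counts from them (a sanity check, not part of the repository):

```python
# Sketch: derive implied sample counts from all_results.json.
import json

with open("all_results.json") as f:
    m = json.load(f)

# runtime [s] * samples_per_second ~ number of samples processed
print(round(m["eval_glue_mnli_eval_runtime"] * m["eval_glue_mnli_eval_samples_per_second"]))  # ~1000 eval samples
print(round(m["train_runtime"] * m["train_samples_per_second"]))  # ~3000 samples over ~3 epochs
```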
eval_results.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "epoch": 2.992,
+   "eval_glue_mnli_eval_loss": 0.15313956141471863,
+   "eval_glue_mnli_eval_runtime": 9.6591,
+   "eval_glue_mnli_eval_samples_per_second": 103.529,
+   "eval_glue_mnli_eval_steps_per_second": 12.941,
+   "num_input_tokens_seen": 194480
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<end_of_turn>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "boi_token": "<start_of_image>",
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eoi_token": "<end_of_image>",
+   "eos_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "image_token": "<image_soft_token>",
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
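
A sketch for confirming this mapping once the tokenizer is loaded; the repository path is again a placeholder:

```python
# Sketch: inspect the special-token mapping defined above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this-repo")  # placeholder path
print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)  # <bos> <eos> <pad> <unk>
print(tok.additional_special_tokens)  # ['<end_of_turn>']
```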
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 2.992,
+   "num_input_tokens_seen": 194480,
+   "total_flos": 821972377374720.0,
+   "train_loss": 0.07222598022030245,
+   "train_runtime": 189.9362,
+   "train_samples_per_second": 15.795,
+   "train_steps_per_second": 0.979
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,56 @@
+ {"current_steps": 5, "total_steps": 18750, "loss": 15.5942, "lr": 4.999999122701883e-05, "epoch": 0.0008, "percentage": 0.03, "elapsed_time": "0:00:11", "remaining_time": "11:57:17", "throughput": 443.22, "total_tokens": 5088}
+ {"current_steps": 10, "total_steps": 18750, "loss": 6.4983, "lr": 4.9999964908081455e-05, "epoch": 0.0016, "percentage": 0.05, "elapsed_time": "0:00:22", "remaining_time": "11:36:53", "throughput": 443.87, "total_tokens": 9904}
+ {"current_steps": 15, "total_steps": 18750, "loss": 2.6258, "lr": 4.999992104320636e-05, "epoch": 0.0024, "percentage": 0.08, "elapsed_time": "0:00:32", "remaining_time": "11:26:34", "throughput": 456.98, "total_tokens": 15072}
+ {"current_steps": 20, "total_steps": 18750, "loss": 0.8446, "lr": 4.999985963242432e-05, "epoch": 0.0032, "percentage": 0.11, "elapsed_time": "0:00:43", "remaining_time": "11:19:09", "throughput": 474.34, "total_tokens": 20640}
+ {"current_steps": 25, "total_steps": 18750, "loss": 0.4124, "lr": 4.999978067577844e-05, "epoch": 0.004, "percentage": 0.13, "elapsed_time": "0:00:54", "remaining_time": "11:16:53", "throughput": 472.41, "total_tokens": 25616}
+ {"current_steps": 30, "total_steps": 18750, "loss": 0.4033, "lr": 4.999968417332415e-05, "epoch": 0.0048, "percentage": 0.16, "elapsed_time": "0:01:04", "remaining_time": "11:12:39", "throughput": 474.71, "total_tokens": 30704}
+ {"current_steps": 35, "total_steps": 18750, "loss": 0.3182, "lr": 4.999957012512916e-05, "epoch": 0.0056, "percentage": 0.19, "elapsed_time": "0:01:15", "remaining_time": "11:09:48", "throughput": 484.94, "total_tokens": 36448}
+ {"current_steps": 40, "total_steps": 18750, "loss": 0.2778, "lr": 4.999943853127351e-05, "epoch": 0.0064, "percentage": 0.21, "elapsed_time": "0:01:25", "remaining_time": "11:06:30", "throughput": 488.26, "total_tokens": 41744}
+ {"current_steps": 45, "total_steps": 18750, "loss": 0.2592, "lr": 4.999928939184958e-05, "epoch": 0.0072, "percentage": 0.24, "elapsed_time": "0:01:36", "remaining_time": "11:06:07", "throughput": 487.39, "total_tokens": 46864}
+ {"current_steps": 50, "total_steps": 18750, "loss": 0.2738, "lr": 4.999912270696202e-05, "epoch": 0.008, "percentage": 0.27, "elapsed_time": "0:01:46", "remaining_time": "11:04:40", "throughput": 486.01, "total_tokens": 51824}
+ {"current_steps": 55, "total_steps": 18750, "loss": 0.2352, "lr": 4.9998938476727826e-05, "epoch": 0.0088, "percentage": 0.29, "elapsed_time": "0:01:56", "remaining_time": "11:02:41", "throughput": 486.12, "total_tokens": 56864}
+ {"current_steps": 60, "total_steps": 18750, "loss": 0.2474, "lr": 4.9998736701276295e-05, "epoch": 0.0096, "percentage": 0.32, "elapsed_time": "0:02:07", "remaining_time": "11:01:43", "throughput": 489.57, "total_tokens": 62400}
+ {"current_steps": 65, "total_steps": 18750, "loss": 0.2061, "lr": 4.999851738074904e-05, "epoch": 0.0104, "percentage": 0.35, "elapsed_time": "0:02:18", "remaining_time": "11:01:24", "throughput": 489.44, "total_tokens": 67568}
+ {"current_steps": 70, "total_steps": 18750, "loss": 0.1937, "lr": 4.99982805153e-05, "epoch": 0.0112, "percentage": 0.37, "elapsed_time": "0:02:28", "remaining_time": "11:01:35", "throughput": 488.33, "total_tokens": 72640}
+ {"current_steps": 75, "total_steps": 18750, "loss": 0.2521, "lr": 4.9998026105095405e-05, "epoch": 0.012, "percentage": 0.4, "elapsed_time": "0:02:39", "remaining_time": "11:01:27", "throughput": 487.06, "total_tokens": 77632}
+ {"current_steps": 80, "total_steps": 18750, "loss": 0.2537, "lr": 4.9997754150313815e-05, "epoch": 0.0128, "percentage": 0.43, "elapsed_time": "0:02:49", "remaining_time": "11:01:02", "throughput": 488.8, "total_tokens": 83072}
+ {"current_steps": 85, "total_steps": 18750, "loss": 0.1923, "lr": 4.999746465114609e-05, "epoch": 0.0136, "percentage": 0.45, "elapsed_time": "0:03:00", "remaining_time": "11:00:32", "throughput": 488.55, "total_tokens": 88176}
+ {"current_steps": 90, "total_steps": 18750, "loss": 0.2011, "lr": 4.999715760779541e-05, "epoch": 0.0144, "percentage": 0.48, "elapsed_time": "0:03:10", "remaining_time": "10:59:51", "throughput": 487.4, "total_tokens": 93072}
+ {"current_steps": 95, "total_steps": 18750, "loss": 0.2032, "lr": 4.9996833020477285e-05, "epoch": 0.0152, "percentage": 0.51, "elapsed_time": "0:03:21", "remaining_time": "10:58:59", "throughput": 489.17, "total_tokens": 98496}
+ {"current_steps": 100, "total_steps": 18750, "loss": 0.1789, "lr": 4.9996490889419514e-05, "epoch": 0.016, "percentage": 0.53, "elapsed_time": "0:03:31", "remaining_time": "10:58:09", "throughput": 489.58, "total_tokens": 103664}
+ {"current_steps": 105, "total_steps": 18750, "loss": 0.1652, "lr": 4.999613121486222e-05, "epoch": 0.0168, "percentage": 0.56, "elapsed_time": "0:03:43", "remaining_time": "11:01:38", "throughput": 486.45, "total_tokens": 108752}
+ {"current_steps": 110, "total_steps": 18750, "loss": 0.1764, "lr": 4.999575399705783e-05, "epoch": 0.0176, "percentage": 0.59, "elapsed_time": "0:03:54", "remaining_time": "11:01:05", "throughput": 487.84, "total_tokens": 114192}
+ {"current_steps": 115, "total_steps": 18750, "loss": 0.2337, "lr": 4.999535923627109e-05, "epoch": 0.0184, "percentage": 0.61, "elapsed_time": "0:04:04", "remaining_time": "11:01:05", "throughput": 486.89, "total_tokens": 119184}
+ {"current_steps": 120, "total_steps": 18750, "loss": 0.1412, "lr": 4.999494693277907e-05, "epoch": 0.0192, "percentage": 0.64, "elapsed_time": "0:04:15", "remaining_time": "11:00:54", "throughput": 485.65, "total_tokens": 124048}
+ {"current_steps": 105, "total_steps": 73629, "loss": 0.138, "lr": 4.999974910625973e-05, "epoch": 0.0042780530784156945, "percentage": 0.14, "elapsed_time": "0:00:11", "remaining_time": "2:11:15", "throughput": 9729.49, "total_tokens": 109424}
+ {"current_steps": 110, "total_steps": 73629, "loss": 0.1529, "lr": 4.999972464274266e-05, "epoch": 0.004481769891673585, "percentage": 0.15, "elapsed_time": "0:00:21", "remaining_time": "4:03:47", "throughput": 5230.71, "total_tokens": 114480}
+ {"current_steps": 115, "total_steps": 73629, "loss": 0.1215, "lr": 4.999969904139614e-05, "epoch": 0.004685486704931475, "percentage": 0.16, "elapsed_time": "0:00:32", "remaining_time": "5:45:40", "throughput": 3673.93, "total_tokens": 119200}
+ {"current_steps": 120, "total_steps": 73629, "loss": 0.1755, "lr": 4.999967230222132e-05, "epoch": 0.004889203518189365, "percentage": 0.16, "elapsed_time": "0:00:43", "remaining_time": "7:19:28", "throughput": 2892.56, "total_tokens": 124512}
+ {"current_steps": 125, "total_steps": 73629, "loss": 0.2017, "lr": 4.9999644425219425e-05, "epoch": 0.005092920331447255, "percentage": 0.17, "elapsed_time": "0:00:53", "remaining_time": "8:45:44", "throughput": 2406.98, "total_tokens": 129120}
+ {"current_steps": 130, "total_steps": 73629, "loss": 0.2325, "lr": 4.999961541039172e-05, "epoch": 0.005296637144705145, "percentage": 0.18, "elapsed_time": "0:01:04", "remaining_time": "10:06:30", "throughput": 2081.85, "total_tokens": 134000}
+ {"current_steps": 135, "total_steps": 73629, "loss": 0.2013, "lr": 4.999958525773953e-05, "epoch": 0.005500353957963036, "percentage": 0.18, "elapsed_time": "0:01:14", "remaining_time": "11:19:27", "throughput": 1853.49, "total_tokens": 138800}
+ {"current_steps": 140, "total_steps": 73629, "loss": 0.2315, "lr": 4.9999553967264225e-05, "epoch": 0.005704070771220926, "percentage": 0.19, "elapsed_time": "0:01:25", "remaining_time": "12:27:27", "throughput": 1681.73, "total_tokens": 143680}
+ {"current_steps": 145, "total_steps": 73629, "loss": 0.2167, "lr": 4.999952153896723e-05, "epoch": 0.005907787584478816, "percentage": 0.2, "elapsed_time": "0:01:35", "remaining_time": "13:30:48", "throughput": 1548.75, "total_tokens": 148672}
+ {"current_steps": 150, "total_steps": 73629, "loss": 0.1608, "lr": 4.999948797285002e-05, "epoch": 0.006111504397736706, "percentage": 0.2, "elapsed_time": "0:01:46", "remaining_time": "14:28:35", "throughput": 1447.54, "total_tokens": 154000}
+ {"current_steps": 155, "total_steps": 73629, "loss": 0.2012, "lr": 4.999945326891413e-05, "epoch": 0.006315221210994596, "percentage": 0.21, "elapsed_time": "0:01:57", "remaining_time": "15:24:54", "throughput": 1356.45, "total_tokens": 158800}
+ {"current_steps": 160, "total_steps": 73629, "loss": 0.1255, "lr": 4.999941742716113e-05, "epoch": 0.0065189380242524865, "percentage": 0.22, "elapsed_time": "0:02:07", "remaining_time": "16:14:22", "throughput": 1289.37, "total_tokens": 164160}
+ {"current_steps": 165, "total_steps": 73629, "loss": 0.1888, "lr": 4.9999380447592646e-05, "epoch": 0.006722654837510377, "percentage": 0.22, "elapsed_time": "0:02:17", "remaining_time": "16:59:36", "throughput": 1231.88, "total_tokens": 169264}
+ {"current_steps": 105, "total_steps": 186, "loss": 0.1236, "lr": 1.9967536997783494e-05, "epoch": 1.688, "percentage": 56.45, "elapsed_time": "0:00:11", "remaining_time": "0:00:08", "throughput": 9657.79, "total_tokens": 108992}
+ {"current_steps": 110, "total_steps": 186, "loss": 0.2096, "lr": 1.79198623329424e-05, "epoch": 1.768, "percentage": 59.14, "elapsed_time": "0:00:22", "remaining_time": "0:00:15", "throughput": 5089.64, "total_tokens": 113888}
+ {"current_steps": 115, "total_steps": 186, "loss": 0.1727, "lr": 1.5922653499838137e-05, "epoch": 1.8479999999999999, "percentage": 61.83, "elapsed_time": "0:00:33", "remaining_time": "0:00:20", "throughput": 3593.31, "total_tokens": 119520}
+ {"current_steps": 120, "total_steps": 186, "loss": 0.1751, "lr": 1.399014621105914e-05, "epoch": 1.928, "percentage": 64.52, "elapsed_time": "0:00:44", "remaining_time": "0:00:24", "throughput": 2815.55, "total_tokens": 124624}
+ {"current_steps": 125, "total_steps": 186, "loss": 0.1907, "lr": 1.2136114999284288e-05, "epoch": 2.016, "percentage": 67.2, "elapsed_time": "0:00:55", "remaining_time": "0:00:27", "throughput": 2324.66, "total_tokens": 130096}
+ {"current_steps": 130, "total_steps": 186, "loss": 0.1838, "lr": 1.0373775035117305e-05, "epoch": 2.096, "percentage": 69.89, "elapsed_time": "0:01:06", "remaining_time": "0:00:28", "throughput": 2040.13, "total_tokens": 135040}
+ {"current_steps": 135, "total_steps": 186, "loss": 0.1376, "lr": 8.715687931944449e-06, "epoch": 2.176, "percentage": 72.58, "elapsed_time": "0:01:16", "remaining_time": "0:00:28", "throughput": 1844.31, "total_tokens": 140480}
+ {"current_steps": 140, "total_steps": 186, "loss": 0.139, "lr": 7.173672209219495e-06, "epoch": 2.2560000000000002, "percentage": 75.27, "elapsed_time": "0:01:26", "remaining_time": "0:00:28", "throughput": 1678.74, "total_tokens": 145744}
+ {"current_steps": 145, "total_steps": 186, "loss": 0.1949, "lr": 5.758719052376693e-06, "epoch": 2.336, "percentage": 77.96, "elapsed_time": "0:01:38", "remaining_time": "0:00:27", "throughput": 1538.88, "total_tokens": 150864}
+ {"current_steps": 150, "total_steps": 186, "loss": 0.1297, "lr": 4.480913969818098e-06, "epoch": 2.416, "percentage": 80.65, "elapsed_time": "0:01:48", "remaining_time": "0:00:26", "throughput": 1432.55, "total_tokens": 156064}
+ {"current_steps": 155, "total_steps": 186, "loss": 0.1518, "lr": 3.3493649053890326e-06, "epoch": 2.496, "percentage": 83.33, "elapsed_time": "0:01:59", "remaining_time": "0:00:23", "throughput": 1347.92, "total_tokens": 161584}
+ {"current_steps": 160, "total_steps": 186, "loss": 0.1426, "lr": 2.372137318741968e-06, "epoch": 2.576, "percentage": 86.02, "elapsed_time": "0:02:10", "remaining_time": "0:00:21", "throughput": 1273.36, "total_tokens": 166624}
+ {"current_steps": 165, "total_steps": 186, "loss": 0.1039, "lr": 1.5561966963229924e-06, "epoch": 2.656, "percentage": 88.71, "elapsed_time": "0:02:21", "remaining_time": "0:00:18", "throughput": 1213.15, "total_tokens": 172144}
+ {"current_steps": 170, "total_steps": 186, "loss": 0.1457, "lr": 9.073589027514789e-07, "epoch": 2.7359999999999998, "percentage": 91.4, "elapsed_time": "0:02:32", "remaining_time": "0:00:14", "throughput": 1162.41, "total_tokens": 177536}
+ {"current_steps": 175, "total_steps": 186, "loss": 0.1217, "lr": 4.302487264785521e-07, "epoch": 2.816, "percentage": 94.09, "elapsed_time": "0:02:43", "remaining_time": "0:00:10", "throughput": 1112.2, "total_tokens": 182352}
+ {"current_steps": 180, "total_steps": 186, "loss": 0.1646, "lr": 1.2826691520262114e-07, "epoch": 2.896, "percentage": 96.77, "elapsed_time": "0:02:54", "remaining_time": "0:00:05", "throughput": 1074.45, "total_tokens": 187520}
+ {"current_steps": 185, "total_steps": 186, "loss": 0.1734, "lr": 3.565936007254855e-09, "epoch": 2.976, "percentage": 99.46, "elapsed_time": "0:03:05", "remaining_time": "0:00:01", "throughput": 1044.75, "total_tokens": 193280}
+ {"current_steps": 186, "total_steps": 186, "epoch": 2.992, "percentage": 100.0, "elapsed_time": "0:03:08", "remaining_time": "0:00:00", "throughput": 1032.12, "total_tokens": 194480}
+ {"current_steps": 186, "total_steps": 186, "epoch": 2.992, "percentage": 100.0, "elapsed_time": "0:00:00", "remaining_time": "0:00:00", "throughput": 37395509.19, "total_tokens": 194480}
trainer_state.json ADDED
@@ -0,0 +1,340 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.992,
+   "eval_steps": 500,
+   "global_step": 186,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0008,
+       "grad_norm": 56.061981201171875,
+       "learning_rate": 4.999999122701883e-05,
+       "loss": 15.5942,
+       "num_input_tokens_seen": 5088,
+       "step": 5
+     },
+     {
+       "epoch": 0.0016,
+       "grad_norm": 28.854398727416992,
+       "learning_rate": 4.9999964908081455e-05,
+       "loss": 6.4983,
+       "num_input_tokens_seen": 9904,
+       "step": 10
+     },
+     {
+       "epoch": 0.0024,
+       "grad_norm": 20.881488800048828,
+       "learning_rate": 4.999992104320636e-05,
+       "loss": 2.6258,
+       "num_input_tokens_seen": 15072,
+       "step": 15
+     },
+     {
+       "epoch": 0.0032,
+       "grad_norm": 16.009796142578125,
+       "learning_rate": 4.999985963242432e-05,
+       "loss": 0.8446,
+       "num_input_tokens_seen": 20640,
+       "step": 20
+     },
+     {
+       "epoch": 0.004,
+       "grad_norm": 12.420453071594238,
+       "learning_rate": 4.999978067577844e-05,
+       "loss": 0.4124,
+       "num_input_tokens_seen": 25616,
+       "step": 25
+     },
+     {
+       "epoch": 0.0048,
+       "grad_norm": 20.00946807861328,
+       "learning_rate": 4.999968417332415e-05,
+       "loss": 0.4033,
+       "num_input_tokens_seen": 30704,
+       "step": 30
+     },
+     {
+       "epoch": 0.0056,
+       "grad_norm": 6.828528881072998,
+       "learning_rate": 4.999957012512916e-05,
+       "loss": 0.3182,
+       "num_input_tokens_seen": 36448,
+       "step": 35
+     },
+     {
+       "epoch": 0.0064,
+       "grad_norm": 7.609297752380371,
+       "learning_rate": 4.999943853127351e-05,
+       "loss": 0.2778,
+       "num_input_tokens_seen": 41744,
+       "step": 40
+     },
+     {
+       "epoch": 0.0072,
+       "grad_norm": 10.703560829162598,
+       "learning_rate": 4.999928939184958e-05,
+       "loss": 0.2592,
+       "num_input_tokens_seen": 46864,
+       "step": 45
+     },
+     {
+       "epoch": 0.008,
+       "grad_norm": 9.821195602416992,
+       "learning_rate": 4.999912270696202e-05,
+       "loss": 0.2738,
+       "num_input_tokens_seen": 51824,
+       "step": 50
+     },
+     {
+       "epoch": 0.0088,
+       "grad_norm": 8.621257781982422,
+       "learning_rate": 4.9998938476727826e-05,
+       "loss": 0.2352,
+       "num_input_tokens_seen": 56864,
+       "step": 55
+     },
+     {
+       "epoch": 0.0096,
+       "grad_norm": 7.617590427398682,
+       "learning_rate": 4.9998736701276295e-05,
+       "loss": 0.2474,
+       "num_input_tokens_seen": 62400,
+       "step": 60
+     },
+     {
+       "epoch": 0.0104,
+       "grad_norm": 6.092931747436523,
+       "learning_rate": 4.999851738074904e-05,
+       "loss": 0.2061,
+       "num_input_tokens_seen": 67568,
+       "step": 65
+     },
+     {
+       "epoch": 0.0112,
+       "grad_norm": 7.17249059677124,
+       "learning_rate": 4.99982805153e-05,
+       "loss": 0.1937,
+       "num_input_tokens_seen": 72640,
+       "step": 70
+     },
+     {
+       "epoch": 0.012,
+       "grad_norm": 9.022090911865234,
+       "learning_rate": 4.9998026105095405e-05,
+       "loss": 0.2521,
+       "num_input_tokens_seen": 77632,
+       "step": 75
+     },
+     {
+       "epoch": 0.0128,
+       "grad_norm": 10.729183197021484,
+       "learning_rate": 4.9997754150313815e-05,
+       "loss": 0.2537,
+       "num_input_tokens_seen": 83072,
+       "step": 80
+     },
+     {
+       "epoch": 0.0136,
+       "grad_norm": 2.3064322471618652,
+       "learning_rate": 4.999746465114609e-05,
+       "loss": 0.1923,
+       "num_input_tokens_seen": 88176,
+       "step": 85
+     },
+     {
+       "epoch": 0.0144,
+       "grad_norm": 4.6601881980896,
+       "learning_rate": 4.999715760779541e-05,
+       "loss": 0.2011,
+       "num_input_tokens_seen": 93072,
+       "step": 90
+     },
+     {
+       "epoch": 0.0152,
+       "grad_norm": 3.4692001342773438,
+       "learning_rate": 4.9996833020477285e-05,
+       "loss": 0.2032,
+       "num_input_tokens_seen": 98496,
+       "step": 95
+     },
+     {
+       "epoch": 0.016,
+       "grad_norm": 3.452565908432007,
+       "learning_rate": 4.9996490889419514e-05,
+       "loss": 0.1789,
+       "num_input_tokens_seen": 103664,
+       "step": 100
+     },
+     {
+       "epoch": 1.688,
+       "grad_norm": 4.022520065307617,
+       "learning_rate": 1.9967536997783494e-05,
+       "loss": 0.1236,
+       "num_input_tokens_seen": 108992,
+       "step": 105
+     },
+     {
+       "epoch": 1.768,
+       "grad_norm": 8.722804069519043,
+       "learning_rate": 1.79198623329424e-05,
+       "loss": 0.2096,
+       "num_input_tokens_seen": 113888,
+       "step": 110
+     },
+     {
+       "epoch": 1.8479999999999999,
+       "grad_norm": 5.1005401611328125,
+       "learning_rate": 1.5922653499838137e-05,
+       "loss": 0.1727,
+       "num_input_tokens_seen": 119520,
+       "step": 115
+     },
+     {
+       "epoch": 1.928,
+       "grad_norm": 3.2674338817596436,
+       "learning_rate": 1.399014621105914e-05,
+       "loss": 0.1751,
+       "num_input_tokens_seen": 124624,
+       "step": 120
+     },
+     {
+       "epoch": 2.016,
+       "grad_norm": 5.578003406524658,
+       "learning_rate": 1.2136114999284288e-05,
+       "loss": 0.1907,
+       "num_input_tokens_seen": 130096,
+       "step": 125
+     },
+     {
+       "epoch": 2.096,
+       "grad_norm": 4.567383766174316,
+       "learning_rate": 1.0373775035117305e-05,
+       "loss": 0.1838,
+       "num_input_tokens_seen": 135040,
+       "step": 130
+     },
+     {
+       "epoch": 2.176,
+       "grad_norm": 4.175036907196045,
+       "learning_rate": 8.715687931944449e-06,
+       "loss": 0.1376,
+       "num_input_tokens_seen": 140480,
+       "step": 135
+     },
+     {
+       "epoch": 2.2560000000000002,
+       "grad_norm": 1.873349905014038,
+       "learning_rate": 7.173672209219495e-06,
+       "loss": 0.139,
+       "num_input_tokens_seen": 145744,
+       "step": 140
+     },
+     {
+       "epoch": 2.336,
+       "grad_norm": 5.774956226348877,
+       "learning_rate": 5.758719052376693e-06,
+       "loss": 0.1949,
+       "num_input_tokens_seen": 150864,
+       "step": 145
+     },
+     {
+       "epoch": 2.416,
+       "grad_norm": 3.4830589294433594,
+       "learning_rate": 4.480913969818098e-06,
+       "loss": 0.1297,
+       "num_input_tokens_seen": 156064,
+       "step": 150
+     },
+     {
+       "epoch": 2.496,
+       "grad_norm": 6.819346904754639,
+       "learning_rate": 3.3493649053890326e-06,
+       "loss": 0.1518,
+       "num_input_tokens_seen": 161584,
+       "step": 155
+     },
+     {
+       "epoch": 2.576,
+       "grad_norm": 3.335056781768799,
+       "learning_rate": 2.372137318741968e-06,
+       "loss": 0.1426,
+       "num_input_tokens_seen": 166624,
+       "step": 160
+     },
+     {
+       "epoch": 2.656,
+       "grad_norm": 4.781857490539551,
+       "learning_rate": 1.5561966963229924e-06,
+       "loss": 0.1039,
+       "num_input_tokens_seen": 172144,
+       "step": 165
+     },
+     {
+       "epoch": 2.7359999999999998,
+       "grad_norm": 4.5153985023498535,
+       "learning_rate": 9.073589027514789e-07,
+       "loss": 0.1457,
+       "num_input_tokens_seen": 177536,
+       "step": 170
+     },
+     {
+       "epoch": 2.816,
+       "grad_norm": 3.883399486541748,
+       "learning_rate": 4.302487264785521e-07,
+       "loss": 0.1217,
+       "num_input_tokens_seen": 182352,
+       "step": 175
+     },
+     {
+       "epoch": 2.896,
+       "grad_norm": 3.4770634174346924,
+       "learning_rate": 1.2826691520262114e-07,
+       "loss": 0.1646,
+       "num_input_tokens_seen": 187520,
+       "step": 180
+     },
+     {
+       "epoch": 2.976,
+       "grad_norm": 3.4242215156555176,
+       "learning_rate": 3.565936007254855e-09,
+       "loss": 0.1734,
+       "num_input_tokens_seen": 193280,
+       "step": 185
+     },
+     {
+       "epoch": 2.992,
+       "num_input_tokens_seen": 194480,
+       "step": 186,
+       "total_flos": 821972377374720.0,
+       "train_loss": 0.07222598022030245,
+       "train_runtime": 189.9362,
+       "train_samples_per_second": 15.795,
+       "train_steps_per_second": 0.979
+     }
+   ],
+   "logging_steps": 5,
+   "max_steps": 186,
+   "num_input_tokens_seen": 194480,
+   "num_train_epochs": 3,
+   "save_steps": 100,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": true
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 821972377374720.0,
+   "train_batch_size": 2,
+   "trial_name": null,
+   "trial_params": null
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:47fbac85789d46f3ce18a8131c81a7b87491662951030d2a2747373639d0716d
+ size 5880
training_loss.png ADDED