HiteshKamwal committed on
Commit
dcd3f6e
·
verified ·
1 Parent(s): b30915d

Upload 16 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,58 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: prithivMLmods/Qwen2-VL-OCR-2B-Instruct
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: train_2025-04-01-09-06-36
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # train_2025-04-01-09-06-36
18
+
19
+ This model is a LoRA adapter (PEFT) fine-tuned from [prithivMLmods/Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct) on the OCR_Finetuning_Dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 1
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - gradient_accumulation_steps: 8
43
+ - total_train_batch_size: 8
44
+ - optimizer: AdamW (OptimizerNames.ADAMW_TORCH) with betas=(0.9, 0.999) and epsilon=1e-08; no additional optimizer arguments
45
+ - lr_scheduler_type: cosine
46
+ - num_epochs: 3.0
47
+
48
+ ### Training results
49
+
50
+
51
+
52
+ ### Framework versions
53
+
54
+ - PEFT 0.15.0
55
+ - Transformers 4.50.0
56
+ - Pytorch 2.6.0+cu124
57
+ - Datasets 3.4.1
58
+ - Tokenizers 0.21.0
added_tokens.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<|box_end|>": 151649,
3
+ "<|box_start|>": 151648,
4
+ "<|endoftext|>": 151643,
5
+ "<|im_end|>": 151645,
6
+ "<|im_start|>": 151644,
7
+ "<|image_pad|>": 151655,
8
+ "<|object_ref_end|>": 151647,
9
+ "<|object_ref_start|>": 151646,
10
+ "<|quad_end|>": 151651,
11
+ "<|quad_start|>": 151650,
12
+ "<|video_pad|>": 151656,
13
+ "<|vision_end|>": 151653,
14
+ "<|vision_pad|>": 151654,
15
+ "<|vision_start|>": 151652
16
+ }
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9932279909706545,
3
+ "num_input_tokens_seen": 1157808,
4
+ "total_flos": 1.3788411572404224e+16,
5
+ "train_loss": 0.939127180590687,
6
+ "train_runtime": 10484.6402,
7
+ "train_samples_per_second": 0.761,
8
+ "train_steps_per_second": 0.095
9
+ }
llamaboard_config.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ top.booster: auto
2
+ top.checkpoint_path: []
3
+ top.finetuning_type: lora
4
+ top.model_name: Custom
5
+ top.quantization_bit: none
6
+ top.quantization_method: bitsandbytes
7
+ top.rope_scaling: none
8
+ top.template: default
9
+ train.additional_target: ''
10
+ train.apollo_rank: 16
11
+ train.apollo_scale: 32
12
+ train.apollo_target: all
13
+ train.apollo_update_interval: 200
14
+ train.badam_mode: layer
15
+ train.badam_switch_interval: 50
16
+ train.badam_switch_mode: ascending
17
+ train.badam_update_ratio: 0.05
18
+ train.batch_size: 1
19
+ train.compute_type: bf16
20
+ train.create_new_adapter: false
21
+ train.cutoff_len: 2048
22
+ train.dataset:
23
+ - OCR_Finetuning_Dataset
24
+ train.dataset_dir: /content/drive/MyDrive
25
+ train.ds_offload: false
26
+ train.ds_stage: none
27
+ train.extra_args: '{"optim": "adamw_torch"}'
28
+ train.freeze_extra_modules: ''
29
+ train.freeze_trainable_layers: 2
30
+ train.freeze_trainable_modules: all
31
+ train.galore_rank: 16
32
+ train.galore_scale: 2
33
+ train.galore_target: all
34
+ train.galore_update_interval: 200
35
+ train.gradient_accumulation_steps: 8
36
+ train.learning_rate: 5e-5
37
+ train.logging_steps: 5
38
+ train.lora_alpha: 16
39
+ train.lora_dropout: 0
40
+ train.lora_rank: 8
41
+ train.lora_target: ''
42
+ train.loraplus_lr_ratio: 0
43
+ train.lr_scheduler_type: cosine
44
+ train.mask_history: false
45
+ train.max_grad_norm: '1.0'
46
+ train.max_samples: '100000'
47
+ train.neat_packing: false
48
+ train.neftune_alpha: 0
49
+ train.num_train_epochs: '3.0'
50
+ train.packing: false
51
+ train.ppo_score_norm: false
52
+ train.ppo_whiten_rewards: false
53
+ train.pref_beta: 0.1
54
+ train.pref_ftx: 0
55
+ train.pref_loss: sigmoid
56
+ train.report_to:
57
+ - none
58
+ train.resize_vocab: false
59
+ train.reward_model: []
60
+ train.save_steps: 100
61
+ train.swanlab_api_key: ''
62
+ train.swanlab_link: ''
63
+ train.swanlab_mode: cloud
64
+ train.swanlab_project: llamafactory
65
+ train.swanlab_run_name: ''
66
+ train.swanlab_workspace: ''
67
+ train.train_on_prompt: false
68
+ train.training_stage: Supervised Fine-Tuning
69
+ train.use_apollo: false
70
+ train.use_badam: false
71
+ train.use_dora: false
72
+ train.use_galore: false
73
+ train.use_llama_pro: false
74
+ train.use_pissa: false
75
+ train.use_rslora: false
76
+ train.use_swanlab: false
77
+ train.val_size: 0
78
+ train.warmup_steps: 0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
running_log.txt ADDED
@@ -0,0 +1,1492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [INFO|2025-04-01 09:07:44] tokenization_auto.py:759 >> Could not locate the tokenizer configuration file, will try to use the model config instead.
2
+
3
+ [INFO|2025-04-01 09:07:44] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
4
+
5
+ [INFO|2025-04-01 09:07:44] configuration_utils.py:771 >> Model config Qwen2VLConfig {
6
+ "architectures": [
7
+ "Qwen2VLForConditionalGeneration"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 151643,
11
+ "eos_token_id": 151645,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 1536,
14
+ "image_token_id": 151655,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 8960,
17
+ "max_position_embeddings": 32768,
18
+ "max_window_layers": 28,
19
+ "model_type": "qwen2_vl",
20
+ "num_attention_heads": 12,
21
+ "num_hidden_layers": 28,
22
+ "num_key_value_heads": 2,
23
+ "pad_token_id": 151654,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": {
26
+ "mrope_section": [
27
+ 16,
28
+ 24,
29
+ 24
30
+ ],
31
+ "rope_type": "default",
32
+ "type": "default"
33
+ },
34
+ "rope_theta": 1000000.0,
35
+ "sliding_window": 32768,
36
+ "tie_word_embeddings": true,
37
+ "torch_dtype": "bfloat16",
38
+ "transformers_version": "4.50.0",
39
+ "use_cache": true,
40
+ "use_sliding_window": false,
41
+ "video_token_id": 151656,
42
+ "vision_config": {
43
+ "depth": 32,
44
+ "embed_dim": 1280,
45
+ "hidden_act": "quick_gelu",
46
+ "hidden_size": 1536,
47
+ "in_channels": 3,
48
+ "in_chans": 3,
49
+ "mlp_ratio": 4,
50
+ "model_type": "qwen2_vl",
51
+ "num_heads": 16,
52
+ "patch_size": 14,
53
+ "spatial_merge_size": 2,
54
+ "spatial_patch_size": 14,
55
+ "temporal_patch_size": 2
56
+ },
57
+ "vision_end_token_id": 151653,
58
+ "vision_start_token_id": 151652,
59
+ "vision_token_id": 151654,
60
+ "vocab_size": 151936
61
+ }
62
+
63
+
64
+ [INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file vocab.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/vocab.json
65
+
66
+ [INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file merges.txt from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/merges.txt
67
+
68
+ [INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file tokenizer.json from cache at None
69
+
70
+ [INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/added_tokens.json
71
+
72
+ [INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file special_tokens_map.json from cache at None
73
+
74
+ [INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file tokenizer_config.json from cache at None
75
+
76
+ [INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file chat_template.jinja from cache at None
77
+
78
+ [INFO|2025-04-01 09:07:44] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
79
+
80
+ [INFO|2025-04-01 09:07:44] configuration_utils.py:771 >> Model config Qwen2VLConfig {
81
+ "architectures": [
82
+ "Qwen2VLForConditionalGeneration"
83
+ ],
84
+ "attention_dropout": 0.0,
85
+ "bos_token_id": 151643,
86
+ "eos_token_id": 151645,
87
+ "hidden_act": "silu",
88
+ "hidden_size": 1536,
89
+ "image_token_id": 151655,
90
+ "initializer_range": 0.02,
91
+ "intermediate_size": 8960,
92
+ "max_position_embeddings": 32768,
93
+ "max_window_layers": 28,
94
+ "model_type": "qwen2_vl",
95
+ "num_attention_heads": 12,
96
+ "num_hidden_layers": 28,
97
+ "num_key_value_heads": 2,
98
+ "pad_token_id": 151654,
99
+ "rms_norm_eps": 1e-06,
100
+ "rope_scaling": {
101
+ "mrope_section": [
102
+ 16,
103
+ 24,
104
+ 24
105
+ ],
106
+ "rope_type": "default",
107
+ "type": "default"
108
+ },
109
+ "rope_theta": 1000000.0,
110
+ "sliding_window": 32768,
111
+ "tie_word_embeddings": true,
112
+ "torch_dtype": "bfloat16",
113
+ "transformers_version": "4.50.0",
114
+ "use_cache": true,
115
+ "use_sliding_window": false,
116
+ "video_token_id": 151656,
117
+ "vision_config": {
118
+ "depth": 32,
119
+ "embed_dim": 1280,
120
+ "hidden_act": "quick_gelu",
121
+ "hidden_size": 1536,
122
+ "in_channels": 3,
123
+ "in_chans": 3,
124
+ "mlp_ratio": 4,
125
+ "model_type": "qwen2_vl",
126
+ "num_heads": 16,
127
+ "patch_size": 14,
128
+ "spatial_merge_size": 2,
129
+ "spatial_patch_size": 14,
130
+ "temporal_patch_size": 2
131
+ },
132
+ "vision_end_token_id": 151653,
133
+ "vision_start_token_id": 151652,
134
+ "vision_token_id": 151654,
135
+ "vocab_size": 151936
136
+ }
137
+
138
+
139
+ [INFO|2025-04-01 09:07:45] tokenization_utils_base.py:2323 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
140
+
141
+ [INFO|2025-04-01 09:07:45] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
142
+
143
+ [INFO|2025-04-01 09:07:45] configuration_utils.py:771 >> Model config Qwen2VLConfig {
144
+ "architectures": [
145
+ "Qwen2VLForConditionalGeneration"
146
+ ],
147
+ "attention_dropout": 0.0,
148
+ "bos_token_id": 151643,
149
+ "eos_token_id": 151645,
150
+ "hidden_act": "silu",
151
+ "hidden_size": 1536,
152
+ "image_token_id": 151655,
153
+ "initializer_range": 0.02,
154
+ "intermediate_size": 8960,
155
+ "max_position_embeddings": 32768,
156
+ "max_window_layers": 28,
157
+ "model_type": "qwen2_vl",
158
+ "num_attention_heads": 12,
159
+ "num_hidden_layers": 28,
160
+ "num_key_value_heads": 2,
161
+ "pad_token_id": 151654,
162
+ "rms_norm_eps": 1e-06,
163
+ "rope_scaling": {
164
+ "mrope_section": [
165
+ 16,
166
+ 24,
167
+ 24
168
+ ],
169
+ "rope_type": "default",
170
+ "type": "default"
171
+ },
172
+ "rope_theta": 1000000.0,
173
+ "sliding_window": 32768,
174
+ "tie_word_embeddings": true,
175
+ "torch_dtype": "bfloat16",
176
+ "transformers_version": "4.50.0",
177
+ "use_cache": true,
178
+ "use_sliding_window": false,
179
+ "video_token_id": 151656,
180
+ "vision_config": {
181
+ "depth": 32,
182
+ "embed_dim": 1280,
183
+ "hidden_act": "quick_gelu",
184
+ "hidden_size": 1536,
185
+ "in_channels": 3,
186
+ "in_chans": 3,
187
+ "mlp_ratio": 4,
188
+ "model_type": "qwen2_vl",
189
+ "num_heads": 16,
190
+ "patch_size": 14,
191
+ "spatial_merge_size": 2,
192
+ "spatial_patch_size": 14,
193
+ "temporal_patch_size": 2
194
+ },
195
+ "vision_end_token_id": 151653,
196
+ "vision_start_token_id": 151652,
197
+ "vision_token_id": 151654,
198
+ "vocab_size": 151936
199
+ }
200
+
201
+
202
+ [INFO|2025-04-01 09:07:45] tokenization_utils_base.py:2323 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
203
+
204
+ [INFO|2025-04-01 09:07:46] image_processing_base.py:381 >> loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/preprocessor_config.json
205
+
206
+ [INFO|2025-04-01 09:07:46] image_processing_base.py:381 >> loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/preprocessor_config.json
207
+
208
+ [WARNING|2025-04-01 09:07:46] logging.py:329 >> Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
209
+
210
+ [INFO|2025-04-01 09:07:46] logging.py:143 >> Loading dataset /content/drive/MyDrive/dataset.jsonl...
211
+
212
+ [INFO|2025-04-01 09:07:47] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
213
+
214
+ [INFO|2025-04-01 09:07:47] configuration_utils.py:771 >> Model config Qwen2VLConfig {
215
+ "architectures": [
216
+ "Qwen2VLForConditionalGeneration"
217
+ ],
218
+ "attention_dropout": 0.0,
219
+ "bos_token_id": 151643,
220
+ "eos_token_id": 151645,
221
+ "hidden_act": "silu",
222
+ "hidden_size": 1536,
223
+ "image_token_id": 151655,
224
+ "initializer_range": 0.02,
225
+ "intermediate_size": 8960,
226
+ "max_position_embeddings": 32768,
227
+ "max_window_layers": 28,
228
+ "model_type": "qwen2_vl",
229
+ "num_attention_heads": 12,
230
+ "num_hidden_layers": 28,
231
+ "num_key_value_heads": 2,
232
+ "pad_token_id": 151654,
233
+ "rms_norm_eps": 1e-06,
234
+ "rope_scaling": {
235
+ "mrope_section": [
236
+ 16,
237
+ 24,
238
+ 24
239
+ ],
240
+ "rope_type": "default",
241
+ "type": "default"
242
+ },
243
+ "rope_theta": 1000000.0,
244
+ "sliding_window": 32768,
245
+ "tie_word_embeddings": true,
246
+ "torch_dtype": "bfloat16",
247
+ "transformers_version": "4.50.0",
248
+ "use_cache": true,
249
+ "use_sliding_window": false,
250
+ "video_token_id": 151656,
251
+ "vision_config": {
252
+ "depth": 32,
253
+ "embed_dim": 1280,
254
+ "hidden_act": "quick_gelu",
255
+ "hidden_size": 1536,
256
+ "in_channels": 3,
257
+ "in_chans": 3,
258
+ "mlp_ratio": 4,
259
+ "model_type": "qwen2_vl",
260
+ "num_heads": 16,
261
+ "patch_size": 14,
262
+ "spatial_merge_size": 2,
263
+ "spatial_patch_size": 14,
264
+ "temporal_patch_size": 2
265
+ },
266
+ "vision_end_token_id": 151653,
267
+ "vision_start_token_id": 151652,
268
+ "vision_token_id": 151654,
269
+ "vocab_size": 151936
270
+ }
271
+
272
+
273
+ [INFO|2025-04-01 09:07:47] modeling_utils.py:1154 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/model.safetensors
274
+
275
+ [INFO|2025-04-01 09:07:47] modeling_utils.py:2170 >> Instantiating Qwen2VLForConditionalGeneration model under default dtype torch.bfloat16.
276
+
277
+ [INFO|2025-04-01 09:07:47] configuration_utils.py:1139 >> Generate config GenerationConfig {
278
+ "bos_token_id": 151643,
279
+ "eos_token_id": 151645,
280
+ "pad_token_id": 151654
281
+ }
282
+
283
+
284
+ [INFO|2025-04-01 09:07:47] modeling_utils.py:2170 >> Instantiating Qwen2VisionTransformerPretrainedModel model under default dtype torch.bfloat16.
285
+
286
+ [INFO|2025-04-01 09:07:50] modeling_utils.py:4987 >> All model checkpoint weights were used when initializing Qwen2VLForConditionalGeneration.
287
+
288
+
289
+ [INFO|2025-04-01 09:07:50] modeling_utils.py:4995 >> All the weights of Qwen2VLForConditionalGeneration were initialized from the model checkpoint at prithivMLmods/Qwen2-VL-OCR-2B-Instruct.
290
+ If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2VLForConditionalGeneration for predictions without further training.
291
+
292
+ [INFO|2025-04-01 09:07:50] configuration_utils.py:1094 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/generation_config.json
293
+
294
+ [INFO|2025-04-01 09:07:50] configuration_utils.py:1139 >> Generate config GenerationConfig {
295
+ "bos_token_id": 151643,
296
+ "do_sample": true,
297
+ "eos_token_id": [
298
+ 151645,
299
+ 151643
300
+ ],
301
+ "max_length": 32768,
302
+ "pad_token_id": 151654,
303
+ "temperature": 0.01,
304
+ "top_k": 1,
305
+ "top_p": 0.001
306
+ }
307
+
308
+
309
+ [INFO|2025-04-01 09:07:50] logging.py:143 >> Gradient checkpointing enabled.
310
+
311
+ [INFO|2025-04-01 09:07:50] logging.py:143 >> Using torch SDPA for faster training and inference.
312
+
313
+ [INFO|2025-04-01 09:07:50] logging.py:143 >> Upcasting trainable params to float32.
314
+
315
+ [INFO|2025-04-01 09:07:50] logging.py:143 >> Fine-tuning method: LoRA
316
+
317
+ [INFO|2025-04-01 09:07:50] logging.py:143 >> Found linear modules: q_proj,v_proj,gate_proj,down_proj,up_proj,o_proj,k_proj
318
+
319
+ [INFO|2025-04-01 09:07:50] logging.py:143 >> Set vision model not trainable: ['visual.patch_embed', 'visual.blocks'].
320
+
321
+ [INFO|2025-04-01 09:07:50] logging.py:143 >> Set multi model projector not trainable: visual.merger.
322
+
323
+ [INFO|2025-04-01 09:07:51] logging.py:143 >> trainable params: 9,232,384 || all params: 2,218,217,984 || trainable%: 0.4162
324
+
325
+ [INFO|2025-04-01 09:07:51] trainer.py:748 >> Using auto half precision backend
326
+
327
+ [WARNING|2025-04-01 09:07:51] trainer.py:783 >> No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
328
+
329
+ [INFO|2025-04-01 09:07:51] trainer.py:2409 >> ***** Running training *****
330
+
331
+ [INFO|2025-04-01 09:07:51] trainer.py:2410 >> Num examples = 2,658
332
+
333
+ [INFO|2025-04-01 09:07:51] trainer.py:2411 >> Num Epochs = 3
334
+
335
+ [INFO|2025-04-01 09:07:51] trainer.py:2412 >> Instantaneous batch size per device = 1
336
+
337
+ [INFO|2025-04-01 09:07:51] trainer.py:2415 >> Total train batch size (w. parallel, distributed & accumulation) = 8
338
+
339
+ [INFO|2025-04-01 09:07:51] trainer.py:2416 >> Gradient Accumulation steps = 8
340
+
341
+ [INFO|2025-04-01 09:07:51] trainer.py:2417 >> Total optimization steps = 996
342
+
343
+ [INFO|2025-04-01 09:07:51] trainer.py:2418 >> Number of trainable parameters = 9,232,384
344
+
345
+ [INFO|2025-04-01 09:08:45] logging.py:143 >> {'loss': 2.4707, 'learning_rate': 4.9997e-05, 'epoch': 0.02, 'throughput': 108.07}
346
+
347
+ [INFO|2025-04-01 09:09:39] logging.py:143 >> {'loss': 2.2509, 'learning_rate': 4.9988e-05, 'epoch': 0.03, 'throughput': 106.09}
348
+
349
+ [INFO|2025-04-01 09:10:31] logging.py:143 >> {'loss': 1.6895, 'learning_rate': 4.9972e-05, 'epoch': 0.05, 'throughput': 106.20}
350
+
351
+ [INFO|2025-04-01 09:11:24] logging.py:143 >> {'loss': 1.4876, 'learning_rate': 4.9950e-05, 'epoch': 0.06, 'throughput': 107.12}
352
+
353
+ [INFO|2025-04-01 09:12:19] logging.py:143 >> {'loss': 1.4812, 'learning_rate': 4.9922e-05, 'epoch': 0.08, 'throughput': 108.37}
354
+
355
+ [INFO|2025-04-01 09:13:11] logging.py:143 >> {'loss': 1.3642, 'learning_rate': 4.9888e-05, 'epoch': 0.09, 'throughput': 108.89}
356
+
357
+ [INFO|2025-04-01 09:14:06] logging.py:143 >> {'loss': 1.3651, 'learning_rate': 4.9848e-05, 'epoch': 0.11, 'throughput': 109.89}
358
+
359
+ [INFO|2025-04-01 09:15:00] logging.py:143 >> {'loss': 1.1321, 'learning_rate': 4.9801e-05, 'epoch': 0.12, 'throughput': 110.22}
360
+
361
+ [INFO|2025-04-01 09:15:53] logging.py:143 >> {'loss': 1.3012, 'learning_rate': 4.9749e-05, 'epoch': 0.14, 'throughput': 110.28}
362
+
363
+ [INFO|2025-04-01 09:16:46] logging.py:143 >> {'loss': 0.9827, 'learning_rate': 4.9690e-05, 'epoch': 0.15, 'throughput': 110.24}
364
+
365
+ [INFO|2025-04-01 09:17:40] logging.py:143 >> {'loss': 1.2313, 'learning_rate': 4.9625e-05, 'epoch': 0.17, 'throughput': 110.02}
366
+
367
+ [INFO|2025-04-01 09:18:35] logging.py:143 >> {'loss': 1.0347, 'learning_rate': 4.9554e-05, 'epoch': 0.18, 'throughput': 110.10}
368
+
369
+ [INFO|2025-04-01 09:19:28] logging.py:143 >> {'loss': 1.0422, 'learning_rate': 4.9476e-05, 'epoch': 0.20, 'throughput': 110.31}
370
+
371
+ [INFO|2025-04-01 09:20:22] logging.py:143 >> {'loss': 0.9996, 'learning_rate': 4.9393e-05, 'epoch': 0.21, 'throughput': 110.36}
372
+
373
+ [INFO|2025-04-01 09:21:16] logging.py:143 >> {'loss': 1.0755, 'learning_rate': 4.9304e-05, 'epoch': 0.23, 'throughput': 110.41}
374
+
375
+ [INFO|2025-04-01 09:22:07] logging.py:143 >> {'loss': 1.0260, 'learning_rate': 4.9208e-05, 'epoch': 0.24, 'throughput': 110.15}
376
+
377
+ [INFO|2025-04-01 09:22:59] logging.py:143 >> {'loss': 1.1307, 'learning_rate': 4.9107e-05, 'epoch': 0.26, 'throughput': 110.04}
378
+
379
+ [INFO|2025-04-01 09:23:52] logging.py:143 >> {'loss': 1.0221, 'learning_rate': 4.8999e-05, 'epoch': 0.27, 'throughput': 109.93}
380
+
381
+ [INFO|2025-04-01 09:24:43] logging.py:143 >> {'loss': 1.0120, 'learning_rate': 4.8886e-05, 'epoch': 0.29, 'throughput': 109.96}
382
+
383
+ [INFO|2025-04-01 09:25:34] logging.py:143 >> {'loss': 1.0151, 'learning_rate': 4.8767e-05, 'epoch': 0.30, 'throughput': 109.77}
384
+
385
+ [INFO|2025-04-01 09:25:34] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-100
386
+
387
+ [INFO|2025-04-01 09:25:34] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
388
+
389
+ [INFO|2025-04-01 09:25:34] configuration_utils.py:771 >> Model config Qwen2VLConfig {
390
+ "architectures": [
391
+ "Qwen2VLForConditionalGeneration"
392
+ ],
393
+ "attention_dropout": 0.0,
394
+ "bos_token_id": 151643,
395
+ "eos_token_id": 151645,
396
+ "hidden_act": "silu",
397
+ "hidden_size": 1536,
398
+ "image_token_id": 151655,
399
+ "initializer_range": 0.02,
400
+ "intermediate_size": 8960,
401
+ "max_position_embeddings": 32768,
402
+ "max_window_layers": 28,
403
+ "model_type": "qwen2_vl",
404
+ "num_attention_heads": 12,
405
+ "num_hidden_layers": 28,
406
+ "num_key_value_heads": 2,
407
+ "pad_token_id": 151654,
408
+ "rms_norm_eps": 1e-06,
409
+ "rope_scaling": {
410
+ "mrope_section": [
411
+ 16,
412
+ 24,
413
+ 24
414
+ ],
415
+ "rope_type": "default",
416
+ "type": "default"
417
+ },
418
+ "rope_theta": 1000000.0,
419
+ "sliding_window": 32768,
420
+ "tie_word_embeddings": true,
421
+ "torch_dtype": "bfloat16",
422
+ "transformers_version": "4.50.0",
423
+ "use_cache": true,
424
+ "use_sliding_window": false,
425
+ "video_token_id": 151656,
426
+ "vision_config": {
427
+ "depth": 32,
428
+ "embed_dim": 1280,
429
+ "hidden_act": "quick_gelu",
430
+ "hidden_size": 1536,
431
+ "in_channels": 3,
432
+ "in_chans": 3,
433
+ "mlp_ratio": 4,
434
+ "model_type": "qwen2_vl",
435
+ "num_heads": 16,
436
+ "patch_size": 14,
437
+ "spatial_merge_size": 2,
438
+ "spatial_patch_size": 14,
439
+ "temporal_patch_size": 2
440
+ },
441
+ "vision_end_token_id": 151653,
442
+ "vision_start_token_id": 151652,
443
+ "vision_token_id": 151654,
444
+ "vocab_size": 151936
445
+ }
446
+
447
+
448
+ [INFO|2025-04-01 09:25:35] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-100/tokenizer_config.json
449
+
450
+ [INFO|2025-04-01 09:25:35] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-100/special_tokens_map.json
451
+
452
+ [INFO|2025-04-01 09:26:26] logging.py:143 >> {'loss': 1.0028, 'learning_rate': 4.8641e-05, 'epoch': 0.32, 'throughput': 109.30}
453
+
454
+ [INFO|2025-04-01 09:27:18] logging.py:143 >> {'loss': 1.1430, 'learning_rate': 4.8510e-05, 'epoch': 0.33, 'throughput': 109.20}
455
+
456
+ [INFO|2025-04-01 09:28:09] logging.py:143 >> {'loss': 0.9695, 'learning_rate': 4.8373e-05, 'epoch': 0.35, 'throughput': 109.18}
457
+
458
+ [INFO|2025-04-01 09:29:04] logging.py:143 >> {'loss': 0.9017, 'learning_rate': 4.8230e-05, 'epoch': 0.36, 'throughput': 109.34}
459
+
460
+ [INFO|2025-04-01 09:29:56] logging.py:143 >> {'loss': 1.0350, 'learning_rate': 4.8082e-05, 'epoch': 0.38, 'throughput': 109.29}
461
+
462
+ [INFO|2025-04-01 09:30:48] logging.py:143 >> {'loss': 1.0128, 'learning_rate': 4.7928e-05, 'epoch': 0.39, 'throughput': 109.25}
463
+
464
+ [INFO|2025-04-01 09:31:41] logging.py:143 >> {'loss': 0.9432, 'learning_rate': 4.7768e-05, 'epoch': 0.41, 'throughput': 109.32}
465
+
466
+ [INFO|2025-04-01 09:32:35] logging.py:143 >> {'loss': 1.0344, 'learning_rate': 4.7602e-05, 'epoch': 0.42, 'throughput': 109.50}
467
+
468
+ [INFO|2025-04-01 09:33:27] logging.py:143 >> {'loss': 0.9452, 'learning_rate': 4.7431e-05, 'epoch': 0.44, 'throughput': 109.55}
469
+
470
+ [INFO|2025-04-01 09:34:19] logging.py:143 >> {'loss': 0.9559, 'learning_rate': 4.7254e-05, 'epoch': 0.45, 'throughput': 109.53}
471
+
472
+ [INFO|2025-04-01 09:35:12] logging.py:143 >> {'loss': 0.9726, 'learning_rate': 4.7071e-05, 'epoch': 0.47, 'throughput': 109.60}
473
+
474
+ [INFO|2025-04-01 09:36:04] logging.py:143 >> {'loss': 0.9344, 'learning_rate': 4.6883e-05, 'epoch': 0.48, 'throughput': 109.48}
475
+
476
+ [INFO|2025-04-01 09:36:56] logging.py:143 >> {'loss': 0.9497, 'learning_rate': 4.6690e-05, 'epoch': 0.50, 'throughput': 109.40}
477
+
478
+ [INFO|2025-04-01 09:37:52] logging.py:143 >> {'loss': 1.0570, 'learning_rate': 4.6491e-05, 'epoch': 0.51, 'throughput': 109.59}
479
+
480
+ [INFO|2025-04-01 09:38:44] logging.py:143 >> {'loss': 0.9847, 'learning_rate': 4.6287e-05, 'epoch': 0.53, 'throughput': 109.65}
481
+
482
+ [INFO|2025-04-01 09:39:36] logging.py:143 >> {'loss': 1.0010, 'learning_rate': 4.6078e-05, 'epoch': 0.54, 'throughput': 109.52}
483
+
484
+ [INFO|2025-04-01 09:40:29] logging.py:143 >> {'loss': 0.9384, 'learning_rate': 4.5863e-05, 'epoch': 0.56, 'throughput': 109.56}
485
+
486
+ [INFO|2025-04-01 09:41:23] logging.py:143 >> {'loss': 1.0312, 'learning_rate': 4.5643e-05, 'epoch': 0.57, 'throughput': 109.68}
487
+
488
+ [INFO|2025-04-01 09:42:16] logging.py:143 >> {'loss': 0.9112, 'learning_rate': 4.5418e-05, 'epoch': 0.59, 'throughput': 109.70}
489
+
490
+ [INFO|2025-04-01 09:43:08] logging.py:143 >> {'loss': 0.9967, 'learning_rate': 4.5188e-05, 'epoch': 0.60, 'throughput': 109.66}
491
+
492
+ [INFO|2025-04-01 09:43:08] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-200
493
+
494
+ [INFO|2025-04-01 09:43:08] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
495
+
496
+ [INFO|2025-04-01 09:43:08] configuration_utils.py:771 >> Model config Qwen2VLConfig {
497
+ "architectures": [
498
+ "Qwen2VLForConditionalGeneration"
499
+ ],
500
+ "attention_dropout": 0.0,
501
+ "bos_token_id": 151643,
502
+ "eos_token_id": 151645,
503
+ "hidden_act": "silu",
504
+ "hidden_size": 1536,
505
+ "image_token_id": 151655,
506
+ "initializer_range": 0.02,
507
+ "intermediate_size": 8960,
508
+ "max_position_embeddings": 32768,
509
+ "max_window_layers": 28,
510
+ "model_type": "qwen2_vl",
511
+ "num_attention_heads": 12,
512
+ "num_hidden_layers": 28,
513
+ "num_key_value_heads": 2,
514
+ "pad_token_id": 151654,
515
+ "rms_norm_eps": 1e-06,
516
+ "rope_scaling": {
517
+ "mrope_section": [
518
+ 16,
519
+ 24,
520
+ 24
521
+ ],
522
+ "rope_type": "default",
523
+ "type": "default"
524
+ },
525
+ "rope_theta": 1000000.0,
526
+ "sliding_window": 32768,
527
+ "tie_word_embeddings": true,
528
+ "torch_dtype": "bfloat16",
529
+ "transformers_version": "4.50.0",
530
+ "use_cache": true,
531
+ "use_sliding_window": false,
532
+ "video_token_id": 151656,
533
+ "vision_config": {
534
+ "depth": 32,
535
+ "embed_dim": 1280,
536
+ "hidden_act": "quick_gelu",
537
+ "hidden_size": 1536,
538
+ "in_channels": 3,
539
+ "in_chans": 3,
540
+ "mlp_ratio": 4,
541
+ "model_type": "qwen2_vl",
542
+ "num_heads": 16,
543
+ "patch_size": 14,
544
+ "spatial_merge_size": 2,
545
+ "spatial_patch_size": 14,
546
+ "temporal_patch_size": 2
547
+ },
548
+ "vision_end_token_id": 151653,
549
+ "vision_start_token_id": 151652,
550
+ "vision_token_id": 151654,
551
+ "vocab_size": 151936
552
+ }
553
+
554
+
555
+ [INFO|2025-04-01 09:43:09] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-200/tokenizer_config.json
556
+
557
+ [INFO|2025-04-01 09:43:09] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-200/special_tokens_map.json
558
+
559
+ [INFO|2025-04-01 09:44:02] logging.py:143 >> {'loss': 1.0905, 'learning_rate': 4.4953e-05, 'epoch': 0.62, 'throughput': 109.64}
560
+
561
+ [INFO|2025-04-01 09:44:55] logging.py:143 >> {'loss': 0.9487, 'learning_rate': 4.4713e-05, 'epoch': 0.63, 'throughput': 109.56}
562
+
563
+ [INFO|2025-04-01 09:45:47] logging.py:143 >> {'loss': 0.8675, 'learning_rate': 4.4468e-05, 'epoch': 0.65, 'throughput': 109.56}
564
+
565
+ [INFO|2025-04-01 09:46:40] logging.py:143 >> {'loss': 0.8624, 'learning_rate': 4.4219e-05, 'epoch': 0.66, 'throughput': 109.61}
566
+
567
+ [INFO|2025-04-01 09:47:35] logging.py:143 >> {'loss': 1.0489, 'learning_rate': 4.3964e-05, 'epoch': 0.68, 'throughput': 109.74}
568
+
569
+ [INFO|2025-04-01 09:48:27] logging.py:143 >> {'loss': 0.9139, 'learning_rate': 4.3705e-05, 'epoch': 0.69, 'throughput': 109.68}
570
+
571
+ [INFO|2025-04-01 09:49:19] logging.py:143 >> {'loss': 0.9905, 'learning_rate': 4.3441e-05, 'epoch': 0.71, 'throughput': 109.62}
572
+
573
+ [INFO|2025-04-01 09:50:13] logging.py:143 >> {'loss': 0.8974, 'learning_rate': 4.3172e-05, 'epoch': 0.72, 'throughput': 109.66}
574
+
575
+ [INFO|2025-04-01 09:51:06] logging.py:143 >> {'loss': 0.9990, 'learning_rate': 4.2899e-05, 'epoch': 0.74, 'throughput': 109.68}
576
+
577
+ [INFO|2025-04-01 09:51:59] logging.py:143 >> {'loss': 0.9916, 'learning_rate': 4.2622e-05, 'epoch': 0.75, 'throughput': 109.66}
578
+
579
+ [INFO|2025-04-01 09:52:51] logging.py:143 >> {'loss': 0.9242, 'learning_rate': 4.2340e-05, 'epoch': 0.77, 'throughput': 109.54}
580
+
581
+ [INFO|2025-04-01 09:53:45] logging.py:143 >> {'loss': 1.0426, 'learning_rate': 4.2054e-05, 'epoch': 0.78, 'throughput': 109.56}
582
+
583
+ [INFO|2025-04-01 09:54:37] logging.py:143 >> {'loss': 0.8625, 'learning_rate': 4.1763e-05, 'epoch': 0.80, 'throughput': 109.50}
584
+
585
+ [INFO|2025-04-01 09:55:30] logging.py:143 >> {'loss': 0.9959, 'learning_rate': 4.1469e-05, 'epoch': 0.81, 'throughput': 109.51}
586
+
587
+ [INFO|2025-04-01 09:56:23] logging.py:143 >> {'loss': 0.9390, 'learning_rate': 4.1170e-05, 'epoch': 0.83, 'throughput': 109.57}
588
+
589
+ [INFO|2025-04-01 09:57:16] logging.py:143 >> {'loss': 0.9741, 'learning_rate': 4.0867e-05, 'epoch': 0.84, 'throughput': 109.61}
590
+
591
+ [INFO|2025-04-01 09:58:10] logging.py:143 >> {'loss': 0.9800, 'learning_rate': 4.0561e-05, 'epoch': 0.86, 'throughput': 109.62}
592
+
593
+ [INFO|2025-04-01 09:59:02] logging.py:143 >> {'loss': 0.8898, 'learning_rate': 4.0250e-05, 'epoch': 0.87, 'throughput': 109.54}
594
+
595
+ [INFO|2025-04-01 09:59:57] logging.py:143 >> {'loss': 0.9530, 'learning_rate': 3.9936e-05, 'epoch': 0.89, 'throughput': 109.67}
596
+
597
+ [INFO|2025-04-01 10:00:50] logging.py:143 >> {'loss': 0.9311, 'learning_rate': 3.9618e-05, 'epoch': 0.90, 'throughput': 109.75}
598
+
599
+ [INFO|2025-04-01 10:00:50] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-300
600
+
601
+ [INFO|2025-04-01 10:00:51] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
602
+
603
+ [INFO|2025-04-01 10:00:51] configuration_utils.py:771 >> Model config Qwen2VLConfig {
604
+ "architectures": [
605
+ "Qwen2VLForConditionalGeneration"
606
+ ],
607
+ "attention_dropout": 0.0,
608
+ "bos_token_id": 151643,
609
+ "eos_token_id": 151645,
610
+ "hidden_act": "silu",
611
+ "hidden_size": 1536,
612
+ "image_token_id": 151655,
613
+ "initializer_range": 0.02,
614
+ "intermediate_size": 8960,
615
+ "max_position_embeddings": 32768,
616
+ "max_window_layers": 28,
617
+ "model_type": "qwen2_vl",
618
+ "num_attention_heads": 12,
619
+ "num_hidden_layers": 28,
620
+ "num_key_value_heads": 2,
621
+ "pad_token_id": 151654,
622
+ "rms_norm_eps": 1e-06,
623
+ "rope_scaling": {
624
+ "mrope_section": [
625
+ 16,
626
+ 24,
627
+ 24
628
+ ],
629
+ "rope_type": "default",
630
+ "type": "default"
631
+ },
632
+ "rope_theta": 1000000.0,
633
+ "sliding_window": 32768,
634
+ "tie_word_embeddings": true,
635
+ "torch_dtype": "bfloat16",
636
+ "transformers_version": "4.50.0",
637
+ "use_cache": true,
638
+ "use_sliding_window": false,
639
+ "video_token_id": 151656,
640
+ "vision_config": {
641
+ "depth": 32,
642
+ "embed_dim": 1280,
643
+ "hidden_act": "quick_gelu",
644
+ "hidden_size": 1536,
645
+ "in_channels": 3,
646
+ "in_chans": 3,
647
+ "mlp_ratio": 4,
648
+ "model_type": "qwen2_vl",
649
+ "num_heads": 16,
650
+ "patch_size": 14,
651
+ "spatial_merge_size": 2,
652
+ "spatial_patch_size": 14,
653
+ "temporal_patch_size": 2
654
+ },
655
+ "vision_end_token_id": 151653,
656
+ "vision_start_token_id": 151652,
657
+ "vision_token_id": 151654,
658
+ "vocab_size": 151936
659
+ }
660
+
661
+
662
+ [INFO|2025-04-01 10:00:51] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-300/tokenizer_config.json
663
+
664
+ [INFO|2025-04-01 10:00:51] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-300/special_tokens_map.json
665
+
666
+ [INFO|2025-04-01 10:01:43] logging.py:143 >> {'loss': 0.9114, 'learning_rate': 3.9296e-05, 'epoch': 0.92, 'throughput': 109.73}
667
+
668
+ [INFO|2025-04-01 10:02:37] logging.py:143 >> {'loss': 0.9674, 'learning_rate': 3.8971e-05, 'epoch': 0.93, 'throughput': 109.85}
669
+
670
+ [INFO|2025-04-01 10:03:30] logging.py:143 >> {'loss': 0.9582, 'learning_rate': 3.8642e-05, 'epoch': 0.95, 'throughput': 109.86}
671
+
672
+ [INFO|2025-04-01 10:04:22] logging.py:143 >> {'loss': 0.9863, 'learning_rate': 3.8310e-05, 'epoch': 0.96, 'throughput': 109.84}
673
+
674
+ [INFO|2025-04-01 10:05:16] logging.py:143 >> {'loss': 0.9060, 'learning_rate': 3.7975e-05, 'epoch': 0.98, 'throughput': 109.89}
675
+
676
+ [INFO|2025-04-01 10:06:09] logging.py:143 >> {'loss': 0.8958, 'learning_rate': 3.7636e-05, 'epoch': 0.99, 'throughput': 109.87}
677
+
678
+ [INFO|2025-04-01 10:06:54] logging.py:143 >> {'loss': 0.8349, 'learning_rate': 3.7295e-05, 'epoch': 1.01, 'throughput': 109.89}
679
+
680
+ [INFO|2025-04-01 10:07:46] logging.py:143 >> {'loss': 0.8507, 'learning_rate': 3.6950e-05, 'epoch': 1.02, 'throughput': 109.86}
681
+
682
+ [INFO|2025-04-01 10:08:39] logging.py:143 >> {'loss': 0.9287, 'learning_rate': 3.6602e-05, 'epoch': 1.04, 'throughput': 109.88}
683
+
684
+ [INFO|2025-04-01 10:09:32] logging.py:143 >> {'loss': 0.9107, 'learning_rate': 3.6251e-05, 'epoch': 1.05, 'throughput': 109.91}
685
+
686
+ [INFO|2025-04-01 10:10:25] logging.py:143 >> {'loss': 0.9520, 'learning_rate': 3.5898e-05, 'epoch': 1.07, 'throughput': 109.93}
687
+
688
+ [INFO|2025-04-01 10:11:18] logging.py:143 >> {'loss': 0.9526, 'learning_rate': 3.5542e-05, 'epoch': 1.08, 'throughput': 109.92}
689
+
690
+ [INFO|2025-04-01 10:12:11] logging.py:143 >> {'loss': 0.8775, 'learning_rate': 3.5183e-05, 'epoch': 1.10, 'throughput': 109.90}
691
+
692
+ [INFO|2025-04-01 10:13:05] logging.py:143 >> {'loss': 0.8830, 'learning_rate': 3.4821e-05, 'epoch': 1.11, 'throughput': 109.94}
693
+
694
+ [INFO|2025-04-01 10:13:58] logging.py:143 >> {'loss': 1.0032, 'learning_rate': 3.4458e-05, 'epoch': 1.13, 'throughput': 109.93}
695
+
696
+ [INFO|2025-04-01 10:14:51] logging.py:143 >> {'loss': 0.9430, 'learning_rate': 3.4092e-05, 'epoch': 1.14, 'throughput': 109.94}
697
+
698
+ [INFO|2025-04-01 10:15:43] logging.py:143 >> {'loss': 0.8010, 'learning_rate': 3.3723e-05, 'epoch': 1.16, 'throughput': 109.91}
699
+
700
+ [INFO|2025-04-01 10:16:35] logging.py:143 >> {'loss': 0.9294, 'learning_rate': 3.3353e-05, 'epoch': 1.17, 'throughput': 109.91}
701
+
702
+ [INFO|2025-04-01 10:17:28] logging.py:143 >> {'loss': 0.9528, 'learning_rate': 3.2980e-05, 'epoch': 1.19, 'throughput': 109.96}
703
+
704
+ [INFO|2025-04-01 10:18:20] logging.py:143 >> {'loss': 0.8981, 'learning_rate': 3.2605e-05, 'epoch': 1.20, 'throughput': 109.93}
705
+
706
+ [INFO|2025-04-01 10:18:21] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-400
707
+
708
+ [INFO|2025-04-01 10:18:21] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
709
+
710
+ [INFO|2025-04-01 10:18:21] configuration_utils.py:771 >> Model config Qwen2VLConfig {
711
+ "architectures": [
712
+ "Qwen2VLForConditionalGeneration"
713
+ ],
714
+ "attention_dropout": 0.0,
715
+ "bos_token_id": 151643,
716
+ "eos_token_id": 151645,
717
+ "hidden_act": "silu",
718
+ "hidden_size": 1536,
719
+ "image_token_id": 151655,
720
+ "initializer_range": 0.02,
721
+ "intermediate_size": 8960,
722
+ "max_position_embeddings": 32768,
723
+ "max_window_layers": 28,
724
+ "model_type": "qwen2_vl",
725
+ "num_attention_heads": 12,
726
+ "num_hidden_layers": 28,
727
+ "num_key_value_heads": 2,
728
+ "pad_token_id": 151654,
729
+ "rms_norm_eps": 1e-06,
730
+ "rope_scaling": {
731
+ "mrope_section": [
732
+ 16,
733
+ 24,
734
+ 24
735
+ ],
736
+ "rope_type": "default",
737
+ "type": "default"
738
+ },
739
+ "rope_theta": 1000000.0,
740
+ "sliding_window": 32768,
741
+ "tie_word_embeddings": true,
742
+ "torch_dtype": "bfloat16",
743
+ "transformers_version": "4.50.0",
744
+ "use_cache": true,
745
+ "use_sliding_window": false,
746
+ "video_token_id": 151656,
747
+ "vision_config": {
748
+ "depth": 32,
749
+ "embed_dim": 1280,
750
+ "hidden_act": "quick_gelu",
751
+ "hidden_size": 1536,
752
+ "in_channels": 3,
753
+ "in_chans": 3,
754
+ "mlp_ratio": 4,
755
+ "model_type": "qwen2_vl",
756
+ "num_heads": 16,
757
+ "patch_size": 14,
758
+ "spatial_merge_size": 2,
759
+ "spatial_patch_size": 14,
760
+ "temporal_patch_size": 2
761
+ },
762
+ "vision_end_token_id": 151653,
763
+ "vision_start_token_id": 151652,
764
+ "vision_token_id": 151654,
765
+ "vocab_size": 151936
766
+ }
767
+
768
+
769
+ [INFO|2025-04-01 10:18:21] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-400/tokenizer_config.json
770
+
771
+ [INFO|2025-04-01 10:18:21] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-400/special_tokens_map.json
772
+
773
+ [INFO|2025-04-01 10:19:15] logging.py:143 >> {'loss': 0.9823, 'learning_rate': 3.2229e-05, 'epoch': 1.22, 'throughput': 109.94}
774
+
775
+ [INFO|2025-04-01 10:20:06] logging.py:143 >> {'loss': 0.9047, 'learning_rate': 3.1850e-05, 'epoch': 1.23, 'throughput': 109.86}
776
+
777
+ [INFO|2025-04-01 10:20:58] logging.py:143 >> {'loss': 0.8582, 'learning_rate': 3.1470e-05, 'epoch': 1.25, 'throughput': 109.84}
778
+
779
+ [INFO|2025-04-01 10:21:50] logging.py:143 >> {'loss': 0.8787, 'learning_rate': 3.1089e-05, 'epoch': 1.26, 'throughput': 109.85}
780
+
781
+ [INFO|2025-04-01 10:22:42] logging.py:143 >> {'loss': 0.8729, 'learning_rate': 3.0706e-05, 'epoch': 1.28, 'throughput': 109.85}
782
+
783
+ [INFO|2025-04-01 10:23:37] logging.py:143 >> {'loss': 0.8772, 'learning_rate': 3.0321e-05, 'epoch': 1.29, 'throughput': 109.95}
784
+
785
+ [INFO|2025-04-01 10:24:28] logging.py:143 >> {'loss': 0.9451, 'learning_rate': 2.9935e-05, 'epoch': 1.31, 'throughput': 109.94}
786
+
787
+ [INFO|2025-04-01 10:25:22] logging.py:143 >> {'loss': 0.8202, 'learning_rate': 2.9548e-05, 'epoch': 1.32, 'throughput': 110.03}
788
+
789
+ [INFO|2025-04-01 10:26:14] logging.py:143 >> {'loss': 0.9773, 'learning_rate': 2.9160e-05, 'epoch': 1.34, 'throughput': 110.05}
790
+
791
+ [INFO|2025-04-01 10:27:04] logging.py:143 >> {'loss': 0.9101, 'learning_rate': 2.8771e-05, 'epoch': 1.35, 'throughput': 109.98}
792
+
793
+ [INFO|2025-04-01 10:27:57] logging.py:143 >> {'loss': 0.9633, 'learning_rate': 2.8380e-05, 'epoch': 1.37, 'throughput': 110.01}
794
+
795
+ [INFO|2025-04-01 10:28:47] logging.py:143 >> {'loss': 0.8886, 'learning_rate': 2.7989e-05, 'epoch': 1.38, 'throughput': 109.96}
796
+
797
+ [INFO|2025-04-01 10:29:39] logging.py:143 >> {'loss': 0.9258, 'learning_rate': 2.7598e-05, 'epoch': 1.40, 'throughput': 109.96}
798
+
799
+ [INFO|2025-04-01 10:30:30] logging.py:143 >> {'loss': 0.9039, 'learning_rate': 2.7205e-05, 'epoch': 1.41, 'throughput': 109.93}
800
+
801
+ [INFO|2025-04-01 10:31:24] logging.py:143 >> {'loss': 1.0116, 'learning_rate': 2.6812e-05, 'epoch': 1.43, 'throughput': 109.99}
802
+
803
+ [INFO|2025-04-01 10:32:14] logging.py:143 >> {'loss': 0.8218, 'learning_rate': 2.6419e-05, 'epoch': 1.44, 'throughput': 109.97}
804
+
805
+ [INFO|2025-04-01 10:33:07] logging.py:143 >> {'loss': 0.8604, 'learning_rate': 2.6025e-05, 'epoch': 1.46, 'throughput': 109.96}
806
+
807
+ [INFO|2025-04-01 10:33:59] logging.py:143 >> {'loss': 0.8044, 'learning_rate': 2.5631e-05, 'epoch': 1.47, 'throughput': 110.00}
808
+
809
+ [INFO|2025-04-01 10:34:51] logging.py:143 >> {'loss': 0.9198, 'learning_rate': 2.5237e-05, 'epoch': 1.49, 'throughput': 109.98}
810
+
811
+ [INFO|2025-04-01 10:35:45] logging.py:143 >> {'loss': 0.9181, 'learning_rate': 2.4842e-05, 'epoch': 1.50, 'throughput': 109.99}
812
+
813
+ [INFO|2025-04-01 10:35:45] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-500
814
+
815
+ [INFO|2025-04-01 10:35:45] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
816
+
817
+ [INFO|2025-04-01 10:35:45] configuration_utils.py:771 >> Model config Qwen2VLConfig {
818
+ "architectures": [
819
+ "Qwen2VLForConditionalGeneration"
820
+ ],
821
+ "attention_dropout": 0.0,
822
+ "bos_token_id": 151643,
823
+ "eos_token_id": 151645,
824
+ "hidden_act": "silu",
825
+ "hidden_size": 1536,
826
+ "image_token_id": 151655,
827
+ "initializer_range": 0.02,
828
+ "intermediate_size": 8960,
829
+ "max_position_embeddings": 32768,
830
+ "max_window_layers": 28,
831
+ "model_type": "qwen2_vl",
832
+ "num_attention_heads": 12,
833
+ "num_hidden_layers": 28,
834
+ "num_key_value_heads": 2,
835
+ "pad_token_id": 151654,
836
+ "rms_norm_eps": 1e-06,
837
+ "rope_scaling": {
838
+ "mrope_section": [
839
+ 16,
840
+ 24,
841
+ 24
842
+ ],
843
+ "rope_type": "default",
844
+ "type": "default"
845
+ },
846
+ "rope_theta": 1000000.0,
847
+ "sliding_window": 32768,
848
+ "tie_word_embeddings": true,
849
+ "torch_dtype": "bfloat16",
850
+ "transformers_version": "4.50.0",
851
+ "use_cache": true,
852
+ "use_sliding_window": false,
853
+ "video_token_id": 151656,
854
+ "vision_config": {
855
+ "depth": 32,
856
+ "embed_dim": 1280,
857
+ "hidden_act": "quick_gelu",
858
+ "hidden_size": 1536,
859
+ "in_channels": 3,
860
+ "in_chans": 3,
861
+ "mlp_ratio": 4,
862
+ "model_type": "qwen2_vl",
863
+ "num_heads": 16,
864
+ "patch_size": 14,
865
+ "spatial_merge_size": 2,
866
+ "spatial_patch_size": 14,
867
+ "temporal_patch_size": 2
868
+ },
869
+ "vision_end_token_id": 151653,
870
+ "vision_start_token_id": 151652,
871
+ "vision_token_id": 151654,
872
+ "vocab_size": 151936
873
+ }
874
+
875
+
876
+ [INFO|2025-04-01 10:35:45] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-500/tokenizer_config.json
877
+
878
+ [INFO|2025-04-01 10:35:45] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-500/special_tokens_map.json
879
+
880
+ [INFO|2025-04-01 10:36:39] logging.py:143 >> {'loss': 0.8644, 'learning_rate': 2.4448e-05, 'epoch': 1.52, 'throughput': 110.01}
881
+
882
+ [INFO|2025-04-01 10:37:33] logging.py:143 >> {'loss': 1.0127, 'learning_rate': 2.4054e-05, 'epoch': 1.53, 'throughput': 110.05}
883
+
884
+ [INFO|2025-04-01 10:38:24] logging.py:143 >> {'loss': 0.7937, 'learning_rate': 2.3660e-05, 'epoch': 1.55, 'throughput': 110.02}
885
+
886
+ [INFO|2025-04-01 10:39:15] logging.py:143 >> {'loss': 0.9806, 'learning_rate': 2.3267e-05, 'epoch': 1.56, 'throughput': 110.01}
887
+
888
+ [INFO|2025-04-01 10:40:09] logging.py:143 >> {'loss': 0.9340, 'learning_rate': 2.2873e-05, 'epoch': 1.58, 'throughput': 110.08}
889
+
890
+ [INFO|2025-04-01 10:41:02] logging.py:143 >> {'loss': 0.9288, 'learning_rate': 2.2481e-05, 'epoch': 1.59, 'throughput': 110.09}
891
+
892
+ [INFO|2025-04-01 10:41:55] logging.py:143 >> {'loss': 0.8597, 'learning_rate': 2.2089e-05, 'epoch': 1.61, 'throughput': 110.07}
893
+
894
+ [INFO|2025-04-01 10:42:48] logging.py:143 >> {'loss': 0.8817, 'learning_rate': 2.1698e-05, 'epoch': 1.62, 'throughput': 110.10}
895
+
896
+ [INFO|2025-04-01 10:43:41] logging.py:143 >> {'loss': 0.7770, 'learning_rate': 2.1307e-05, 'epoch': 1.64, 'throughput': 110.13}
897
+
898
+ [INFO|2025-04-01 10:44:34] logging.py:143 >> {'loss': 0.7980, 'learning_rate': 2.0918e-05, 'epoch': 1.65, 'throughput': 110.11}
899
+
900
+ [INFO|2025-04-01 10:45:28] logging.py:143 >> {'loss': 0.9104, 'learning_rate': 2.0529e-05, 'epoch': 1.67, 'throughput': 110.17}
901
+
902
+ [INFO|2025-04-01 10:46:22] logging.py:143 >> {'loss': 0.8293, 'learning_rate': 2.0142e-05, 'epoch': 1.68, 'throughput': 110.26}
903
+
904
+ [INFO|2025-04-01 10:47:14] logging.py:143 >> {'loss': 0.8821, 'learning_rate': 1.9756e-05, 'epoch': 1.70, 'throughput': 110.23}
905
+
906
+ [INFO|2025-04-01 10:48:07] logging.py:143 >> {'loss': 0.8253, 'learning_rate': 1.9371e-05, 'epoch': 1.71, 'throughput': 110.27}
907
+
908
+ [INFO|2025-04-01 10:48:59] logging.py:143 >> {'loss': 0.9391, 'learning_rate': 1.8988e-05, 'epoch': 1.73, 'throughput': 110.25}
909
+
910
+ [INFO|2025-04-01 10:49:52] logging.py:143 >> {'loss': 0.8711, 'learning_rate': 1.8606e-05, 'epoch': 1.74, 'throughput': 110.28}
911
+
912
+ [INFO|2025-04-01 10:50:44] logging.py:143 >> {'loss': 0.8346, 'learning_rate': 1.8225e-05, 'epoch': 1.76, 'throughput': 110.26}
913
+
914
+ [INFO|2025-04-01 10:51:36] logging.py:143 >> {'loss': 0.8275, 'learning_rate': 1.7847e-05, 'epoch': 1.77, 'throughput': 110.27}
915
+
916
+ [INFO|2025-04-01 10:52:27] logging.py:143 >> {'loss': 0.9435, 'learning_rate': 1.7470e-05, 'epoch': 1.79, 'throughput': 110.27}
917
+
918
+ [INFO|2025-04-01 10:53:20] logging.py:143 >> {'loss': 0.8584, 'learning_rate': 1.7095e-05, 'epoch': 1.80, 'throughput': 110.29}
919
+
920
+ [INFO|2025-04-01 10:53:20] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-600
921
+
922
+ [INFO|2025-04-01 10:53:20] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
923
+
924
+ [INFO|2025-04-01 10:53:20] configuration_utils.py:771 >> Model config Qwen2VLConfig {
925
+ "architectures": [
926
+ "Qwen2VLForConditionalGeneration"
927
+ ],
928
+ "attention_dropout": 0.0,
929
+ "bos_token_id": 151643,
930
+ "eos_token_id": 151645,
931
+ "hidden_act": "silu",
932
+ "hidden_size": 1536,
933
+ "image_token_id": 151655,
934
+ "initializer_range": 0.02,
935
+ "intermediate_size": 8960,
936
+ "max_position_embeddings": 32768,
937
+ "max_window_layers": 28,
938
+ "model_type": "qwen2_vl",
939
+ "num_attention_heads": 12,
940
+ "num_hidden_layers": 28,
941
+ "num_key_value_heads": 2,
942
+ "pad_token_id": 151654,
943
+ "rms_norm_eps": 1e-06,
944
+ "rope_scaling": {
945
+ "mrope_section": [
946
+ 16,
947
+ 24,
948
+ 24
949
+ ],
950
+ "rope_type": "default",
951
+ "type": "default"
952
+ },
953
+ "rope_theta": 1000000.0,
954
+ "sliding_window": 32768,
955
+ "tie_word_embeddings": true,
956
+ "torch_dtype": "bfloat16",
957
+ "transformers_version": "4.50.0",
958
+ "use_cache": true,
959
+ "use_sliding_window": false,
960
+ "video_token_id": 151656,
961
+ "vision_config": {
962
+ "depth": 32,
963
+ "embed_dim": 1280,
964
+ "hidden_act": "quick_gelu",
965
+ "hidden_size": 1536,
966
+ "in_channels": 3,
967
+ "in_chans": 3,
968
+ "mlp_ratio": 4,
969
+ "model_type": "qwen2_vl",
970
+ "num_heads": 16,
971
+ "patch_size": 14,
972
+ "spatial_merge_size": 2,
973
+ "spatial_patch_size": 14,
974
+ "temporal_patch_size": 2
975
+ },
976
+ "vision_end_token_id": 151653,
977
+ "vision_start_token_id": 151652,
978
+ "vision_token_id": 151654,
979
+ "vocab_size": 151936
980
+ }
981
+
982
+
983
+ [INFO|2025-04-01 10:53:21] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-600/tokenizer_config.json
984
+
985
+ [INFO|2025-04-01 10:53:21] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-600/special_tokens_map.json
986
+
987
+ [INFO|2025-04-01 10:54:14] logging.py:143 >> {'loss': 0.8800, 'learning_rate': 1.6722e-05, 'epoch': 1.82, 'throughput': 110.30}
988
+
989
+ [INFO|2025-04-01 10:55:06] logging.py:143 >> {'loss': 0.8825, 'learning_rate': 1.6351e-05, 'epoch': 1.83, 'throughput': 110.30}
990
+
991
+ [INFO|2025-04-01 10:56:00] logging.py:143 >> {'loss': 0.9978, 'learning_rate': 1.5982e-05, 'epoch': 1.85, 'throughput': 110.33}
992
+
993
+ [INFO|2025-04-01 10:56:54] logging.py:143 >> {'loss': 0.9626, 'learning_rate': 1.5615e-05, 'epoch': 1.86, 'throughput': 110.38}
994
+
995
+ [INFO|2025-04-01 10:57:45] logging.py:143 >> {'loss': 0.9308, 'learning_rate': 1.5251e-05, 'epoch': 1.88, 'throughput': 110.37}
996
+
997
+ [INFO|2025-04-01 10:58:39] logging.py:143 >> {'loss': 0.9757, 'learning_rate': 1.4889e-05, 'epoch': 1.89, 'throughput': 110.38}
998
+
999
+ [INFO|2025-04-01 10:59:33] logging.py:143 >> {'loss': 0.7670, 'learning_rate': 1.4530e-05, 'epoch': 1.91, 'throughput': 110.43}
1000
+
1001
+ [INFO|2025-04-01 11:00:23] logging.py:143 >> {'loss': 0.9272, 'learning_rate': 1.4173e-05, 'epoch': 1.92, 'throughput': 110.40}
1002
+
1003
+ [INFO|2025-04-01 11:01:16] logging.py:143 >> {'loss': 0.7941, 'learning_rate': 1.3819e-05, 'epoch': 1.94, 'throughput': 110.41}
1004
+
1005
+ [INFO|2025-04-01 11:02:08] logging.py:143 >> {'loss': 0.8408, 'learning_rate': 1.3468e-05, 'epoch': 1.95, 'throughput': 110.38}
1006
+
1007
+ [INFO|2025-04-01 11:03:00] logging.py:143 >> {'loss': 0.8459, 'learning_rate': 1.3120e-05, 'epoch': 1.97, 'throughput': 110.35}
1008
+
1009
+ [INFO|2025-04-01 11:03:52] logging.py:143 >> {'loss': 1.0117, 'learning_rate': 1.2774e-05, 'epoch': 1.98, 'throughput': 110.36}
1010
+
1011
+ [INFO|2025-04-01 11:04:43] logging.py:143 >> {'loss': 0.9665, 'learning_rate': 1.2432e-05, 'epoch': 2.00, 'throughput': 110.32}
1012
+
1013
+ [INFO|2025-04-01 11:05:28] logging.py:143 >> {'loss': 0.7625, 'learning_rate': 1.2093e-05, 'epoch': 2.01, 'throughput': 110.34}
1014
+
1015
+ [INFO|2025-04-01 11:06:21] logging.py:143 >> {'loss': 0.8667, 'learning_rate': 1.1756e-05, 'epoch': 2.03, 'throughput': 110.33}
1016
+
1017
+ [INFO|2025-04-01 11:07:14] logging.py:143 >> {'loss': 0.8297, 'learning_rate': 1.1424e-05, 'epoch': 2.04, 'throughput': 110.34}
1018
+
1019
+ [INFO|2025-04-01 11:08:06] logging.py:143 >> {'loss': 0.8774, 'learning_rate': 1.1094e-05, 'epoch': 2.06, 'throughput': 110.35}
1020
+
1021
+ [INFO|2025-04-01 11:08:59] logging.py:143 >> {'loss': 0.8476, 'learning_rate': 1.0768e-05, 'epoch': 2.07, 'throughput': 110.37}
1022
+
1023
+ [INFO|2025-04-01 11:09:51] logging.py:143 >> {'loss': 0.8641, 'learning_rate': 1.0446e-05, 'epoch': 2.09, 'throughput': 110.35}
1024
+
1025
+ [INFO|2025-04-01 11:10:44] logging.py:143 >> {'loss': 0.8383, 'learning_rate': 1.0127e-05, 'epoch': 2.10, 'throughput': 110.36}
1026
+
1027
+ [INFO|2025-04-01 11:10:44] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-700
1028
+
1029
+ [INFO|2025-04-01 11:10:45] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
1030
+
1031
+ [INFO|2025-04-01 11:10:45] configuration_utils.py:771 >> Model config Qwen2VLConfig {
1032
+ "architectures": [
1033
+ "Qwen2VLForConditionalGeneration"
1034
+ ],
1035
+ "attention_dropout": 0.0,
1036
+ "bos_token_id": 151643,
1037
+ "eos_token_id": 151645,
1038
+ "hidden_act": "silu",
1039
+ "hidden_size": 1536,
1040
+ "image_token_id": 151655,
1041
+ "initializer_range": 0.02,
1042
+ "intermediate_size": 8960,
1043
+ "max_position_embeddings": 32768,
1044
+ "max_window_layers": 28,
1045
+ "model_type": "qwen2_vl",
1046
+ "num_attention_heads": 12,
1047
+ "num_hidden_layers": 28,
1048
+ "num_key_value_heads": 2,
1049
+ "pad_token_id": 151654,
1050
+ "rms_norm_eps": 1e-06,
1051
+ "rope_scaling": {
1052
+ "mrope_section": [
1053
+ 16,
1054
+ 24,
1055
+ 24
1056
+ ],
1057
+ "rope_type": "default",
1058
+ "type": "default"
1059
+ },
1060
+ "rope_theta": 1000000.0,
1061
+ "sliding_window": 32768,
1062
+ "tie_word_embeddings": true,
1063
+ "torch_dtype": "bfloat16",
1064
+ "transformers_version": "4.50.0",
1065
+ "use_cache": true,
1066
+ "use_sliding_window": false,
1067
+ "video_token_id": 151656,
1068
+ "vision_config": {
1069
+ "depth": 32,
1070
+ "embed_dim": 1280,
1071
+ "hidden_act": "quick_gelu",
1072
+ "hidden_size": 1536,
1073
+ "in_channels": 3,
1074
+ "in_chans": 3,
1075
+ "mlp_ratio": 4,
1076
+ "model_type": "qwen2_vl",
1077
+ "num_heads": 16,
1078
+ "patch_size": 14,
1079
+ "spatial_merge_size": 2,
1080
+ "spatial_patch_size": 14,
1081
+ "temporal_patch_size": 2
1082
+ },
1083
+ "vision_end_token_id": 151653,
1084
+ "vision_start_token_id": 151652,
1085
+ "vision_token_id": 151654,
1086
+ "vocab_size": 151936
1087
+ }
1088
+
1089
+
1090
+ [INFO|2025-04-01 11:10:45] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-700/tokenizer_config.json
1091
+
1092
+ [INFO|2025-04-01 11:10:45] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-700/special_tokens_map.json
1093
+
1094
+ [INFO|2025-04-01 11:11:38] logging.py:143 >> {'loss': 0.9123, 'learning_rate': 9.8123e-06, 'epoch': 2.12, 'throughput': 110.32}
1095
+
1096
+ [INFO|2025-04-01 11:12:28] logging.py:143 >> {'loss': 0.9635, 'learning_rate': 9.5010e-06, 'epoch': 2.13, 'throughput': 110.28}
1097
+
1098
+ [INFO|2025-04-01 11:13:21] logging.py:143 >> {'loss': 0.9221, 'learning_rate': 9.1936e-06, 'epoch': 2.15, 'throughput': 110.30}
1099
+
1100
+ [INFO|2025-04-01 11:14:16] logging.py:143 >> {'loss': 0.8757, 'learning_rate': 8.8901e-06, 'epoch': 2.16, 'throughput': 110.34}
1101
+
1102
+ [INFO|2025-04-01 11:15:08] logging.py:143 >> {'loss': 0.7958, 'learning_rate': 8.5906e-06, 'epoch': 2.18, 'throughput': 110.36}
1103
+
1104
+ [INFO|2025-04-01 11:16:01] logging.py:143 >> {'loss': 0.7993, 'learning_rate': 8.2952e-06, 'epoch': 2.19, 'throughput': 110.36}
1105
+
1106
+ [INFO|2025-04-01 11:16:55] logging.py:143 >> {'loss': 0.8436, 'learning_rate': 8.0039e-06, 'epoch': 2.21, 'throughput': 110.37}
1107
+
1108
+ [INFO|2025-04-01 11:17:48] logging.py:143 >> {'loss': 0.8960, 'learning_rate': 7.7169e-06, 'epoch': 2.22, 'throughput': 110.38}
1109
+
1110
+ [INFO|2025-04-01 11:18:39] logging.py:143 >> {'loss': 0.8948, 'learning_rate': 7.4342e-06, 'epoch': 2.24, 'throughput': 110.35}
1111
+
1112
+ [INFO|2025-04-01 11:19:31] logging.py:143 >> {'loss': 0.8546, 'learning_rate': 7.1558e-06, 'epoch': 2.25, 'throughput': 110.33}
1113
+
1114
+ [INFO|2025-04-01 11:20:23] logging.py:143 >> {'loss': 0.8494, 'learning_rate': 6.8819e-06, 'epoch': 2.27, 'throughput': 110.35}
1115
+
1116
+ [INFO|2025-04-01 11:21:17] logging.py:143 >> {'loss': 0.7723, 'learning_rate': 6.6125e-06, 'epoch': 2.28, 'throughput': 110.40}
1117
+
1118
+ [INFO|2025-04-01 11:22:08] logging.py:143 >> {'loss': 0.9168, 'learning_rate': 6.3477e-06, 'epoch': 2.30, 'throughput': 110.37}
1119
+
1120
+ [INFO|2025-04-01 11:23:01] logging.py:143 >> {'loss': 0.8831, 'learning_rate': 6.0875e-06, 'epoch': 2.31, 'throughput': 110.38}
1121
+
1122
+ [INFO|2025-04-01 11:23:52] logging.py:143 >> {'loss': 0.8540, 'learning_rate': 5.8320e-06, 'epoch': 2.33, 'throughput': 110.32}
1123
+
1124
+ [INFO|2025-04-01 11:24:43] logging.py:143 >> {'loss': 0.8843, 'learning_rate': 5.5813e-06, 'epoch': 2.34, 'throughput': 110.32}
1125
+
1126
+ [INFO|2025-04-01 11:25:36] logging.py:143 >> {'loss': 0.8620, 'learning_rate': 5.3354e-06, 'epoch': 2.36, 'throughput': 110.34}
1127
+
1128
+ [INFO|2025-04-01 11:26:29] logging.py:143 >> {'loss': 0.9850, 'learning_rate': 5.0944e-06, 'epoch': 2.37, 'throughput': 110.34}
1129
+
1130
+ [INFO|2025-04-01 11:27:20] logging.py:143 >> {'loss': 0.7679, 'learning_rate': 4.8583e-06, 'epoch': 2.39, 'throughput': 110.31}
1131
+
1132
+ [INFO|2025-04-01 11:28:11] logging.py:143 >> {'loss': 0.8198, 'learning_rate': 4.6273e-06, 'epoch': 2.40, 'throughput': 110.30}
1133
+
1134
+ [INFO|2025-04-01 11:28:11] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-800
1135
+
1136
+ [INFO|2025-04-01 11:28:12] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
1137
+
1138
+ [INFO|2025-04-01 11:28:12] configuration_utils.py:771 >> Model config Qwen2VLConfig {
1139
+ "architectures": [
1140
+ "Qwen2VLForConditionalGeneration"
1141
+ ],
1142
+ "attention_dropout": 0.0,
1143
+ "bos_token_id": 151643,
1144
+ "eos_token_id": 151645,
1145
+ "hidden_act": "silu",
1146
+ "hidden_size": 1536,
1147
+ "image_token_id": 151655,
1148
+ "initializer_range": 0.02,
1149
+ "intermediate_size": 8960,
1150
+ "max_position_embeddings": 32768,
1151
+ "max_window_layers": 28,
1152
+ "model_type": "qwen2_vl",
1153
+ "num_attention_heads": 12,
1154
+ "num_hidden_layers": 28,
1155
+ "num_key_value_heads": 2,
1156
+ "pad_token_id": 151654,
1157
+ "rms_norm_eps": 1e-06,
1158
+ "rope_scaling": {
1159
+ "mrope_section": [
1160
+ 16,
1161
+ 24,
1162
+ 24
1163
+ ],
1164
+ "rope_type": "default",
1165
+ "type": "default"
1166
+ },
1167
+ "rope_theta": 1000000.0,
1168
+ "sliding_window": 32768,
1169
+ "tie_word_embeddings": true,
1170
+ "torch_dtype": "bfloat16",
1171
+ "transformers_version": "4.50.0",
1172
+ "use_cache": true,
1173
+ "use_sliding_window": false,
1174
+ "video_token_id": 151656,
1175
+ "vision_config": {
1176
+ "depth": 32,
1177
+ "embed_dim": 1280,
1178
+ "hidden_act": "quick_gelu",
1179
+ "hidden_size": 1536,
1180
+ "in_channels": 3,
1181
+ "in_chans": 3,
1182
+ "mlp_ratio": 4,
1183
+ "model_type": "qwen2_vl",
1184
+ "num_heads": 16,
1185
+ "patch_size": 14,
1186
+ "spatial_merge_size": 2,
1187
+ "spatial_patch_size": 14,
1188
+ "temporal_patch_size": 2
1189
+ },
1190
+ "vision_end_token_id": 151653,
1191
+ "vision_start_token_id": 151652,
1192
+ "vision_token_id": 151654,
1193
+ "vocab_size": 151936
1194
+ }
1195
+
1196
+
1197
+ [INFO|2025-04-01 11:28:12] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-800/tokenizer_config.json
1198
+
1199
+ [INFO|2025-04-01 11:28:12] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-800/special_tokens_map.json
1200
+
1201
+ [INFO|2025-04-01 11:29:06] logging.py:143 >> {'loss': 0.7773, 'learning_rate': 4.4013e-06, 'epoch': 2.42, 'throughput': 110.28}
1202
+
1203
+ [INFO|2025-04-01 11:29:58] logging.py:143 >> {'loss': 0.9312, 'learning_rate': 4.1805e-06, 'epoch': 2.43, 'throughput': 110.27}
1204
+
1205
+ [INFO|2025-04-01 11:30:50] logging.py:143 >> {'loss': 0.8497, 'learning_rate': 3.9648e-06, 'epoch': 2.45, 'throughput': 110.28}
1206
+
1207
+ [INFO|2025-04-01 11:31:43] logging.py:143 >> {'loss': 0.7820, 'learning_rate': 3.7543e-06, 'epoch': 2.46, 'throughput': 110.29}
1208
+
1209
+ [INFO|2025-04-01 11:32:36] logging.py:143 >> {'loss': 0.8937, 'learning_rate': 3.5492e-06, 'epoch': 2.48, 'throughput': 110.31}
1210
+
1211
+ [INFO|2025-04-01 11:33:29] logging.py:143 >> {'loss': 0.7039, 'learning_rate': 3.3494e-06, 'epoch': 2.49, 'throughput': 110.36}
1212
+
1213
+ [INFO|2025-04-01 11:34:22] logging.py:143 >> {'loss': 0.9265, 'learning_rate': 3.1549e-06, 'epoch': 2.51, 'throughput': 110.36}
1214
+
1215
+ [INFO|2025-04-01 11:35:13] logging.py:143 >> {'loss': 0.8669, 'learning_rate': 2.9659e-06, 'epoch': 2.52, 'throughput': 110.36}
1216
+
1217
+ [INFO|2025-04-01 11:36:05] logging.py:143 >> {'loss': 0.9174, 'learning_rate': 2.7824e-06, 'epoch': 2.54, 'throughput': 110.37}
1218
+
1219
+ [INFO|2025-04-01 11:36:58] logging.py:143 >> {'loss': 0.8718, 'learning_rate': 2.6044e-06, 'epoch': 2.55, 'throughput': 110.38}
1220
+
1221
+ [INFO|2025-04-01 11:37:50] logging.py:143 >> {'loss': 0.8634, 'learning_rate': 2.4320e-06, 'epoch': 2.57, 'throughput': 110.37}
1222
+
1223
+ [INFO|2025-04-01 11:38:43] logging.py:143 >> {'loss': 0.8450, 'learning_rate': 2.2652e-06, 'epoch': 2.58, 'throughput': 110.38}
1224
+
1225
+ [INFO|2025-04-01 11:39:34] logging.py:143 >> {'loss': 0.8008, 'learning_rate': 2.1040e-06, 'epoch': 2.60, 'throughput': 110.36}
1226
+
1227
+ [INFO|2025-04-01 11:40:26] logging.py:143 >> {'loss': 0.8797, 'learning_rate': 1.9485e-06, 'epoch': 2.61, 'throughput': 110.36}
1228
+
1229
+ [INFO|2025-04-01 11:41:19] logging.py:143 >> {'loss': 0.9460, 'learning_rate': 1.7988e-06, 'epoch': 2.63, 'throughput': 110.37}
1230
+
1231
+ [INFO|2025-04-01 11:42:10] logging.py:143 >> {'loss': 0.8032, 'learning_rate': 1.6548e-06, 'epoch': 2.64, 'throughput': 110.36}
1232
+
1233
+ [INFO|2025-04-01 11:43:02] logging.py:143 >> {'loss': 0.8892, 'learning_rate': 1.5167e-06, 'epoch': 2.66, 'throughput': 110.37}
1234
+
1235
+ [INFO|2025-04-01 11:43:56] logging.py:143 >> {'loss': 0.8560, 'learning_rate': 1.3844e-06, 'epoch': 2.67, 'throughput': 110.39}
1236
+
1237
+ [INFO|2025-04-01 11:44:49] logging.py:143 >> {'loss': 0.8617, 'learning_rate': 1.2579e-06, 'epoch': 2.69, 'throughput': 110.43}
1238
+
1239
+ [INFO|2025-04-01 11:45:43] logging.py:143 >> {'loss': 0.9117, 'learning_rate': 1.1374e-06, 'epoch': 2.70, 'throughput': 110.46}
1240
+
1241
+ [INFO|2025-04-01 11:45:43] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-900
1242
+
1243
+ [INFO|2025-04-01 11:45:44] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
1244
+
1245
+ [INFO|2025-04-01 11:45:44] configuration_utils.py:771 >> Model config Qwen2VLConfig {
1246
+ "architectures": [
1247
+ "Qwen2VLForConditionalGeneration"
1248
+ ],
1249
+ "attention_dropout": 0.0,
1250
+ "bos_token_id": 151643,
1251
+ "eos_token_id": 151645,
1252
+ "hidden_act": "silu",
1253
+ "hidden_size": 1536,
1254
+ "image_token_id": 151655,
1255
+ "initializer_range": 0.02,
1256
+ "intermediate_size": 8960,
1257
+ "max_position_embeddings": 32768,
1258
+ "max_window_layers": 28,
1259
+ "model_type": "qwen2_vl",
1260
+ "num_attention_heads": 12,
1261
+ "num_hidden_layers": 28,
1262
+ "num_key_value_heads": 2,
1263
+ "pad_token_id": 151654,
1264
+ "rms_norm_eps": 1e-06,
1265
+ "rope_scaling": {
1266
+ "mrope_section": [
1267
+ 16,
1268
+ 24,
1269
+ 24
1270
+ ],
1271
+ "rope_type": "default",
1272
+ "type": "default"
1273
+ },
1274
+ "rope_theta": 1000000.0,
1275
+ "sliding_window": 32768,
1276
+ "tie_word_embeddings": true,
1277
+ "torch_dtype": "bfloat16",
1278
+ "transformers_version": "4.50.0",
1279
+ "use_cache": true,
1280
+ "use_sliding_window": false,
1281
+ "video_token_id": 151656,
1282
+ "vision_config": {
1283
+ "depth": 32,
1284
+ "embed_dim": 1280,
1285
+ "hidden_act": "quick_gelu",
1286
+ "hidden_size": 1536,
1287
+ "in_channels": 3,
1288
+ "in_chans": 3,
1289
+ "mlp_ratio": 4,
1290
+ "model_type": "qwen2_vl",
1291
+ "num_heads": 16,
1292
+ "patch_size": 14,
1293
+ "spatial_merge_size": 2,
1294
+ "spatial_patch_size": 14,
1295
+ "temporal_patch_size": 2
1296
+ },
1297
+ "vision_end_token_id": 151653,
1298
+ "vision_start_token_id": 151652,
1299
+ "vision_token_id": 151654,
1300
+ "vocab_size": 151936
1301
+ }
1302
+
1303
+
1304
+ [INFO|2025-04-01 11:45:44] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-900/tokenizer_config.json
1305
+
1306
+ [INFO|2025-04-01 11:45:44] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-900/special_tokens_map.json
1307
+
1308
+ [INFO|2025-04-01 11:46:37] logging.py:143 >> {'loss': 0.7855, 'learning_rate': 1.0228e-06, 'epoch': 2.72, 'throughput': 110.44}
1309
+
1310
+ [INFO|2025-04-01 11:47:29] logging.py:143 >> {'loss': 0.8212, 'learning_rate': 9.1416e-07, 'epoch': 2.73, 'throughput': 110.42}
1311
+
1312
+ [INFO|2025-04-01 11:48:22] logging.py:143 >> {'loss': 0.8404, 'learning_rate': 8.1152e-07, 'epoch': 2.75, 'throughput': 110.45}
1313
+
1314
+ [INFO|2025-04-01 11:49:15] logging.py:143 >> {'loss': 0.7782, 'learning_rate': 7.1489e-07, 'epoch': 2.76, 'throughput': 110.45}
1315
+
1316
+ [INFO|2025-04-01 11:50:08] logging.py:143 >> {'loss': 0.7847, 'learning_rate': 6.2430e-07, 'epoch': 2.78, 'throughput': 110.45}
1317
+
1318
+ [INFO|2025-04-01 11:51:01] logging.py:143 >> {'loss': 0.8857, 'learning_rate': 5.3977e-07, 'epoch': 2.79, 'throughput': 110.46}
1319
+
1320
+ [INFO|2025-04-01 11:51:54] logging.py:143 >> {'loss': 0.8029, 'learning_rate': 4.6133e-07, 'epoch': 2.81, 'throughput': 110.48}
1321
+
1322
+ [INFO|2025-04-01 11:52:46] logging.py:143 >> {'loss': 0.8154, 'learning_rate': 3.8899e-07, 'epoch': 2.82, 'throughput': 110.45}
1323
+
1324
+ [INFO|2025-04-01 11:53:39] logging.py:143 >> {'loss': 0.8791, 'learning_rate': 3.2277e-07, 'epoch': 2.84, 'throughput': 110.45}
1325
+
1326
+ [INFO|2025-04-01 11:54:32] logging.py:143 >> {'loss': 0.7870, 'learning_rate': 2.6269e-07, 'epoch': 2.85, 'throughput': 110.46}
1327
+
1328
+ [INFO|2025-04-01 11:55:26] logging.py:143 >> {'loss': 0.8831, 'learning_rate': 2.0876e-07, 'epoch': 2.87, 'throughput': 110.49}
1329
+
1330
+ [INFO|2025-04-01 11:56:19] logging.py:143 >> {'loss': 0.7677, 'learning_rate': 1.6100e-07, 'epoch': 2.88, 'throughput': 110.49}
1331
+
1332
+ [INFO|2025-04-01 11:57:10] logging.py:143 >> {'loss': 0.7567, 'learning_rate': 1.1942e-07, 'epoch': 2.90, 'throughput': 110.47}
1333
+
1334
+ [INFO|2025-04-01 11:58:02] logging.py:143 >> {'loss': 0.8944, 'learning_rate': 8.4022e-08, 'epoch': 2.91, 'throughput': 110.46}
1335
+
1336
+ [INFO|2025-04-01 11:58:53] logging.py:143 >> {'loss': 0.9737, 'learning_rate': 5.4824e-08, 'epoch': 2.93, 'throughput': 110.44}
1337
+
1338
+ [INFO|2025-04-01 11:59:45] logging.py:143 >> {'loss': 0.8965, 'learning_rate': 3.1830e-08, 'epoch': 2.95, 'throughput': 110.44}
1339
+
1340
+ [INFO|2025-04-01 12:00:37] logging.py:143 >> {'loss': 0.8370, 'learning_rate': 1.5046e-08, 'epoch': 2.96, 'throughput': 110.41}
1341
+
1342
+ [INFO|2025-04-01 12:01:29] logging.py:143 >> {'loss': 0.7812, 'learning_rate': 4.4769e-09, 'epoch': 2.98, 'throughput': 110.40}
1343
+
1344
+ [INFO|2025-04-01 12:02:24] logging.py:143 >> {'loss': 0.8613, 'learning_rate': 1.2436e-10, 'epoch': 2.99, 'throughput': 110.45}
1345
+
1346
+ [INFO|2025-04-01 12:02:35] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-996
1347
+
1348
+ [INFO|2025-04-01 12:02:35] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
1349
+
1350
+ [INFO|2025-04-01 12:02:35] configuration_utils.py:771 >> Model config Qwen2VLConfig {
1351
+ "architectures": [
1352
+ "Qwen2VLForConditionalGeneration"
1353
+ ],
1354
+ "attention_dropout": 0.0,
1355
+ "bos_token_id": 151643,
1356
+ "eos_token_id": 151645,
1357
+ "hidden_act": "silu",
1358
+ "hidden_size": 1536,
1359
+ "image_token_id": 151655,
1360
+ "initializer_range": 0.02,
1361
+ "intermediate_size": 8960,
1362
+ "max_position_embeddings": 32768,
1363
+ "max_window_layers": 28,
1364
+ "model_type": "qwen2_vl",
1365
+ "num_attention_heads": 12,
1366
+ "num_hidden_layers": 28,
1367
+ "num_key_value_heads": 2,
1368
+ "pad_token_id": 151654,
1369
+ "rms_norm_eps": 1e-06,
1370
+ "rope_scaling": {
1371
+ "mrope_section": [
1372
+ 16,
1373
+ 24,
1374
+ 24
1375
+ ],
1376
+ "rope_type": "default",
1377
+ "type": "default"
1378
+ },
1379
+ "rope_theta": 1000000.0,
1380
+ "sliding_window": 32768,
1381
+ "tie_word_embeddings": true,
1382
+ "torch_dtype": "bfloat16",
1383
+ "transformers_version": "4.50.0",
1384
+ "use_cache": true,
1385
+ "use_sliding_window": false,
1386
+ "video_token_id": 151656,
1387
+ "vision_config": {
1388
+ "depth": 32,
1389
+ "embed_dim": 1280,
1390
+ "hidden_act": "quick_gelu",
1391
+ "hidden_size": 1536,
1392
+ "in_channels": 3,
1393
+ "in_chans": 3,
1394
+ "mlp_ratio": 4,
1395
+ "model_type": "qwen2_vl",
1396
+ "num_heads": 16,
1397
+ "patch_size": 14,
1398
+ "spatial_merge_size": 2,
1399
+ "spatial_patch_size": 14,
1400
+ "temporal_patch_size": 2
1401
+ },
1402
+ "vision_end_token_id": 151653,
1403
+ "vision_start_token_id": 151652,
1404
+ "vision_token_id": 151654,
1405
+ "vocab_size": 151936
1406
+ }
1407
+
1408
+
1409
+ [INFO|2025-04-01 12:02:35] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-996/tokenizer_config.json
1410
+
1411
+ [INFO|2025-04-01 12:02:35] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-996/special_tokens_map.json
1412
+
1413
+ [INFO|2025-04-01 12:02:36] trainer.py:2665 >>
1414
+
1415
+ Training completed. Do not forget to share your model on huggingface.co/models =)
1416
+
1417
+
1418
+
1419
+ [INFO|2025-04-01 12:02:36] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36
1420
+
1421
+ [INFO|2025-04-01 12:02:37] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
1422
+
1423
+ [INFO|2025-04-01 12:02:37] configuration_utils.py:771 >> Model config Qwen2VLConfig {
1424
+ "architectures": [
1425
+ "Qwen2VLForConditionalGeneration"
1426
+ ],
1427
+ "attention_dropout": 0.0,
1428
+ "bos_token_id": 151643,
1429
+ "eos_token_id": 151645,
1430
+ "hidden_act": "silu",
1431
+ "hidden_size": 1536,
1432
+ "image_token_id": 151655,
1433
+ "initializer_range": 0.02,
1434
+ "intermediate_size": 8960,
1435
+ "max_position_embeddings": 32768,
1436
+ "max_window_layers": 28,
1437
+ "model_type": "qwen2_vl",
1438
+ "num_attention_heads": 12,
1439
+ "num_hidden_layers": 28,
1440
+ "num_key_value_heads": 2,
1441
+ "pad_token_id": 151654,
1442
+ "rms_norm_eps": 1e-06,
1443
+ "rope_scaling": {
1444
+ "mrope_section": [
1445
+ 16,
1446
+ 24,
1447
+ 24
1448
+ ],
1449
+ "rope_type": "default",
1450
+ "type": "default"
1451
+ },
1452
+ "rope_theta": 1000000.0,
1453
+ "sliding_window": 32768,
1454
+ "tie_word_embeddings": true,
1455
+ "torch_dtype": "bfloat16",
1456
+ "transformers_version": "4.50.0",
1457
+ "use_cache": true,
1458
+ "use_sliding_window": false,
1459
+ "video_token_id": 151656,
1460
+ "vision_config": {
1461
+ "depth": 32,
1462
+ "embed_dim": 1280,
1463
+ "hidden_act": "quick_gelu",
1464
+ "hidden_size": 1536,
1465
+ "in_channels": 3,
1466
+ "in_chans": 3,
1467
+ "mlp_ratio": 4,
1468
+ "model_type": "qwen2_vl",
1469
+ "num_heads": 16,
1470
+ "patch_size": 14,
1471
+ "spatial_merge_size": 2,
1472
+ "spatial_patch_size": 14,
1473
+ "temporal_patch_size": 2
1474
+ },
1475
+ "vision_end_token_id": 151653,
1476
+ "vision_start_token_id": 151652,
1477
+ "vision_token_id": 151654,
1478
+ "vocab_size": 151936
1479
+ }
1480
+
1481
+
1482
+ [INFO|2025-04-01 12:02:37] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/tokenizer_config.json
1483
+
1484
+ [INFO|2025-04-01 12:02:37] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/special_tokens_map.json
1485
+
1486
+ [WARNING|2025-04-01 12:02:37] logging.py:148 >> No metric eval_loss to plot.
1487
+
1488
+ [WARNING|2025-04-01 12:02:37] logging.py:148 >> No metric eval_accuracy to plot.
1489
+
1490
+ [INFO|2025-04-01 12:02:37] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
1491
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
1492
+
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "eos_token": {
3
+ "content": "<|endoftext|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "pad_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d044cddc0af2b81635b0de71dba0a4a4d494dc953a5febbf525672df5af2e23
3
+ size 11420365
tokenizer_config.json ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "151643": {
4
+ "content": "<|endoftext|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "151644": {
12
+ "content": "<|im_start|>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "151645": {
20
+ "content": "<|im_end|>",
21
+ "lstrip": false,
22
+ "normalized": true,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "151646": {
28
+ "content": "<|object_ref_start|>",
29
+ "lstrip": false,
30
+ "normalized": true,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "151647": {
36
+ "content": "<|object_ref_end|>",
37
+ "lstrip": false,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": false
42
+ },
43
+ "151648": {
44
+ "content": "<|box_start|>",
45
+ "lstrip": false,
46
+ "normalized": true,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "151649": {
52
+ "content": "<|box_end|>",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "151650": {
60
+ "content": "<|quad_start|>",
61
+ "lstrip": false,
62
+ "normalized": true,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "151651": {
68
+ "content": "<|quad_end|>",
69
+ "lstrip": false,
70
+ "normalized": true,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "151652": {
76
+ "content": "<|vision_start|>",
77
+ "lstrip": false,
78
+ "normalized": true,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "151653": {
84
+ "content": "<|vision_end|>",
85
+ "lstrip": false,
86
+ "normalized": true,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "151654": {
92
+ "content": "<|vision_pad|>",
93
+ "lstrip": false,
94
+ "normalized": true,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "151655": {
100
+ "content": "<|image_pad|>",
101
+ "lstrip": false,
102
+ "normalized": true,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "151656": {
108
+ "content": "<|video_pad|>",
109
+ "lstrip": false,
110
+ "normalized": true,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": false
114
+ }
115
+ },
116
+ "bos_token": null,
117
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ 'System: ' + system_message + '<|endoftext|>' + '\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '<|endoftext|>' + '\nAssistant:' }}{% elif message['role'] == 'assistant' %}{{ content + '<|endoftext|>' + '\n' }}{% endif %}{% endfor %}",
118
+ "clean_up_tokenization_spaces": false,
119
+ "eos_token": "<|endoftext|>",
120
+ "errors": "replace",
121
+ "extra_special_tokens": {},
122
+ "model_max_length": 1000000000000000019884624838656,
123
+ "pad_token": "<|endoftext|>",
124
+ "padding_side": "right",
125
+ "split_special_tokens": false,
126
+ "tokenizer_class": "Qwen2Tokenizer",
127
+ "unk_token": "<|endoftext|>"
128
+ }
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 2.9932279909706545,
3
+ "num_input_tokens_seen": 1157808,
4
+ "total_flos": 1.3788411572404224e+16,
5
+ "train_loss": 0.939127180590687,
6
+ "train_runtime": 10484.6402,
7
+ "train_samples_per_second": 0.761,
8
+ "train_steps_per_second": 0.095
9
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 5, "total_steps": 996, "loss": 2.4707, "lr": 4.9996890990217804e-05, "epoch": 0.015048908954100828, "percentage": 0.5, "elapsed_time": "0:00:54", "remaining_time": "2:59:14", "throughput": 108.07, "total_tokens": 5864}
2
+ {"current_steps": 10, "total_steps": 996, "loss": 2.2509, "lr": 4.9987564734146566e-05, "epoch": 0.030097817908201655, "percentage": 1.0, "elapsed_time": "0:01:47", "remaining_time": "2:57:04", "throughput": 106.09, "total_tokens": 11432}
3
+ {"current_steps": 15, "total_steps": 996, "loss": 1.6895, "lr": 4.997202355141999e-05, "epoch": 0.045146726862302484, "percentage": 1.51, "elapsed_time": "0:02:40", "remaining_time": "2:54:28", "throughput": 106.2, "total_tokens": 17000}
4
+ {"current_steps": 20, "total_steps": 996, "loss": 1.4876, "lr": 4.995027130745321e-05, "epoch": 0.06019563581640331, "percentage": 2.01, "elapsed_time": "0:03:33", "remaining_time": "2:53:25", "throughput": 107.12, "total_tokens": 22840}
5
+ {"current_steps": 25, "total_steps": 996, "loss": 1.4812, "lr": 4.992231341248137e-05, "epoch": 0.07524454477050414, "percentage": 2.51, "elapsed_time": "0:04:27", "remaining_time": "2:53:07", "throughput": 108.37, "total_tokens": 28984}
6
+ {"current_steps": 30, "total_steps": 996, "loss": 1.3642, "lr": 4.9888156820213974e-05, "epoch": 0.09029345372460497, "percentage": 3.01, "elapsed_time": "0:05:20", "remaining_time": "2:51:47", "throughput": 108.89, "total_tokens": 34856}
7
+ {"current_steps": 35, "total_steps": 996, "loss": 1.3651, "lr": 4.9847810026105394e-05, "epoch": 0.1053423626787058, "percentage": 3.51, "elapsed_time": "0:06:15", "remaining_time": "2:51:37", "throughput": 109.89, "total_tokens": 41216}
8
+ {"current_steps": 40, "total_steps": 996, "loss": 1.1321, "lr": 4.980128306524183e-05, "epoch": 0.12039127163280662, "percentage": 4.02, "elapsed_time": "0:07:09", "remaining_time": "2:50:57", "throughput": 110.22, "total_tokens": 47304}
9
+ {"current_steps": 45, "total_steps": 996, "loss": 1.3012, "lr": 4.97485875098454e-05, "epoch": 0.13544018058690746, "percentage": 4.52, "elapsed_time": "0:08:02", "remaining_time": "2:49:52", "throughput": 110.28, "total_tokens": 53184}
10
+ {"current_steps": 50, "total_steps": 996, "loss": 0.9827, "lr": 4.968973646639589e-05, "epoch": 0.1504890895410083, "percentage": 5.02, "elapsed_time": "0:08:55", "remaining_time": "2:48:50", "throughput": 110.24, "total_tokens": 59024}
11
+ {"current_steps": 55, "total_steps": 996, "loss": 1.2313, "lr": 4.9624744572370865e-05, "epoch": 0.1655379984951091, "percentage": 5.52, "elapsed_time": "0:09:49", "remaining_time": "2:47:59", "throughput": 110.02, "total_tokens": 64816}
12
+ {"current_steps": 60, "total_steps": 996, "loss": 1.0347, "lr": 4.9553627992605066e-05, "epoch": 0.18058690744920994, "percentage": 6.02, "elapsed_time": "0:10:43", "remaining_time": "2:47:18", "throughput": 110.1, "total_tokens": 70848}
13
+ {"current_steps": 65, "total_steps": 996, "loss": 1.0422, "lr": 4.947640441526989e-05, "epoch": 0.19563581640331076, "percentage": 6.53, "elapsed_time": "0:11:37", "remaining_time": "2:46:23", "throughput": 110.31, "total_tokens": 76888}
14
+ {"current_steps": 70, "total_steps": 996, "loss": 0.9996, "lr": 4.939309304747391e-05, "epoch": 0.2106847253574116, "percentage": 7.03, "elapsed_time": "0:12:30", "remaining_time": "2:45:29", "throughput": 110.36, "total_tokens": 82840}
15
+ {"current_steps": 75, "total_steps": 996, "loss": 1.0755, "lr": 4.930371461048571e-05, "epoch": 0.22573363431151242, "percentage": 7.53, "elapsed_time": "0:13:24", "remaining_time": "2:44:38", "throughput": 110.41, "total_tokens": 88824}
16
+ {"current_steps": 80, "total_steps": 996, "loss": 1.026, "lr": 4.9208291334580104e-05, "epoch": 0.24078254326561324, "percentage": 8.03, "elapsed_time": "0:14:15", "remaining_time": "2:43:18", "throughput": 110.15, "total_tokens": 94264}
17
+ {"current_steps": 85, "total_steps": 996, "loss": 1.1307, "lr": 4.910684695350895e-05, "epoch": 0.2558314522197141, "percentage": 8.53, "elapsed_time": "0:15:07", "remaining_time": "2:42:09", "throughput": 110.04, "total_tokens": 99896}
18
+ {"current_steps": 90, "total_steps": 996, "loss": 1.0221, "lr": 4.8999406698598074e-05, "epoch": 0.2708803611738149, "percentage": 9.04, "elapsed_time": "0:16:00", "remaining_time": "2:41:13", "throughput": 109.93, "total_tokens": 105640}
19
+ {"current_steps": 95, "total_steps": 996, "loss": 1.012, "lr": 4.8885997292471774e-05, "epoch": 0.28592927012791575, "percentage": 9.54, "elapsed_time": "0:16:52", "remaining_time": "2:39:58", "throughput": 109.96, "total_tokens": 111280}
20
+ {"current_steps": 100, "total_steps": 996, "loss": 1.0151, "lr": 4.87666469424063e-05, "epoch": 0.3009781790820166, "percentage": 10.04, "elapsed_time": "0:17:42", "remaining_time": "2:38:41", "throughput": 109.77, "total_tokens": 116640}
21
+ {"current_steps": 105, "total_steps": 996, "loss": 1.0028, "lr": 4.86413853333141e-05, "epoch": 0.3160270880361174, "percentage": 10.54, "elapsed_time": "0:18:34", "remaining_time": "2:37:40", "throughput": 109.3, "total_tokens": 121864}
22
+ {"current_steps": 110, "total_steps": 996, "loss": 1.143, "lr": 4.851024362036064e-05, "epoch": 0.3310759969902182, "percentage": 11.04, "elapsed_time": "0:19:26", "remaining_time": "2:36:35", "throughput": 109.2, "total_tokens": 127384}
23
+ {"current_steps": 115, "total_steps": 996, "loss": 0.9695, "lr": 4.837325442121538e-05, "epoch": 0.34612490594431905, "percentage": 11.55, "elapsed_time": "0:20:18", "remaining_time": "2:35:32", "throughput": 109.18, "total_tokens": 133008}
24
+ {"current_steps": 120, "total_steps": 996, "loss": 0.9017, "lr": 4.8230451807939135e-05, "epoch": 0.3611738148984199, "percentage": 12.05, "elapsed_time": "0:21:12", "remaining_time": "2:34:49", "throughput": 109.34, "total_tokens": 139144}
25
+ {"current_steps": 125, "total_steps": 996, "loss": 1.035, "lr": 4.808187129850963e-05, "epoch": 0.3762227238525207, "percentage": 12.55, "elapsed_time": "0:22:05", "remaining_time": "2:33:55", "throughput": 109.29, "total_tokens": 144848}
26
+ {"current_steps": 130, "total_steps": 996, "loss": 1.0128, "lr": 4.792754984798745e-05, "epoch": 0.3912716328066215, "percentage": 13.05, "elapsed_time": "0:22:57", "remaining_time": "2:32:55", "throughput": 109.25, "total_tokens": 150480}
27
+ {"current_steps": 135, "total_steps": 996, "loss": 0.9432, "lr": 4.776752583932454e-05, "epoch": 0.40632054176072235, "percentage": 13.55, "elapsed_time": "0:23:50", "remaining_time": "2:32:00", "throughput": 109.32, "total_tokens": 156336}
28
+ {"current_steps": 140, "total_steps": 996, "loss": 1.0344, "lr": 4.760183907381757e-05, "epoch": 0.4213694507148232, "percentage": 14.06, "elapsed_time": "0:24:43", "remaining_time": "2:31:10", "throughput": 109.5, "total_tokens": 162440}
29
+ {"current_steps": 145, "total_steps": 996, "loss": 0.9452, "lr": 4.7430530761208494e-05, "epoch": 0.436418359668924, "percentage": 14.56, "elapsed_time": "0:25:36", "remaining_time": "2:30:16", "throughput": 109.55, "total_tokens": 168304}
30
+ {"current_steps": 150, "total_steps": 996, "loss": 0.9559, "lr": 4.725364350943492e-05, "epoch": 0.45146726862302483, "percentage": 15.06, "elapsed_time": "0:26:28", "remaining_time": "2:29:18", "throughput": 109.53, "total_tokens": 173984}
31
+ {"current_steps": 155, "total_steps": 996, "loss": 0.9726, "lr": 4.707122131403251e-05, "epoch": 0.46651617757712566, "percentage": 15.56, "elapsed_time": "0:27:21", "remaining_time": "2:28:25", "throughput": 109.6, "total_tokens": 179896}
32
+ {"current_steps": 160, "total_steps": 996, "loss": 0.9344, "lr": 4.6883309547192476e-05, "epoch": 0.4815650865312265, "percentage": 16.06, "elapsed_time": "0:28:12", "remaining_time": "2:27:23", "throughput": 109.48, "total_tokens": 185296}
33
+ {"current_steps": 165, "total_steps": 996, "loss": 0.9497, "lr": 4.668995494647653e-05, "epoch": 0.4966139954853273, "percentage": 16.57, "elapsed_time": "0:29:05", "remaining_time": "2:26:29", "throughput": 109.4, "total_tokens": 190928}
34
+ {"current_steps": 170, "total_steps": 996, "loss": 1.057, "lr": 4.649120560319225e-05, "epoch": 0.5116629044394282, "percentage": 17.07, "elapsed_time": "0:30:00", "remaining_time": "2:25:49", "throughput": 109.59, "total_tokens": 197352}
35
+ {"current_steps": 175, "total_steps": 996, "loss": 0.9847, "lr": 4.6287110950431865e-05, "epoch": 0.526711813393529, "percentage": 17.57, "elapsed_time": "0:30:53", "remaining_time": "2:24:54", "throughput": 109.65, "total_tokens": 203216}
36
+ {"current_steps": 180, "total_steps": 996, "loss": 1.001, "lr": 4.607772175077711e-05, "epoch": 0.5417607223476298, "percentage": 18.07, "elapsed_time": "0:31:44", "remaining_time": "2:23:55", "throughput": 109.52, "total_tokens": 208624}
37
+ {"current_steps": 185, "total_steps": 996, "loss": 0.9384, "lr": 4.586309008367359e-05, "epoch": 0.5568096313017307, "percentage": 18.57, "elapsed_time": "0:32:38", "remaining_time": "2:23:04", "throughput": 109.56, "total_tokens": 214552}
38
+ {"current_steps": 190, "total_steps": 996, "loss": 1.0312, "lr": 4.564326933247752e-05, "epoch": 0.5718585402558315, "percentage": 19.08, "elapsed_time": "0:33:32", "remaining_time": "2:22:16", "throughput": 109.68, "total_tokens": 220704}
39
+ {"current_steps": 195, "total_steps": 996, "loss": 0.9112, "lr": 4.541831417117815e-05, "epoch": 0.5869074492099323, "percentage": 19.58, "elapsed_time": "0:34:24", "remaining_time": "2:21:20", "throughput": 109.7, "total_tokens": 226480}
40
+ {"current_steps": 200, "total_steps": 996, "loss": 0.9967, "lr": 4.518828055079925e-05, "epoch": 0.6019563581640331, "percentage": 20.08, "elapsed_time": "0:35:16", "remaining_time": "2:20:25", "throughput": 109.66, "total_tokens": 232136}
41
+ {"current_steps": 205, "total_steps": 996, "loss": 1.0905, "lr": 4.4953225685482904e-05, "epoch": 0.617005267118134, "percentage": 20.58, "elapsed_time": "0:36:11", "remaining_time": "2:19:38", "throughput": 109.64, "total_tokens": 238072}
42
+ {"current_steps": 210, "total_steps": 996, "loss": 0.9487, "lr": 4.471320803825915e-05, "epoch": 0.6320541760722348, "percentage": 21.08, "elapsed_time": "0:37:04", "remaining_time": "2:18:44", "throughput": 109.56, "total_tokens": 243680}
43
+ {"current_steps": 215, "total_steps": 996, "loss": 0.8675, "lr": 4.4468287306505045e-05, "epoch": 0.6471030850263356, "percentage": 21.59, "elapsed_time": "0:37:56", "remaining_time": "2:17:48", "throughput": 109.56, "total_tokens": 249376}
44
+ {"current_steps": 220, "total_steps": 996, "loss": 0.8624, "lr": 4.421852440709666e-05, "epoch": 0.6621519939804364, "percentage": 22.09, "elapsed_time": "0:38:48", "remaining_time": "2:16:54", "throughput": 109.61, "total_tokens": 255288}
45
+ {"current_steps": 225, "total_steps": 996, "loss": 1.0489, "lr": 4.39639814612578e-05, "epoch": 0.6772009029345373, "percentage": 22.59, "elapsed_time": "0:39:43", "remaining_time": "2:16:08", "throughput": 109.74, "total_tokens": 261592}
46
+ {"current_steps": 230, "total_steps": 996, "loss": 0.9139, "lr": 4.370472177910914e-05, "epoch": 0.6922498118886381, "percentage": 23.09, "elapsed_time": "0:40:36", "remaining_time": "2:15:13", "throughput": 109.68, "total_tokens": 267192}
47
+ {"current_steps": 235, "total_steps": 996, "loss": 0.9905, "lr": 4.3440809843921725e-05, "epoch": 0.7072987208427389, "percentage": 23.59, "elapsed_time": "0:41:27", "remaining_time": "2:14:16", "throughput": 109.62, "total_tokens": 272712}
48
+ {"current_steps": 240, "total_steps": 996, "loss": 0.8974, "lr": 4.3172311296078595e-05, "epoch": 0.7223476297968398, "percentage": 24.1, "elapsed_time": "0:42:21", "remaining_time": "2:13:25", "throughput": 109.66, "total_tokens": 278720}
49
+ {"current_steps": 245, "total_steps": 996, "loss": 0.999, "lr": 4.28992929167487e-05, "epoch": 0.7373965387509406, "percentage": 24.6, "elapsed_time": "0:43:14", "remaining_time": "2:12:33", "throughput": 109.68, "total_tokens": 284584}
50
+ {"current_steps": 250, "total_steps": 996, "loss": 0.9916, "lr": 4.2621822611277e-05, "epoch": 0.7524454477050414, "percentage": 25.1, "elapsed_time": "0:44:08", "remaining_time": "2:11:42", "throughput": 109.66, "total_tokens": 290408}
51
+ {"current_steps": 255, "total_steps": 996, "loss": 0.9242, "lr": 4.233996939229502e-05, "epoch": 0.7674943566591422, "percentage": 25.6, "elapsed_time": "0:45:00", "remaining_time": "2:10:46", "throughput": 109.54, "total_tokens": 295776}
52
+ {"current_steps": 260, "total_steps": 996, "loss": 1.0426, "lr": 4.205380336255594e-05, "epoch": 0.782543265613243, "percentage": 26.1, "elapsed_time": "0:45:54", "remaining_time": "2:09:55", "throughput": 109.56, "total_tokens": 301736}
53
+ {"current_steps": 265, "total_steps": 996, "loss": 0.8625, "lr": 4.176339569749865e-05, "epoch": 0.7975921745673439, "percentage": 26.61, "elapsed_time": "0:46:45", "remaining_time": "2:08:59", "throughput": 109.5, "total_tokens": 307224}
54
+ {"current_steps": 270, "total_steps": 996, "loss": 0.9959, "lr": 4.1468818627544845e-05, "epoch": 0.8126410835214447, "percentage": 27.11, "elapsed_time": "0:47:38", "remaining_time": "2:08:06", "throughput": 109.51, "total_tokens": 313040}
55
+ {"current_steps": 275, "total_steps": 996, "loss": 0.939, "lr": 4.11701454201339e-05, "epoch": 0.8276899924755455, "percentage": 27.61, "elapsed_time": "0:48:32", "remaining_time": "2:07:15", "throughput": 109.57, "total_tokens": 319112}
56
+ {"current_steps": 280, "total_steps": 996, "loss": 0.9741, "lr": 4.08674503614997e-05, "epoch": 0.8427389014296464, "percentage": 28.11, "elapsed_time": "0:49:25", "remaining_time": "2:06:22", "throughput": 109.61, "total_tokens": 325040}
57
+ {"current_steps": 285, "total_steps": 996, "loss": 0.98, "lr": 4.0560808738194114e-05, "epoch": 0.8577878103837472, "percentage": 28.61, "elapsed_time": "0:50:18", "remaining_time": "2:05:30", "throughput": 109.62, "total_tokens": 330904}
58
+ {"current_steps": 290, "total_steps": 996, "loss": 0.8898, "lr": 4.0250296818361647e-05, "epoch": 0.872836719337848, "percentage": 29.12, "elapsed_time": "0:51:10", "remaining_time": "2:04:35", "throughput": 109.54, "total_tokens": 336392}
59
+ {"current_steps": 295, "total_steps": 996, "loss": 0.953, "lr": 3.993599183277001e-05, "epoch": 0.8878856282919488, "percentage": 29.62, "elapsed_time": "0:52:06", "remaining_time": "2:03:48", "throughput": 109.67, "total_tokens": 342832}
60
+ {"current_steps": 300, "total_steps": 996, "loss": 0.9311, "lr": 3.961797195560118e-05, "epoch": 0.9029345372460497, "percentage": 30.12, "elapsed_time": "0:52:59", "remaining_time": "2:02:56", "throughput": 109.75, "total_tokens": 348944}
61
+ {"current_steps": 305, "total_steps": 996, "loss": 0.9114, "lr": 3.9296316285007887e-05, "epoch": 0.9179834462001505, "percentage": 30.62, "elapsed_time": "0:53:52", "remaining_time": "2:02:03", "throughput": 109.73, "total_tokens": 354680}
62
+ {"current_steps": 310, "total_steps": 996, "loss": 0.9674, "lr": 3.897110482344024e-05, "epoch": 0.9330323551542513, "percentage": 31.12, "elapsed_time": "0:54:46", "remaining_time": "2:01:12", "throughput": 109.85, "total_tokens": 361008}
63
+ {"current_steps": 315, "total_steps": 996, "loss": 0.9582, "lr": 3.864241845774746e-05, "epoch": 0.9480812641083521, "percentage": 31.63, "elapsed_time": "0:55:38", "remaining_time": "2:00:17", "throughput": 109.86, "total_tokens": 366760}
64
+ {"current_steps": 320, "total_steps": 996, "loss": 0.9863, "lr": 3.8310338939059644e-05, "epoch": 0.963130173062453, "percentage": 32.13, "elapsed_time": "0:56:30", "remaining_time": "1:59:23", "throughput": 109.84, "total_tokens": 372448}
65
+ {"current_steps": 325, "total_steps": 996, "loss": 0.906, "lr": 3.797494886245456e-05, "epoch": 0.9781790820165538, "percentage": 32.63, "elapsed_time": "0:57:24", "remaining_time": "1:58:31", "throughput": 109.89, "total_tokens": 378520}
66
+ {"current_steps": 330, "total_steps": 996, "loss": 0.8958, "lr": 3.7636331646414524e-05, "epoch": 0.9932279909706546, "percentage": 33.13, "elapsed_time": "0:58:17", "remaining_time": "1:57:38", "throughput": 109.87, "total_tokens": 384272}
67
+ {"current_steps": 335, "total_steps": 996, "loss": 0.8349, "lr": 3.7294571512078506e-05, "epoch": 1.0060195635816402, "percentage": 33.63, "elapsed_time": "0:59:02", "remaining_time": "1:56:30", "throughput": 109.89, "total_tokens": 389280}
68
+ {"current_steps": 340, "total_steps": 996, "loss": 0.8507, "lr": 3.694975346229458e-05, "epoch": 1.021068472535741, "percentage": 34.14, "elapsed_time": "0:59:54", "remaining_time": "1:55:36", "throughput": 109.86, "total_tokens": 394944}
69
+ {"current_steps": 345, "total_steps": 996, "loss": 0.9287, "lr": 3.6601963260477924e-05, "epoch": 1.036117381489842, "percentage": 34.64, "elapsed_time": "1:00:47", "remaining_time": "1:54:42", "throughput": 109.88, "total_tokens": 400800}
70
+ {"current_steps": 350, "total_steps": 996, "loss": 0.9107, "lr": 3.625128740927971e-05, "epoch": 1.0511662904439427, "percentage": 35.14, "elapsed_time": "1:01:40", "remaining_time": "1:53:50", "throughput": 109.91, "total_tokens": 406728}
71
+ {"current_steps": 355, "total_steps": 996, "loss": 0.952, "lr": 3.589781312907207e-05, "epoch": 1.0662151993980435, "percentage": 35.64, "elapsed_time": "1:02:33", "remaining_time": "1:52:57", "throughput": 109.93, "total_tokens": 412656}
72
+ {"current_steps": 360, "total_steps": 996, "loss": 0.9526, "lr": 3.55416283362546e-05, "epoch": 1.0812641083521444, "percentage": 36.14, "elapsed_time": "1:03:27", "remaining_time": "1:52:05", "throughput": 109.92, "total_tokens": 418488}
73
+ {"current_steps": 365, "total_steps": 996, "loss": 0.8775, "lr": 3.518282162138772e-05, "epoch": 1.0963130173062452, "percentage": 36.65, "elapsed_time": "1:04:19", "remaining_time": "1:51:12", "throughput": 109.9, "total_tokens": 424192}
74
+ {"current_steps": 370, "total_steps": 996, "loss": 0.883, "lr": 3.482148222715835e-05, "epoch": 1.111361926260346, "percentage": 37.15, "elapsed_time": "1:05:14", "remaining_time": "1:50:22", "throughput": 109.94, "total_tokens": 430312}
75
+ {"current_steps": 375, "total_steps": 996, "loss": 1.0032, "lr": 3.4457700026183374e-05, "epoch": 1.1264108352144468, "percentage": 37.65, "elapsed_time": "1:06:07", "remaining_time": "1:49:29", "throughput": 109.93, "total_tokens": 436128}
76
+ {"current_steps": 380, "total_steps": 996, "loss": 0.943, "lr": 3.409156549865654e-05, "epoch": 1.141459744168548, "percentage": 38.15, "elapsed_time": "1:06:59", "remaining_time": "1:48:36", "throughput": 109.94, "total_tokens": 441928}
77
+ {"current_steps": 385, "total_steps": 996, "loss": 0.801, "lr": 3.3723169709844026e-05, "epoch": 1.1565086531226485, "percentage": 38.65, "elapsed_time": "1:07:51", "remaining_time": "1:47:42", "throughput": 109.91, "total_tokens": 447560}
78
+ {"current_steps": 390, "total_steps": 996, "loss": 0.9294, "lr": 3.335260428743475e-05, "epoch": 1.1715575620767495, "percentage": 39.16, "elapsed_time": "1:08:44", "remaining_time": "1:46:48", "throughput": 109.91, "total_tokens": 453296}
79
+ {"current_steps": 395, "total_steps": 996, "loss": 0.9528, "lr": 3.297996139875055e-05, "epoch": 1.1866064710308502, "percentage": 39.66, "elapsed_time": "1:09:37", "remaining_time": "1:45:56", "throughput": 109.96, "total_tokens": 459336}
80
+ {"current_steps": 400, "total_steps": 996, "loss": 0.8981, "lr": 3.260533372782234e-05, "epoch": 1.2016553799849512, "percentage": 40.16, "elapsed_time": "1:10:29", "remaining_time": "1:45:01", "throughput": 109.93, "total_tokens": 464944}
81
+ {"current_steps": 405, "total_steps": 996, "loss": 0.9823, "lr": 3.222881445233759e-05, "epoch": 1.2167042889390518, "percentage": 40.66, "elapsed_time": "1:11:24", "remaining_time": "1:44:11", "throughput": 109.94, "total_tokens": 470992}
82
+ {"current_steps": 410, "total_steps": 996, "loss": 0.9047, "lr": 3.185049722046516e-05, "epoch": 1.2317531978931529, "percentage": 41.16, "elapsed_time": "1:12:14", "remaining_time": "1:43:15", "throughput": 109.86, "total_tokens": 476216}
83
+ {"current_steps": 415, "total_steps": 996, "loss": 0.8582, "lr": 3.147047612756302e-05, "epoch": 1.2468021068472535, "percentage": 41.67, "elapsed_time": "1:13:06", "remaining_time": "1:42:21", "throughput": 109.84, "total_tokens": 481824}
84
+ {"current_steps": 420, "total_steps": 996, "loss": 0.8787, "lr": 3.10888456927748e-05, "epoch": 1.2618510158013545, "percentage": 42.17, "elapsed_time": "1:13:58", "remaining_time": "1:41:27", "throughput": 109.85, "total_tokens": 487576}
85
+ {"current_steps": 425, "total_steps": 996, "loss": 0.8729, "lr": 3.0705700835520895e-05, "epoch": 1.276899924755455, "percentage": 42.67, "elapsed_time": "1:14:50", "remaining_time": "1:40:33", "throughput": 109.85, "total_tokens": 493336}
86
+ {"current_steps": 430, "total_steps": 996, "loss": 0.8772, "lr": 3.0321136851890036e-05, "epoch": 1.2919488337095562, "percentage": 43.17, "elapsed_time": "1:15:45", "remaining_time": "1:39:43", "throughput": 109.95, "total_tokens": 499760}
87
+ {"current_steps": 435, "total_steps": 996, "loss": 0.9451, "lr": 2.9935249390937183e-05, "epoch": 1.3069977426636568, "percentage": 43.67, "elapsed_time": "1:16:37", "remaining_time": "1:38:48", "throughput": 109.94, "total_tokens": 505400}
88
+ {"current_steps": 440, "total_steps": 996, "loss": 0.8202, "lr": 2.9548134430893604e-05, "epoch": 1.3220466516177578, "percentage": 44.18, "elapsed_time": "1:17:31", "remaining_time": "1:37:57", "throughput": 110.03, "total_tokens": 511760}
89
+ {"current_steps": 445, "total_steps": 996, "loss": 0.9773, "lr": 2.9159888255295116e-05, "epoch": 1.3370955605718584, "percentage": 44.68, "elapsed_time": "1:18:23", "remaining_time": "1:37:03", "throughput": 110.05, "total_tokens": 517616}
90
+ {"current_steps": 450, "total_steps": 996, "loss": 0.9101, "lr": 2.8770607429034352e-05, "epoch": 1.3521444695259595, "percentage": 45.18, "elapsed_time": "1:19:13", "remaining_time": "1:36:07", "throughput": 109.98, "total_tokens": 522744}
91
+ {"current_steps": 455, "total_steps": 996, "loss": 0.9633, "lr": 2.8380388774343047e-05, "epoch": 1.36719337848006, "percentage": 45.68, "elapsed_time": "1:20:05", "remaining_time": "1:35:13", "throughput": 110.01, "total_tokens": 528648}
92
+ {"current_steps": 460, "total_steps": 996, "loss": 0.8886, "lr": 2.7989329346710375e-05, "epoch": 1.382242287434161, "percentage": 46.18, "elapsed_time": "1:20:56", "remaining_time": "1:34:18", "throughput": 109.96, "total_tokens": 534000}
93
+ {"current_steps": 465, "total_steps": 996, "loss": 0.9258, "lr": 2.759752641074322e-05, "epoch": 1.3972911963882617, "percentage": 46.69, "elapsed_time": "1:21:48", "remaining_time": "1:33:24", "throughput": 109.96, "total_tokens": 539688}
94
+ {"current_steps": 470, "total_steps": 996, "loss": 0.9039, "lr": 2.7205077415974416e-05, "epoch": 1.4123401053423628, "percentage": 47.19, "elapsed_time": "1:22:38", "remaining_time": "1:32:29", "throughput": 109.93, "total_tokens": 545112}
95
+ {"current_steps": 475, "total_steps": 996, "loss": 1.0116, "lr": 2.6812079972625077e-05, "epoch": 1.4273890142964636, "percentage": 47.69, "elapsed_time": "1:23:32", "remaining_time": "1:31:37", "throughput": 109.99, "total_tokens": 551328}
96
+ {"current_steps": 480, "total_steps": 996, "loss": 0.8218, "lr": 2.6418631827326857e-05, "epoch": 1.4424379232505644, "percentage": 48.19, "elapsed_time": "1:24:23", "remaining_time": "1:30:42", "throughput": 109.97, "total_tokens": 556816}
97
+ {"current_steps": 485, "total_steps": 996, "loss": 0.8604, "lr": 2.602483083881035e-05, "epoch": 1.4574868322046652, "percentage": 48.69, "elapsed_time": "1:25:15", "remaining_time": "1:29:50", "throughput": 109.96, "total_tokens": 562552}
98
+ {"current_steps": 490, "total_steps": 996, "loss": 0.8044, "lr": 2.563077495356561e-05, "epoch": 1.472535741158766, "percentage": 49.2, "elapsed_time": "1:26:08", "remaining_time": "1:28:56", "throughput": 110.0, "total_tokens": 568480}
99
+ {"current_steps": 495, "total_steps": 996, "loss": 0.9198, "lr": 2.5236562181480794e-05, "epoch": 1.487584650112867, "percentage": 49.7, "elapsed_time": "1:26:59", "remaining_time": "1:28:03", "throughput": 109.98, "total_tokens": 574072}
100
+ {"current_steps": 500, "total_steps": 996, "loss": 0.9181, "lr": 2.484229057146507e-05, "epoch": 1.5026335590669677, "percentage": 50.2, "elapsed_time": "1:27:53", "remaining_time": "1:27:11", "throughput": 109.99, "total_tokens": 580040}
101
+ {"current_steps": 505, "total_steps": 996, "loss": 0.8644, "lr": 2.4448058187061835e-05, "epoch": 1.5176824680210683, "percentage": 50.7, "elapsed_time": "1:28:48", "remaining_time": "1:26:20", "throughput": 110.01, "total_tokens": 586128}
102
+ {"current_steps": 510, "total_steps": 996, "loss": 1.0127, "lr": 2.4053963082058244e-05, "epoch": 1.5327313769751694, "percentage": 51.2, "elapsed_time": "1:29:41", "remaining_time": "1:25:28", "throughput": 110.05, "total_tokens": 592256}
103
+ {"current_steps": 515, "total_steps": 996, "loss": 0.7937, "lr": 2.3660103276097232e-05, "epoch": 1.54778028592927, "percentage": 51.71, "elapsed_time": "1:30:32", "remaining_time": "1:24:33", "throughput": 110.02, "total_tokens": 597704}
104
+ {"current_steps": 520, "total_steps": 996, "loss": 0.9806, "lr": 2.3266576730297956e-05, "epoch": 1.562829194883371, "percentage": 52.21, "elapsed_time": "1:31:23", "remaining_time": "1:23:39", "throughput": 110.01, "total_tokens": 603240}
105
+ {"current_steps": 525, "total_steps": 996, "loss": 0.934, "lr": 2.2873481322890862e-05, "epoch": 1.5778781038374716, "percentage": 52.71, "elapsed_time": "1:32:18", "remaining_time": "1:22:48", "throughput": 110.08, "total_tokens": 609616}
106
+ {"current_steps": 530, "total_steps": 996, "loss": 0.9288, "lr": 2.2480914824873297e-05, "epoch": 1.5929270127915727, "percentage": 53.21, "elapsed_time": "1:33:11", "remaining_time": "1:21:56", "throughput": 110.09, "total_tokens": 615520}
107
+ {"current_steps": 535, "total_steps": 996, "loss": 0.8597, "lr": 2.2088974875691863e-05, "epoch": 1.6079759217456733, "percentage": 53.71, "elapsed_time": "1:34:03", "remaining_time": "1:21:03", "throughput": 110.07, "total_tokens": 621208}
108
+ {"current_steps": 540, "total_steps": 996, "loss": 0.8817, "lr": 2.1697758958957448e-05, "epoch": 1.6230248306997743, "percentage": 54.22, "elapsed_time": "1:34:56", "remaining_time": "1:20:10", "throughput": 110.1, "total_tokens": 627176}
109
+ {"current_steps": 545, "total_steps": 996, "loss": 0.777, "lr": 2.1307364378199005e-05, "epoch": 1.6380737396538751, "percentage": 54.72, "elapsed_time": "1:35:50", "remaining_time": "1:19:18", "throughput": 110.13, "total_tokens": 633248}
110
+ {"current_steps": 550, "total_steps": 996, "loss": 0.798, "lr": 2.0917888232662196e-05, "epoch": 1.653122648607976, "percentage": 55.22, "elapsed_time": "1:36:43", "remaining_time": "1:18:25", "throughput": 110.11, "total_tokens": 639000}
111
+ {"current_steps": 555, "total_steps": 996, "loss": 0.9104, "lr": 2.0529427393158705e-05, "epoch": 1.6681715575620768, "percentage": 55.72, "elapsed_time": "1:37:37", "remaining_time": "1:17:33", "throughput": 110.17, "total_tokens": 645280}
112
+ {"current_steps": 560, "total_steps": 996, "loss": 0.8293, "lr": 2.014207847797256e-05, "epoch": 1.6832204665161776, "percentage": 56.22, "elapsed_time": "1:38:31", "remaining_time": "1:16:42", "throughput": 110.26, "total_tokens": 651760}
113
+ {"current_steps": 565, "total_steps": 996, "loss": 0.8821, "lr": 1.9755937828829067e-05, "epoch": 1.6982693754702785, "percentage": 56.73, "elapsed_time": "1:39:22", "remaining_time": "1:15:48", "throughput": 110.23, "total_tokens": 657272}
114
+ {"current_steps": 570, "total_steps": 996, "loss": 0.8253, "lr": 1.937110148693265e-05, "epoch": 1.7133182844243793, "percentage": 57.23, "elapsed_time": "1:40:15", "remaining_time": "1:14:56", "throughput": 110.27, "total_tokens": 663336}
115
+ {"current_steps": 575, "total_steps": 996, "loss": 0.9391, "lr": 1.8987665169079454e-05, "epoch": 1.72836719337848, "percentage": 57.73, "elapsed_time": "1:41:07", "remaining_time": "1:14:02", "throughput": 110.25, "total_tokens": 668936}
116
+ {"current_steps": 580, "total_steps": 996, "loss": 0.8711, "lr": 1.8605724243850502e-05, "epoch": 1.743416102332581, "percentage": 58.23, "elapsed_time": "1:42:00", "remaining_time": "1:13:10", "throughput": 110.28, "total_tokens": 675000}
117
+ {"current_steps": 585, "total_steps": 996, "loss": 0.8346, "lr": 1.822537370789163e-05, "epoch": 1.7584650112866818, "percentage": 58.73, "elapsed_time": "1:42:52", "remaining_time": "1:12:16", "throughput": 110.26, "total_tokens": 680584}
118
+ {"current_steps": 590, "total_steps": 996, "loss": 0.8275, "lr": 1.7846708162285785e-05, "epoch": 1.7735139202407826, "percentage": 59.24, "elapsed_time": "1:43:44", "remaining_time": "1:11:23", "throughput": 110.27, "total_tokens": 686416}
119
+ {"current_steps": 595, "total_steps": 996, "loss": 0.9435, "lr": 1.7469821789023815e-05, "epoch": 1.7885628291948834, "percentage": 59.74, "elapsed_time": "1:44:35", "remaining_time": "1:10:29", "throughput": 110.27, "total_tokens": 692016}
120
+ {"current_steps": 600, "total_steps": 996, "loss": 0.8584, "lr": 1.70948083275794e-05, "epoch": 1.8036117381489842, "percentage": 60.24, "elapsed_time": "1:45:28", "remaining_time": "1:09:36", "throughput": 110.29, "total_tokens": 697984}
121
+ {"current_steps": 605, "total_steps": 996, "loss": 0.88, "lr": 1.672176105159417e-05, "epoch": 1.818660647103085, "percentage": 60.74, "elapsed_time": "1:46:23", "remaining_time": "1:08:45", "throughput": 110.3, "total_tokens": 704056}
122
+ {"current_steps": 610, "total_steps": 996, "loss": 0.8825, "lr": 1.635077274567854e-05, "epoch": 1.8337095560571859, "percentage": 61.24, "elapsed_time": "1:47:15", "remaining_time": "1:07:52", "throughput": 110.3, "total_tokens": 709760}
123
+ {"current_steps": 615, "total_steps": 996, "loss": 0.9978, "lr": 1.5981935682334264e-05, "epoch": 1.8487584650112867, "percentage": 61.75, "elapsed_time": "1:48:08", "remaining_time": "1:06:59", "throughput": 110.33, "total_tokens": 715872}
124
+ {"current_steps": 620, "total_steps": 996, "loss": 0.9626, "lr": 1.561534159900441e-05, "epoch": 1.8638073739653875, "percentage": 62.25, "elapsed_time": "1:49:02", "remaining_time": "1:06:07", "throughput": 110.38, "total_tokens": 722184}
125
+ {"current_steps": 625, "total_steps": 996, "loss": 0.9308, "lr": 1.525108167525624e-05, "epoch": 1.8788562829194884, "percentage": 62.75, "elapsed_time": "1:49:54", "remaining_time": "1:05:14", "throughput": 110.37, "total_tokens": 727776}
126
+ {"current_steps": 630, "total_steps": 996, "loss": 0.9757, "lr": 1.4889246510103077e-05, "epoch": 1.8939051918735892, "percentage": 63.25, "elapsed_time": "1:50:47", "remaining_time": "1:04:21", "throughput": 110.38, "total_tokens": 733760}
127
+ {"current_steps": 635, "total_steps": 996, "loss": 0.767, "lr": 1.4529926099470348e-05, "epoch": 1.90895410082769, "percentage": 63.76, "elapsed_time": "1:51:41", "remaining_time": "1:03:29", "throughput": 110.43, "total_tokens": 740024}
128
+ {"current_steps": 640, "total_steps": 996, "loss": 0.9272, "lr": 1.4173209813811788e-05, "epoch": 1.9240030097817908, "percentage": 64.26, "elapsed_time": "1:52:32", "remaining_time": "1:02:36", "throughput": 110.4, "total_tokens": 745480}
129
+ {"current_steps": 645, "total_steps": 996, "loss": 0.7941, "lr": 1.381918637588112e-05, "epoch": 1.9390519187358917, "percentage": 64.76, "elapsed_time": "1:53:25", "remaining_time": "1:01:43", "throughput": 110.41, "total_tokens": 751384}
130
+ {"current_steps": 650, "total_steps": 996, "loss": 0.8408, "lr": 1.3467943838664863e-05, "epoch": 1.9541008276899925, "percentage": 65.26, "elapsed_time": "1:54:17", "remaining_time": "1:00:50", "throughput": 110.38, "total_tokens": 756920}
131
+ {"current_steps": 655, "total_steps": 996, "loss": 0.8459, "lr": 1.311956956348177e-05, "epoch": 1.9691497366440933, "percentage": 65.76, "elapsed_time": "1:55:08", "remaining_time": "0:59:56", "throughput": 110.35, "total_tokens": 762424}
132
+ {"current_steps": 660, "total_steps": 996, "loss": 1.0117, "lr": 1.277415019825417e-05, "epoch": 1.9841986455981941, "percentage": 66.27, "elapsed_time": "1:56:01", "remaining_time": "0:59:03", "throughput": 110.36, "total_tokens": 768224}
133
+ {"current_steps": 665, "total_steps": 996, "loss": 0.9665, "lr": 1.2431771655956925e-05, "epoch": 1.999247554552295, "percentage": 66.77, "elapsed_time": "1:56:52", "remaining_time": "0:58:10", "throughput": 110.32, "total_tokens": 773568}
134
+ {"current_steps": 670, "total_steps": 996, "loss": 0.7625, "lr": 1.2092519093248988e-05, "epoch": 2.0120391271632805, "percentage": 67.27, "elapsed_time": "1:57:37", "remaining_time": "0:57:13", "throughput": 110.34, "total_tokens": 778672}
135
+ {"current_steps": 675, "total_steps": 996, "loss": 0.8667, "lr": 1.1756476889293269e-05, "epoch": 2.0270880361173815, "percentage": 67.77, "elapsed_time": "1:58:30", "remaining_time": "0:56:21", "throughput": 110.33, "total_tokens": 784488}
136
+ {"current_steps": 680, "total_steps": 996, "loss": 0.8297, "lr": 1.1423728624769695e-05, "epoch": 2.042136945071482, "percentage": 68.27, "elapsed_time": "1:59:22", "remaining_time": "0:55:28", "throughput": 110.34, "total_tokens": 790304}
137
+ {"current_steps": 685, "total_steps": 996, "loss": 0.8774, "lr": 1.1094357061087033e-05, "epoch": 2.057185854025583, "percentage": 68.78, "elapsed_time": "2:00:15", "remaining_time": "0:54:35", "throughput": 110.35, "total_tokens": 796192}
138
+ {"current_steps": 690, "total_steps": 996, "loss": 0.8476, "lr": 1.0768444119798357e-05, "epoch": 2.072234762979684, "percentage": 69.28, "elapsed_time": "2:01:07", "remaining_time": "0:53:43", "throughput": 110.37, "total_tokens": 802144}
139
+ {"current_steps": 695, "total_steps": 996, "loss": 0.8641, "lr": 1.0446070862225463e-05, "epoch": 2.087283671933785, "percentage": 69.78, "elapsed_time": "2:02:00", "remaining_time": "0:52:50", "throughput": 110.35, "total_tokens": 807768}
140
+ {"current_steps": 700, "total_steps": 996, "loss": 0.8383, "lr": 1.0127317469297277e-05, "epoch": 2.1023325808878854, "percentage": 70.28, "elapsed_time": "2:02:53", "remaining_time": "0:51:57", "throughput": 110.36, "total_tokens": 813712}
141
+ {"current_steps": 705, "total_steps": 996, "loss": 0.9123, "lr": 9.812263221607112e-06, "epoch": 2.1173814898419865, "percentage": 70.78, "elapsed_time": "2:03:46", "remaining_time": "0:51:05", "throughput": 110.32, "total_tokens": 819360}
142
+ {"current_steps": 710, "total_steps": 996, "loss": 0.9635, "lr": 9.500986479694036e-06, "epoch": 2.132430398796087, "percentage": 71.29, "elapsed_time": "2:04:37", "remaining_time": "0:50:11", "throughput": 110.28, "total_tokens": 824584}
143
+ {"current_steps": 715, "total_steps": 996, "loss": 0.9221, "lr": 9.19356466455287e-06, "epoch": 2.147479307750188, "percentage": 71.79, "elapsed_time": "2:05:30", "remaining_time": "0:49:19", "throughput": 110.3, "total_tokens": 830600}
144
+ {"current_steps": 720, "total_steps": 996, "loss": 0.8757, "lr": 8.890074238378074e-06, "epoch": 2.1625282167042887, "percentage": 72.29, "elapsed_time": "2:06:24", "remaining_time": "0:48:27", "throughput": 110.34, "total_tokens": 836856}
145
+ {"current_steps": 725, "total_steps": 996, "loss": 0.7958, "lr": 8.590590685545946e-06, "epoch": 2.17757712565839, "percentage": 72.79, "elapsed_time": "2:07:17", "remaining_time": "0:47:34", "throughput": 110.36, "total_tokens": 842872}
146
+ {"current_steps": 730, "total_steps": 996, "loss": 0.7993, "lr": 8.295188493840104e-06, "epoch": 2.1926260346124904, "percentage": 73.29, "elapsed_time": "2:08:10", "remaining_time": "0:46:42", "throughput": 110.36, "total_tokens": 848664}
147
+ {"current_steps": 735, "total_steps": 996, "loss": 0.8436, "lr": 8.003941135924858e-06, "epoch": 2.2076749435665914, "percentage": 73.8, "elapsed_time": "2:09:04", "remaining_time": "0:45:49", "throughput": 110.37, "total_tokens": 854712}
148
+ {"current_steps": 740, "total_steps": 996, "loss": 0.896, "lr": 7.71692105107098e-06, "epoch": 2.222723852520692, "percentage": 74.3, "elapsed_time": "2:09:56", "remaining_time": "0:44:57", "throughput": 110.38, "total_tokens": 860648}
149
+ {"current_steps": 745, "total_steps": 996, "loss": 0.8948, "lr": 7.434199627138602e-06, "epoch": 2.237772761474793, "percentage": 74.8, "elapsed_time": "2:10:48", "remaining_time": "0:44:04", "throughput": 110.35, "total_tokens": 866080}
150
+ {"current_steps": 750, "total_steps": 996, "loss": 0.8546, "lr": 7.155847182821523e-06, "epoch": 2.2528216704288937, "percentage": 75.3, "elapsed_time": "2:11:39", "remaining_time": "0:43:11", "throughput": 110.33, "total_tokens": 871560}
151
+ {"current_steps": 755, "total_steps": 996, "loss": 0.8494, "lr": 6.881932950157538e-06, "epoch": 2.2678705793829947, "percentage": 75.8, "elapsed_time": "2:12:32", "remaining_time": "0:42:18", "throughput": 110.35, "total_tokens": 877568}
152
+ {"current_steps": 760, "total_steps": 996, "loss": 0.7723, "lr": 6.612525057308949e-06, "epoch": 2.282919488337096, "percentage": 76.31, "elapsed_time": "2:13:25", "remaining_time": "0:41:26", "throughput": 110.4, "total_tokens": 883808}
153
+ {"current_steps": 765, "total_steps": 996, "loss": 0.9168, "lr": 6.347690511617693e-06, "epoch": 2.2979683972911964, "percentage": 76.81, "elapsed_time": "2:14:17", "remaining_time": "0:40:32", "throughput": 110.37, "total_tokens": 889296}
154
+ {"current_steps": 770, "total_steps": 996, "loss": 0.8831, "lr": 6.0874951829392234e-06, "epoch": 2.313017306245297, "percentage": 77.31, "elapsed_time": "2:15:09", "remaining_time": "0:39:40", "throughput": 110.38, "total_tokens": 895120}
155
+ {"current_steps": 775, "total_steps": 996, "loss": 0.854, "lr": 5.832003787259327e-06, "epoch": 2.328066215199398, "percentage": 77.81, "elapsed_time": "2:16:00", "remaining_time": "0:38:47", "throughput": 110.32, "total_tokens": 900320}
156
+ {"current_steps": 780, "total_steps": 996, "loss": 0.8843, "lr": 5.581279870597867e-06, "epoch": 2.343115124153499, "percentage": 78.31, "elapsed_time": "2:16:52", "remaining_time": "0:37:54", "throughput": 110.32, "total_tokens": 905928}
157
+ {"current_steps": 785, "total_steps": 996, "loss": 0.862, "lr": 5.335385793203604e-06, "epoch": 2.3581640331075997, "percentage": 78.82, "elapsed_time": "2:17:44", "remaining_time": "0:37:01", "throughput": 110.34, "total_tokens": 911976}
158
+ {"current_steps": 790, "total_steps": 996, "loss": 0.985, "lr": 5.094382714043907e-06, "epoch": 2.3732129420617003, "percentage": 79.32, "elapsed_time": "2:18:37", "remaining_time": "0:36:08", "throughput": 110.34, "total_tokens": 917840}
159
+ {"current_steps": 795, "total_steps": 996, "loss": 0.7679, "lr": 4.85833057559322e-06, "epoch": 2.3882618510158014, "percentage": 79.82, "elapsed_time": "2:19:28", "remaining_time": "0:35:15", "throughput": 110.31, "total_tokens": 923168}
160
+ {"current_steps": 800, "total_steps": 996, "loss": 0.8198, "lr": 4.627288088924156e-06, "epoch": 2.4033107599699024, "percentage": 80.32, "elapsed_time": "2:20:20", "remaining_time": "0:34:22", "throughput": 110.3, "total_tokens": 928720}
161
+ {"current_steps": 805, "total_steps": 996, "loss": 0.7773, "lr": 4.401312719104802e-06, "epoch": 2.418359668924003, "percentage": 80.82, "elapsed_time": "2:21:14", "remaining_time": "0:33:30", "throughput": 110.28, "total_tokens": 934568}
162
+ {"current_steps": 810, "total_steps": 996, "loss": 0.9312, "lr": 4.180460670905978e-06, "epoch": 2.4334085778781036, "percentage": 81.33, "elapsed_time": "2:22:06", "remaining_time": "0:32:38", "throughput": 110.27, "total_tokens": 940264}
163
+ {"current_steps": 815, "total_steps": 996, "loss": 0.8497, "lr": 3.964786874821955e-06, "epoch": 2.4484574868322047, "percentage": 81.83, "elapsed_time": "2:22:59", "remaining_time": "0:31:45", "throughput": 110.28, "total_tokens": 946128}
164
+ {"current_steps": 820, "total_steps": 996, "loss": 0.782, "lr": 3.754344973408064e-06, "epoch": 2.4635063957863057, "percentage": 82.33, "elapsed_time": "2:23:52", "remaining_time": "0:30:52", "throughput": 110.29, "total_tokens": 952032}
165
+ {"current_steps": 825, "total_steps": 996, "loss": 0.8937, "lr": 3.5491873079387256e-06, "epoch": 2.4785553047404063, "percentage": 82.83, "elapsed_time": "2:24:44", "remaining_time": "0:30:00", "throughput": 110.31, "total_tokens": 957960}
166
+ {"current_steps": 830, "total_steps": 996, "loss": 0.7039, "lr": 3.3493649053890326e-06, "epoch": 2.493604213694507, "percentage": 83.33, "elapsed_time": "2:25:38", "remaining_time": "0:29:07", "throughput": 110.36, "total_tokens": 964336}
167
+ {"current_steps": 835, "total_steps": 996, "loss": 0.9265, "lr": 3.1549274657433375e-06, "epoch": 2.508653122648608, "percentage": 83.84, "elapsed_time": "2:26:30", "remaining_time": "0:28:15", "throughput": 110.36, "total_tokens": 970168}
168
+ {"current_steps": 840, "total_steps": 996, "loss": 0.8669, "lr": 2.9659233496337786e-06, "epoch": 2.523702031602709, "percentage": 84.34, "elapsed_time": "2:27:21", "remaining_time": "0:27:22", "throughput": 110.36, "total_tokens": 975752}
169
+ {"current_steps": 845, "total_steps": 996, "loss": 0.9174, "lr": 2.7823995663120327e-06, "epoch": 2.5387509405568096, "percentage": 84.84, "elapsed_time": "2:28:14", "remaining_time": "0:26:29", "throughput": 110.37, "total_tokens": 981672}
170
+ {"current_steps": 850, "total_steps": 996, "loss": 0.8718, "lr": 2.6044017619571065e-06, "epoch": 2.55379984951091, "percentage": 85.34, "elapsed_time": "2:29:06", "remaining_time": "0:25:36", "throughput": 110.38, "total_tokens": 987560}
171
+ {"current_steps": 855, "total_steps": 996, "loss": 0.8634, "lr": 2.431974208322191e-06, "epoch": 2.5688487584650113, "percentage": 85.84, "elapsed_time": "2:29:58", "remaining_time": "0:24:44", "throughput": 110.37, "total_tokens": 993200}
172
+ {"current_steps": 860, "total_steps": 996, "loss": 0.845, "lr": 2.265159791723373e-06, "epoch": 2.5838976674191123, "percentage": 86.35, "elapsed_time": "2:30:52", "remaining_time": "0:23:51", "throughput": 110.38, "total_tokens": 999192}
173
+ {"current_steps": 865, "total_steps": 996, "loss": 0.8008, "lr": 2.104000002372886e-06, "epoch": 2.598946576373213, "percentage": 86.85, "elapsed_time": "2:31:42", "remaining_time": "0:22:58", "throughput": 110.36, "total_tokens": 1004576}
174
+ {"current_steps": 870, "total_steps": 996, "loss": 0.8797, "lr": 1.9485349240596613e-06, "epoch": 2.6139954853273135, "percentage": 87.35, "elapsed_time": "2:32:34", "remaining_time": "0:22:05", "throughput": 110.36, "total_tokens": 1010352}
175
+ {"current_steps": 875, "total_steps": 996, "loss": 0.946, "lr": 1.7988032241796376e-06, "epoch": 2.6290443942814146, "percentage": 87.85, "elapsed_time": "2:33:27", "remaining_time": "0:21:13", "throughput": 110.37, "total_tokens": 1016272}
176
+ {"current_steps": 880, "total_steps": 996, "loss": 0.8032, "lr": 1.6548421441183875e-06, "epoch": 2.6440933032355156, "percentage": 88.35, "elapsed_time": "2:34:19", "remaining_time": "0:20:20", "throughput": 110.36, "total_tokens": 1021896}
177
+ {"current_steps": 885, "total_steps": 996, "loss": 0.8892, "lr": 1.5166874899884053e-06, "epoch": 2.659142212189616, "percentage": 88.86, "elapsed_time": "2:35:11", "remaining_time": "0:19:27", "throughput": 110.37, "total_tokens": 1027704}
178
+ {"current_steps": 890, "total_steps": 996, "loss": 0.856, "lr": 1.3843736237233784e-06, "epoch": 2.674191121143717, "percentage": 89.36, "elapsed_time": "2:36:04", "remaining_time": "0:18:35", "throughput": 110.39, "total_tokens": 1033800}
179
+ {"current_steps": 895, "total_steps": 996, "loss": 0.8617, "lr": 1.2579334545316733e-06, "epoch": 2.689240030097818, "percentage": 89.86, "elapsed_time": "2:36:58", "remaining_time": "0:17:42", "throughput": 110.43, "total_tokens": 1040008}
180
+ {"current_steps": 900, "total_steps": 996, "loss": 0.9117, "lr": 1.137398430711123e-06, "epoch": 2.704288939051919, "percentage": 90.36, "elapsed_time": "2:37:52", "remaining_time": "0:16:50", "throughput": 110.46, "total_tokens": 1046272}
181
+ {"current_steps": 905, "total_steps": 996, "loss": 0.7855, "lr": 1.0227985318271682e-06, "epoch": 2.7193378480060195, "percentage": 90.86, "elapsed_time": "2:38:45", "remaining_time": "0:15:57", "throughput": 110.44, "total_tokens": 1052032}
182
+ {"current_steps": 910, "total_steps": 996, "loss": 0.8212, "lr": 9.141622612563571e-07, "epoch": 2.73438675696012, "percentage": 91.37, "elapsed_time": "2:39:37", "remaining_time": "0:15:05", "throughput": 110.42, "total_tokens": 1057584}
183
+ {"current_steps": 915, "total_steps": 996, "loss": 0.8404, "lr": 8.115166390969125e-07, "epoch": 2.749435665914221, "percentage": 91.87, "elapsed_time": "2:40:31", "remaining_time": "0:14:12", "throughput": 110.45, "total_tokens": 1063760}
184
+ {"current_steps": 920, "total_steps": 996, "loss": 0.7782, "lr": 7.148871954483105e-07, "epoch": 2.764484574868322, "percentage": 92.37, "elapsed_time": "2:41:23", "remaining_time": "0:13:19", "throughput": 110.45, "total_tokens": 1069544}
185
+ {"current_steps": 925, "total_steps": 996, "loss": 0.7847, "lr": 6.242979640613933e-07, "epoch": 2.779533483822423, "percentage": 92.87, "elapsed_time": "2:42:16", "remaining_time": "0:12:27", "throughput": 110.45, "total_tokens": 1075472}
186
+ {"current_steps": 930, "total_steps": 996, "loss": 0.8857, "lr": 5.397714763606843e-07, "epoch": 2.7945823927765234, "percentage": 93.37, "elapsed_time": "2:43:10", "remaining_time": "0:11:34", "throughput": 110.46, "total_tokens": 1081464}
187
+ {"current_steps": 935, "total_steps": 996, "loss": 0.8029, "lr": 4.613287558403512e-07, "epoch": 2.8096313017306245, "percentage": 93.88, "elapsed_time": "2:44:03", "remaining_time": "0:10:42", "throughput": 110.48, "total_tokens": 1087464}
188
+ {"current_steps": 940, "total_steps": 996, "loss": 0.8154, "lr": 3.8898931283523344e-07, "epoch": 2.8246802106847255, "percentage": 94.38, "elapsed_time": "2:44:54", "remaining_time": "0:09:49", "throughput": 110.45, "total_tokens": 1092888}
189
+ {"current_steps": 945, "total_steps": 996, "loss": 0.8791, "lr": 3.227711396682015e-07, "epoch": 2.839729119638826, "percentage": 94.88, "elapsed_time": "2:45:48", "remaining_time": "0:08:56", "throughput": 110.45, "total_tokens": 1098808}
190
+ {"current_steps": 950, "total_steps": 996, "loss": 0.787, "lr": 2.626907061751116e-07, "epoch": 2.854778028592927, "percentage": 95.38, "elapsed_time": "2:46:40", "remaining_time": "0:08:04", "throughput": 110.46, "total_tokens": 1104688}
191
+ {"current_steps": 955, "total_steps": 996, "loss": 0.8831, "lr": 2.0876295560839364e-07, "epoch": 2.869826937547028, "percentage": 95.88, "elapsed_time": "2:47:34", "remaining_time": "0:07:11", "throughput": 110.49, "total_tokens": 1110960}
192
+ {"current_steps": 960, "total_steps": 996, "loss": 0.7677, "lr": 1.6100130092037703e-07, "epoch": 2.884875846501129, "percentage": 96.39, "elapsed_time": "2:48:27", "remaining_time": "0:06:19", "throughput": 110.49, "total_tokens": 1116800}
193
+ {"current_steps": 965, "total_steps": 996, "loss": 0.7567, "lr": 1.194176214271897e-07, "epoch": 2.8999247554552294, "percentage": 96.89, "elapsed_time": "2:49:18", "remaining_time": "0:05:26", "throughput": 110.47, "total_tokens": 1122248}
194
+ {"current_steps": 970, "total_steps": 996, "loss": 0.8944, "lr": 8.402225985413848e-08, "epoch": 2.9149736644093305, "percentage": 97.39, "elapsed_time": "2:50:10", "remaining_time": "0:04:33", "throughput": 110.46, "total_tokens": 1127928}
195
+ {"current_steps": 975, "total_steps": 996, "loss": 0.9737, "lr": 5.4824019763252685e-08, "epoch": 2.930022573363431, "percentage": 97.89, "elapsed_time": "2:51:01", "remaining_time": "0:03:41", "throughput": 110.44, "total_tokens": 1133336}
196
+ {"current_steps": 980, "total_steps": 996, "loss": 0.8965, "lr": 3.1830163363655296e-08, "epoch": 2.945071482317532, "percentage": 98.39, "elapsed_time": "2:51:54", "remaining_time": "0:02:48", "throughput": 110.44, "total_tokens": 1139048}
197
+ {"current_steps": 985, "total_steps": 996, "loss": 0.837, "lr": 1.504640970531046e-08, "epoch": 2.9601203912716327, "percentage": 98.9, "elapsed_time": "2:52:45", "remaining_time": "0:01:55", "throughput": 110.41, "total_tokens": 1144456}
198
+ {"current_steps": 990, "total_steps": 996, "loss": 0.7812, "lr": 4.4769332565558485e-09, "epoch": 2.975169300225734, "percentage": 99.4, "elapsed_time": "2:53:37", "remaining_time": "0:01:03", "throughput": 110.4, "total_tokens": 1150160}
199
+ {"current_steps": 995, "total_steps": 996, "loss": 0.8613, "lr": 1.2436286584982527e-10, "epoch": 2.9902182091798344, "percentage": 99.9, "elapsed_time": "2:54:33", "remaining_time": "0:00:10", "throughput": 110.45, "total_tokens": 1156704}
200
+ {"current_steps": 996, "total_steps": 996, "epoch": 2.9932279909706545, "percentage": 100.0, "elapsed_time": "2:54:44", "remaining_time": "0:00:00", "throughput": 110.43, "total_tokens": 1157808}
trainer_state.json ADDED
@@ -0,0 +1,1636 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.9932279909706545,
6
+ "eval_steps": 500,
7
+ "global_step": 996,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.015048908954100828,
14
+ "grad_norm": 1.2988319396972656,
15
+ "learning_rate": 4.9996890990217804e-05,
16
+ "loss": 2.4707,
17
+ "num_input_tokens_seen": 5864,
18
+ "step": 5
19
+ },
20
+ {
21
+ "epoch": 0.030097817908201655,
22
+ "grad_norm": 1.8058427572250366,
23
+ "learning_rate": 4.9987564734146566e-05,
24
+ "loss": 2.2509,
25
+ "num_input_tokens_seen": 11432,
26
+ "step": 10
27
+ },
28
+ {
29
+ "epoch": 0.045146726862302484,
30
+ "grad_norm": 0.8231738209724426,
31
+ "learning_rate": 4.997202355141999e-05,
32
+ "loss": 1.6895,
33
+ "num_input_tokens_seen": 17000,
34
+ "step": 15
35
+ },
36
+ {
37
+ "epoch": 0.06019563581640331,
38
+ "grad_norm": 0.7266705632209778,
39
+ "learning_rate": 4.995027130745321e-05,
40
+ "loss": 1.4876,
41
+ "num_input_tokens_seen": 22840,
42
+ "step": 20
43
+ },
44
+ {
45
+ "epoch": 0.07524454477050414,
46
+ "grad_norm": 1.1722582578659058,
47
+ "learning_rate": 4.992231341248137e-05,
48
+ "loss": 1.4812,
49
+ "num_input_tokens_seen": 28984,
50
+ "step": 25
51
+ },
52
+ {
53
+ "epoch": 0.09029345372460497,
54
+ "grad_norm": 0.9262341260910034,
55
+ "learning_rate": 4.9888156820213974e-05,
56
+ "loss": 1.3642,
57
+ "num_input_tokens_seen": 34856,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 0.1053423626787058,
62
+ "grad_norm": 0.8832902908325195,
63
+ "learning_rate": 4.9847810026105394e-05,
64
+ "loss": 1.3651,
65
+ "num_input_tokens_seen": 41216,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 0.12039127163280662,
70
+ "grad_norm": 0.8503655791282654,
71
+ "learning_rate": 4.980128306524183e-05,
72
+ "loss": 1.1321,
73
+ "num_input_tokens_seen": 47304,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 0.13544018058690746,
78
+ "grad_norm": 1.348948359489441,
79
+ "learning_rate": 4.97485875098454e-05,
80
+ "loss": 1.3012,
81
+ "num_input_tokens_seen": 53184,
82
+ "step": 45
83
+ },
84
+ {
85
+ "epoch": 0.1504890895410083,
86
+ "grad_norm": 0.7177269458770752,
87
+ "learning_rate": 4.968973646639589e-05,
88
+ "loss": 0.9827,
89
+ "num_input_tokens_seen": 59024,
90
+ "step": 50
91
+ },
92
+ {
93
+ "epoch": 0.1655379984951091,
94
+ "grad_norm": 0.6005258560180664,
95
+ "learning_rate": 4.9624744572370865e-05,
96
+ "loss": 1.2313,
97
+ "num_input_tokens_seen": 64816,
98
+ "step": 55
99
+ },
100
+ {
101
+ "epoch": 0.18058690744920994,
102
+ "grad_norm": 0.6153081059455872,
103
+ "learning_rate": 4.9553627992605066e-05,
104
+ "loss": 1.0347,
105
+ "num_input_tokens_seen": 70848,
106
+ "step": 60
107
+ },
108
+ {
109
+ "epoch": 0.19563581640331076,
110
+ "grad_norm": 0.7796200513839722,
111
+ "learning_rate": 4.947640441526989e-05,
112
+ "loss": 1.0422,
113
+ "num_input_tokens_seen": 76888,
114
+ "step": 65
115
+ },
116
+ {
117
+ "epoch": 0.2106847253574116,
118
+ "grad_norm": 0.7273033857345581,
119
+ "learning_rate": 4.939309304747391e-05,
120
+ "loss": 0.9996,
121
+ "num_input_tokens_seen": 82840,
122
+ "step": 70
123
+ },
124
+ {
125
+ "epoch": 0.22573363431151242,
126
+ "grad_norm": 0.7943289875984192,
127
+ "learning_rate": 4.930371461048571e-05,
128
+ "loss": 1.0755,
129
+ "num_input_tokens_seen": 88824,
130
+ "step": 75
131
+ },
132
+ {
133
+ "epoch": 0.24078254326561324,
134
+ "grad_norm": 0.6128024458885193,
135
+ "learning_rate": 4.9208291334580104e-05,
136
+ "loss": 1.026,
137
+ "num_input_tokens_seen": 94264,
138
+ "step": 80
139
+ },
140
+ {
141
+ "epoch": 0.2558314522197141,
142
+ "grad_norm": 0.7087495923042297,
143
+ "learning_rate": 4.910684695350895e-05,
144
+ "loss": 1.1307,
145
+ "num_input_tokens_seen": 99896,
146
+ "step": 85
147
+ },
148
+ {
149
+ "epoch": 0.2708803611738149,
150
+ "grad_norm": 0.711476743221283,
151
+ "learning_rate": 4.8999406698598074e-05,
152
+ "loss": 1.0221,
153
+ "num_input_tokens_seen": 105640,
154
+ "step": 90
155
+ },
156
+ {
157
+ "epoch": 0.28592927012791575,
158
+ "grad_norm": 0.5772566795349121,
159
+ "learning_rate": 4.8885997292471774e-05,
160
+ "loss": 1.012,
161
+ "num_input_tokens_seen": 111280,
162
+ "step": 95
163
+ },
164
+ {
165
+ "epoch": 0.3009781790820166,
166
+ "grad_norm": 0.6769325137138367,
167
+ "learning_rate": 4.87666469424063e-05,
168
+ "loss": 1.0151,
169
+ "num_input_tokens_seen": 116640,
170
+ "step": 100
171
+ },
172
+ {
173
+ "epoch": 0.3160270880361174,
174
+ "grad_norm": 0.679373025894165,
175
+ "learning_rate": 4.86413853333141e-05,
176
+ "loss": 1.0028,
177
+ "num_input_tokens_seen": 121864,
178
+ "step": 105
179
+ },
180
+ {
181
+ "epoch": 0.3310759969902182,
182
+ "grad_norm": 0.9181504845619202,
183
+ "learning_rate": 4.851024362036064e-05,
184
+ "loss": 1.143,
185
+ "num_input_tokens_seen": 127384,
186
+ "step": 110
187
+ },
188
+ {
189
+ "epoch": 0.34612490594431905,
190
+ "grad_norm": 0.7842696905136108,
191
+ "learning_rate": 4.837325442121538e-05,
192
+ "loss": 0.9695,
193
+ "num_input_tokens_seen": 133008,
194
+ "step": 115
195
+ },
196
+ {
197
+ "epoch": 0.3611738148984199,
198
+ "grad_norm": 0.6459535360336304,
199
+ "learning_rate": 4.8230451807939135e-05,
200
+ "loss": 0.9017,
201
+ "num_input_tokens_seen": 139144,
202
+ "step": 120
203
+ },
204
+ {
205
+ "epoch": 0.3762227238525207,
206
+ "grad_norm": 0.6695935726165771,
207
+ "learning_rate": 4.808187129850963e-05,
208
+ "loss": 1.035,
209
+ "num_input_tokens_seen": 144848,
210
+ "step": 125
211
+ },
212
+ {
213
+ "epoch": 0.3912716328066215,
214
+ "grad_norm": 0.9289236664772034,
215
+ "learning_rate": 4.792754984798745e-05,
216
+ "loss": 1.0128,
217
+ "num_input_tokens_seen": 150480,
218
+ "step": 130
219
+ },
220
+ {
221
+ "epoch": 0.40632054176072235,
222
+ "grad_norm": 0.6192979216575623,
223
+ "learning_rate": 4.776752583932454e-05,
224
+ "loss": 0.9432,
225
+ "num_input_tokens_seen": 156336,
226
+ "step": 135
227
+ },
228
+ {
229
+ "epoch": 0.4213694507148232,
230
+ "grad_norm": 0.7946303486824036,
231
+ "learning_rate": 4.760183907381757e-05,
232
+ "loss": 1.0344,
233
+ "num_input_tokens_seen": 162440,
234
+ "step": 140
235
+ },
236
+ {
237
+ "epoch": 0.436418359668924,
238
+ "grad_norm": 0.6548484563827515,
239
+ "learning_rate": 4.7430530761208494e-05,
240
+ "loss": 0.9452,
241
+ "num_input_tokens_seen": 168304,
242
+ "step": 145
243
+ },
244
+ {
245
+ "epoch": 0.45146726862302483,
246
+ "grad_norm": 0.9075986742973328,
247
+ "learning_rate": 4.725364350943492e-05,
248
+ "loss": 0.9559,
249
+ "num_input_tokens_seen": 173984,
250
+ "step": 150
251
+ },
252
+ {
253
+ "epoch": 0.46651617757712566,
254
+ "grad_norm": 0.8047800660133362,
255
+ "learning_rate": 4.707122131403251e-05,
256
+ "loss": 0.9726,
257
+ "num_input_tokens_seen": 179896,
258
+ "step": 155
259
+ },
260
+ {
261
+ "epoch": 0.4815650865312265,
262
+ "grad_norm": 0.6954847574234009,
263
+ "learning_rate": 4.6883309547192476e-05,
264
+ "loss": 0.9344,
265
+ "num_input_tokens_seen": 185296,
266
+ "step": 160
267
+ },
268
+ {
269
+ "epoch": 0.4966139954853273,
270
+ "grad_norm": 0.7912609577178955,
271
+ "learning_rate": 4.668995494647653e-05,
272
+ "loss": 0.9497,
273
+ "num_input_tokens_seen": 190928,
274
+ "step": 165
275
+ },
276
+ {
277
+ "epoch": 0.5116629044394282,
278
+ "grad_norm": 0.7360678315162659,
279
+ "learning_rate": 4.649120560319225e-05,
280
+ "loss": 1.057,
281
+ "num_input_tokens_seen": 197352,
282
+ "step": 170
283
+ },
284
+ {
285
+ "epoch": 0.526711813393529,
286
+ "grad_norm": 0.7325194478034973,
287
+ "learning_rate": 4.6287110950431865e-05,
288
+ "loss": 0.9847,
289
+ "num_input_tokens_seen": 203216,
290
+ "step": 175
291
+ },
292
+ {
293
+ "epoch": 0.5417607223476298,
294
+ "grad_norm": 0.7140082120895386,
295
+ "learning_rate": 4.607772175077711e-05,
296
+ "loss": 1.001,
297
+ "num_input_tokens_seen": 208624,
298
+ "step": 180
299
+ },
300
+ {
301
+ "epoch": 0.5568096313017307,
302
+ "grad_norm": 0.9454194903373718,
303
+ "learning_rate": 4.586309008367359e-05,
304
+ "loss": 0.9384,
305
+ "num_input_tokens_seen": 214552,
306
+ "step": 185
307
+ },
308
+ {
309
+ "epoch": 0.5718585402558315,
310
+ "grad_norm": 0.9370235800743103,
311
+ "learning_rate": 4.564326933247752e-05,
312
+ "loss": 1.0312,
313
+ "num_input_tokens_seen": 220704,
314
+ "step": 190
315
+ },
316
+ {
317
+ "epoch": 0.5869074492099323,
318
+ "grad_norm": 0.7274216413497925,
319
+ "learning_rate": 4.541831417117815e-05,
320
+ "loss": 0.9112,
321
+ "num_input_tokens_seen": 226480,
322
+ "step": 195
323
+ },
324
+ {
325
+ "epoch": 0.6019563581640331,
326
+ "grad_norm": 0.9026529788970947,
327
+ "learning_rate": 4.518828055079925e-05,
328
+ "loss": 0.9967,
329
+ "num_input_tokens_seen": 232136,
330
+ "step": 200
331
+ },
332
+ {
333
+ "epoch": 0.617005267118134,
334
+ "grad_norm": 0.9668667316436768,
335
+ "learning_rate": 4.4953225685482904e-05,
336
+ "loss": 1.0905,
337
+ "num_input_tokens_seen": 238072,
338
+ "step": 205
339
+ },
340
+ {
341
+ "epoch": 0.6320541760722348,
342
+ "grad_norm": 0.7728851437568665,
343
+ "learning_rate": 4.471320803825915e-05,
344
+ "loss": 0.9487,
345
+ "num_input_tokens_seen": 243680,
346
+ "step": 210
347
+ },
348
+ {
349
+ "epoch": 0.6471030850263356,
350
+ "grad_norm": 0.7141396999359131,
351
+ "learning_rate": 4.4468287306505045e-05,
352
+ "loss": 0.8675,
353
+ "num_input_tokens_seen": 249376,
354
+ "step": 215
355
+ },
356
+ {
357
+ "epoch": 0.6621519939804364,
358
+ "grad_norm": 0.7524191737174988,
359
+ "learning_rate": 4.421852440709666e-05,
360
+ "loss": 0.8624,
361
+ "num_input_tokens_seen": 255288,
362
+ "step": 220
363
+ },
364
+ {
365
+ "epoch": 0.6772009029345373,
366
+ "grad_norm": 1.1502355337142944,
367
+ "learning_rate": 4.39639814612578e-05,
368
+ "loss": 1.0489,
369
+ "num_input_tokens_seen": 261592,
370
+ "step": 225
371
+ },
372
+ {
373
+ "epoch": 0.6922498118886381,
374
+ "grad_norm": 0.7467320561408997,
375
+ "learning_rate": 4.370472177910914e-05,
376
+ "loss": 0.9139,
377
+ "num_input_tokens_seen": 267192,
378
+ "step": 230
379
+ },
380
+ {
381
+ "epoch": 0.7072987208427389,
382
+ "grad_norm": 0.6400129795074463,
383
+ "learning_rate": 4.3440809843921725e-05,
384
+ "loss": 0.9905,
385
+ "num_input_tokens_seen": 272712,
386
+ "step": 235
387
+ },
388
+ {
389
+ "epoch": 0.7223476297968398,
390
+ "grad_norm": 0.6654481291770935,
391
+ "learning_rate": 4.3172311296078595e-05,
392
+ "loss": 0.8974,
393
+ "num_input_tokens_seen": 278720,
394
+ "step": 240
395
+ },
396
+ {
397
+ "epoch": 0.7373965387509406,
398
+ "grad_norm": 0.7487585544586182,
399
+ "learning_rate": 4.28992929167487e-05,
400
+ "loss": 0.999,
401
+ "num_input_tokens_seen": 284584,
402
+ "step": 245
403
+ },
404
+ {
405
+ "epoch": 0.7524454477050414,
406
+ "grad_norm": 0.6885581612586975,
407
+ "learning_rate": 4.2621822611277e-05,
408
+ "loss": 0.9916,
409
+ "num_input_tokens_seen": 290408,
410
+ "step": 250
411
+ },
412
+ {
413
+ "epoch": 0.7674943566591422,
414
+ "grad_norm": 0.774027407169342,
415
+ "learning_rate": 4.233996939229502e-05,
416
+ "loss": 0.9242,
417
+ "num_input_tokens_seen": 295776,
418
+ "step": 255
419
+ },
420
+ {
421
+ "epoch": 0.782543265613243,
422
+ "grad_norm": 0.8608073592185974,
423
+ "learning_rate": 4.205380336255594e-05,
424
+ "loss": 1.0426,
425
+ "num_input_tokens_seen": 301736,
426
+ "step": 260
427
+ },
428
+ {
429
+ "epoch": 0.7975921745673439,
430
+ "grad_norm": 0.6539498567581177,
431
+ "learning_rate": 4.176339569749865e-05,
432
+ "loss": 0.8625,
433
+ "num_input_tokens_seen": 307224,
434
+ "step": 265
435
+ },
436
+ {
437
+ "epoch": 0.8126410835214447,
438
+ "grad_norm": 0.8432996273040771,
439
+ "learning_rate": 4.1468818627544845e-05,
440
+ "loss": 0.9959,
441
+ "num_input_tokens_seen": 313040,
442
+ "step": 270
443
+ },
444
+ {
445
+ "epoch": 0.8276899924755455,
446
+ "grad_norm": 0.877001166343689,
447
+ "learning_rate": 4.11701454201339e-05,
448
+ "loss": 0.939,
449
+ "num_input_tokens_seen": 319112,
450
+ "step": 275
451
+ },
452
+ {
453
+ "epoch": 0.8427389014296464,
454
+ "grad_norm": 0.9003238081932068,
455
+ "learning_rate": 4.08674503614997e-05,
456
+ "loss": 0.9741,
457
+ "num_input_tokens_seen": 325040,
458
+ "step": 280
459
+ },
460
+ {
461
+ "epoch": 0.8577878103837472,
462
+ "grad_norm": 0.8585950136184692,
463
+ "learning_rate": 4.0560808738194114e-05,
464
+ "loss": 0.98,
465
+ "num_input_tokens_seen": 330904,
466
+ "step": 285
467
+ },
468
+ {
469
+ "epoch": 0.872836719337848,
470
+ "grad_norm": 0.8015385270118713,
471
+ "learning_rate": 4.0250296818361647e-05,
472
+ "loss": 0.8898,
473
+ "num_input_tokens_seen": 336392,
474
+ "step": 290
475
+ },
476
+ {
477
+ "epoch": 0.8878856282919488,
478
+ "grad_norm": 0.8380082845687866,
479
+ "learning_rate": 3.993599183277001e-05,
480
+ "loss": 0.953,
481
+ "num_input_tokens_seen": 342832,
482
+ "step": 295
483
+ },
484
+ {
485
+ "epoch": 0.9029345372460497,
486
+ "grad_norm": 0.8890098929405212,
487
+ "learning_rate": 3.961797195560118e-05,
488
+ "loss": 0.9311,
489
+ "num_input_tokens_seen": 348944,
490
+ "step": 300
491
+ },
492
+ {
493
+ "epoch": 0.9179834462001505,
494
+ "grad_norm": 0.9356483221054077,
495
+ "learning_rate": 3.9296316285007887e-05,
496
+ "loss": 0.9114,
497
+ "num_input_tokens_seen": 354680,
498
+ "step": 305
499
+ },
500
+ {
501
+ "epoch": 0.9330323551542513,
502
+ "grad_norm": 0.8241044878959656,
503
+ "learning_rate": 3.897110482344024e-05,
504
+ "loss": 0.9674,
505
+ "num_input_tokens_seen": 361008,
506
+ "step": 310
507
+ },
508
+ {
509
+ "epoch": 0.9480812641083521,
510
+ "grad_norm": 0.7882922887802124,
511
+ "learning_rate": 3.864241845774746e-05,
512
+ "loss": 0.9582,
513
+ "num_input_tokens_seen": 366760,
514
+ "step": 315
515
+ },
516
+ {
517
+ "epoch": 0.963130173062453,
518
+ "grad_norm": 0.7503064274787903,
519
+ "learning_rate": 3.8310338939059644e-05,
520
+ "loss": 0.9863,
521
+ "num_input_tokens_seen": 372448,
522
+ "step": 320
523
+ },
524
+ {
525
+ "epoch": 0.9781790820165538,
526
+ "grad_norm": 0.6487952470779419,
527
+ "learning_rate": 3.797494886245456e-05,
528
+ "loss": 0.906,
529
+ "num_input_tokens_seen": 378520,
530
+ "step": 325
531
+ },
532
+ {
533
+ "epoch": 0.9932279909706546,
534
+ "grad_norm": 0.8584316968917847,
535
+ "learning_rate": 3.7636331646414524e-05,
536
+ "loss": 0.8958,
537
+ "num_input_tokens_seen": 384272,
538
+ "step": 330
539
+ },
540
+ {
541
+ "epoch": 1.0060195635816402,
542
+ "grad_norm": 0.8825767040252686,
543
+ "learning_rate": 3.7294571512078506e-05,
544
+ "loss": 0.8349,
545
+ "num_input_tokens_seen": 389280,
546
+ "step": 335
547
+ },
548
+ {
549
+ "epoch": 1.021068472535741,
550
+ "grad_norm": 0.8422874808311462,
551
+ "learning_rate": 3.694975346229458e-05,
552
+ "loss": 0.8507,
553
+ "num_input_tokens_seen": 394944,
554
+ "step": 340
555
+ },
556
+ {
557
+ "epoch": 1.036117381489842,
558
+ "grad_norm": 0.8337146639823914,
559
+ "learning_rate": 3.6601963260477924e-05,
560
+ "loss": 0.9287,
561
+ "num_input_tokens_seen": 400800,
562
+ "step": 345
563
+ },
564
+ {
565
+ "epoch": 1.0511662904439427,
566
+ "grad_norm": 0.936469316482544,
567
+ "learning_rate": 3.625128740927971e-05,
568
+ "loss": 0.9107,
569
+ "num_input_tokens_seen": 406728,
570
+ "step": 350
571
+ },
572
+ {
573
+ "epoch": 1.0662151993980435,
574
+ "grad_norm": 0.8475446105003357,
575
+ "learning_rate": 3.589781312907207e-05,
576
+ "loss": 0.952,
577
+ "num_input_tokens_seen": 412656,
578
+ "step": 355
579
+ },
580
+ {
581
+ "epoch": 1.0812641083521444,
582
+ "grad_norm": 0.7245047092437744,
583
+ "learning_rate": 3.55416283362546e-05,
584
+ "loss": 0.9526,
585
+ "num_input_tokens_seen": 418488,
586
+ "step": 360
587
+ },
588
+ {
589
+ "epoch": 1.0963130173062452,
590
+ "grad_norm": 1.0173735618591309,
591
+ "learning_rate": 3.518282162138772e-05,
592
+ "loss": 0.8775,
593
+ "num_input_tokens_seen": 424192,
594
+ "step": 365
595
+ },
596
+ {
597
+ "epoch": 1.111361926260346,
598
+ "grad_norm": 0.9992531538009644,
599
+ "learning_rate": 3.482148222715835e-05,
600
+ "loss": 0.883,
601
+ "num_input_tokens_seen": 430312,
602
+ "step": 370
603
+ },
604
+ {
605
+ "epoch": 1.1264108352144468,
606
+ "grad_norm": 1.0938397645950317,
607
+ "learning_rate": 3.4457700026183374e-05,
608
+ "loss": 1.0032,
609
+ "num_input_tokens_seen": 436128,
610
+ "step": 375
611
+ },
612
+ {
613
+ "epoch": 1.141459744168548,
614
+ "grad_norm": 0.8988808989524841,
615
+ "learning_rate": 3.409156549865654e-05,
616
+ "loss": 0.943,
617
+ "num_input_tokens_seen": 441928,
618
+ "step": 380
619
+ },
620
+ {
621
+ "epoch": 1.1565086531226485,
622
+ "grad_norm": 0.9952559471130371,
623
+ "learning_rate": 3.3723169709844026e-05,
624
+ "loss": 0.801,
625
+ "num_input_tokens_seen": 447560,
626
+ "step": 385
627
+ },
628
+ {
629
+ "epoch": 1.1715575620767495,
630
+ "grad_norm": 0.7556662559509277,
631
+ "learning_rate": 3.335260428743475e-05,
632
+ "loss": 0.9294,
633
+ "num_input_tokens_seen": 453296,
634
+ "step": 390
635
+ },
636
+ {
637
+ "epoch": 1.1866064710308502,
638
+ "grad_norm": 0.8362197279930115,
639
+ "learning_rate": 3.297996139875055e-05,
640
+ "loss": 0.9528,
641
+ "num_input_tokens_seen": 459336,
642
+ "step": 395
643
+ },
644
+ {
645
+ "epoch": 1.2016553799849512,
646
+ "grad_norm": 0.9389665722846985,
647
+ "learning_rate": 3.260533372782234e-05,
648
+ "loss": 0.8981,
649
+ "num_input_tokens_seen": 464944,
650
+ "step": 400
651
+ },
652
+ {
653
+ "epoch": 1.2167042889390518,
654
+ "grad_norm": 1.1821860074996948,
655
+ "learning_rate": 3.222881445233759e-05,
656
+ "loss": 0.9823,
657
+ "num_input_tokens_seen": 470992,
658
+ "step": 405
659
+ },
660
+ {
661
+ "epoch": 1.2317531978931529,
662
+ "grad_norm": 1.0015898942947388,
663
+ "learning_rate": 3.185049722046516e-05,
664
+ "loss": 0.9047,
665
+ "num_input_tokens_seen": 476216,
666
+ "step": 410
667
+ },
668
+ {
669
+ "epoch": 1.2468021068472535,
670
+ "grad_norm": 0.8765709400177002,
671
+ "learning_rate": 3.147047612756302e-05,
672
+ "loss": 0.8582,
673
+ "num_input_tokens_seen": 481824,
674
+ "step": 415
675
+ },
676
+ {
677
+ "epoch": 1.2618510158013545,
678
+ "grad_norm": 0.9712916612625122,
679
+ "learning_rate": 3.10888456927748e-05,
680
+ "loss": 0.8787,
681
+ "num_input_tokens_seen": 487576,
682
+ "step": 420
683
+ },
684
+ {
685
+ "epoch": 1.276899924755455,
686
+ "grad_norm": 1.1555066108703613,
687
+ "learning_rate": 3.0705700835520895e-05,
688
+ "loss": 0.8729,
689
+ "num_input_tokens_seen": 493336,
690
+ "step": 425
691
+ },
692
+ {
693
+ "epoch": 1.2919488337095562,
694
+ "grad_norm": 1.1198400259017944,
695
+ "learning_rate": 3.0321136851890036e-05,
696
+ "loss": 0.8772,
697
+ "num_input_tokens_seen": 499760,
698
+ "step": 430
699
+ },
700
+ {
701
+ "epoch": 1.3069977426636568,
702
+ "grad_norm": 1.1468943357467651,
703
+ "learning_rate": 2.9935249390937183e-05,
704
+ "loss": 0.9451,
705
+ "num_input_tokens_seen": 505400,
706
+ "step": 435
707
+ },
708
+ {
709
+ "epoch": 1.3220466516177578,
710
+ "grad_norm": 0.8468641042709351,
711
+ "learning_rate": 2.9548134430893604e-05,
712
+ "loss": 0.8202,
713
+ "num_input_tokens_seen": 511760,
714
+ "step": 440
715
+ },
716
+ {
717
+ "epoch": 1.3370955605718584,
718
+ "grad_norm": 1.3206151723861694,
719
+ "learning_rate": 2.9159888255295116e-05,
720
+ "loss": 0.9773,
721
+ "num_input_tokens_seen": 517616,
722
+ "step": 445
723
+ },
724
+ {
725
+ "epoch": 1.3521444695259595,
726
+ "grad_norm": 1.1996040344238281,
727
+ "learning_rate": 2.8770607429034352e-05,
728
+ "loss": 0.9101,
729
+ "num_input_tokens_seen": 522744,
730
+ "step": 450
731
+ },
732
+ {
733
+ "epoch": 1.36719337848006,
734
+ "grad_norm": 1.1539313793182373,
735
+ "learning_rate": 2.8380388774343047e-05,
736
+ "loss": 0.9633,
737
+ "num_input_tokens_seen": 528648,
738
+ "step": 455
739
+ },
740
+ {
741
+ "epoch": 1.382242287434161,
742
+ "grad_norm": 1.021848440170288,
743
+ "learning_rate": 2.7989329346710375e-05,
744
+ "loss": 0.8886,
745
+ "num_input_tokens_seen": 534000,
746
+ "step": 460
747
+ },
748
+ {
749
+ "epoch": 1.3972911963882617,
750
+ "grad_norm": 0.8612179160118103,
751
+ "learning_rate": 2.759752641074322e-05,
752
+ "loss": 0.9258,
753
+ "num_input_tokens_seen": 539688,
754
+ "step": 465
755
+ },
756
+ {
757
+ "epoch": 1.4123401053423628,
758
+ "grad_norm": 1.0109293460845947,
759
+ "learning_rate": 2.7205077415974416e-05,
760
+ "loss": 0.9039,
761
+ "num_input_tokens_seen": 545112,
762
+ "step": 470
763
+ },
764
+ {
765
+ "epoch": 1.4273890142964636,
766
+ "grad_norm": 1.1920832395553589,
767
+ "learning_rate": 2.6812079972625077e-05,
768
+ "loss": 1.0116,
769
+ "num_input_tokens_seen": 551328,
770
+ "step": 475
771
+ },
772
+ {
773
+ "epoch": 1.4424379232505644,
774
+ "grad_norm": 1.0512142181396484,
775
+ "learning_rate": 2.6418631827326857e-05,
776
+ "loss": 0.8218,
777
+ "num_input_tokens_seen": 556816,
778
+ "step": 480
779
+ },
780
+ {
781
+ "epoch": 1.4574868322046652,
782
+ "grad_norm": 1.146946907043457,
783
+ "learning_rate": 2.602483083881035e-05,
784
+ "loss": 0.8604,
785
+ "num_input_tokens_seen": 562552,
786
+ "step": 485
787
+ },
788
+ {
789
+ "epoch": 1.472535741158766,
790
+ "grad_norm": 1.1064790487289429,
791
+ "learning_rate": 2.563077495356561e-05,
792
+ "loss": 0.8044,
793
+ "num_input_tokens_seen": 568480,
794
+ "step": 490
795
+ },
796
+ {
797
+ "epoch": 1.487584650112867,
798
+ "grad_norm": 0.9678347110748291,
799
+ "learning_rate": 2.5236562181480794e-05,
800
+ "loss": 0.9198,
801
+ "num_input_tokens_seen": 574072,
802
+ "step": 495
803
+ },
804
+ {
805
+ "epoch": 1.5026335590669677,
806
+ "grad_norm": 0.9460956454277039,
807
+ "learning_rate": 2.484229057146507e-05,
808
+ "loss": 0.9181,
809
+ "num_input_tokens_seen": 580040,
810
+ "step": 500
811
+ },
812
+ {
813
+ "epoch": 1.5176824680210683,
814
+ "grad_norm": 1.175920844078064,
815
+ "learning_rate": 2.4448058187061835e-05,
816
+ "loss": 0.8644,
817
+ "num_input_tokens_seen": 586128,
818
+ "step": 505
819
+ },
820
+ {
821
+ "epoch": 1.5327313769751694,
822
+ "grad_norm": 1.2150397300720215,
823
+ "learning_rate": 2.4053963082058244e-05,
824
+ "loss": 1.0127,
825
+ "num_input_tokens_seen": 592256,
826
+ "step": 510
827
+ },
828
+ {
829
+ "epoch": 1.54778028592927,
830
+ "grad_norm": 0.9520708918571472,
831
+ "learning_rate": 2.3660103276097232e-05,
832
+ "loss": 0.7937,
833
+ "num_input_tokens_seen": 597704,
834
+ "step": 515
835
+ },
836
+ {
837
+ "epoch": 1.562829194883371,
838
+ "grad_norm": 1.0742231607437134,
839
+ "learning_rate": 2.3266576730297956e-05,
840
+ "loss": 0.9806,
841
+ "num_input_tokens_seen": 603240,
842
+ "step": 520
843
+ },
844
+ {
845
+ "epoch": 1.5778781038374716,
846
+ "grad_norm": 1.0484352111816406,
847
+ "learning_rate": 2.2873481322890862e-05,
848
+ "loss": 0.934,
849
+ "num_input_tokens_seen": 609616,
850
+ "step": 525
851
+ },
852
+ {
853
+ "epoch": 1.5929270127915727,
854
+ "grad_norm": 0.8829598426818848,
855
+ "learning_rate": 2.2480914824873297e-05,
856
+ "loss": 0.9288,
857
+ "num_input_tokens_seen": 615520,
858
+ "step": 530
859
+ },
860
+ {
861
+ "epoch": 1.6079759217456733,
862
+ "grad_norm": 0.9222884178161621,
863
+ "learning_rate": 2.2088974875691863e-05,
864
+ "loss": 0.8597,
865
+ "num_input_tokens_seen": 621208,
866
+ "step": 535
867
+ },
868
+ {
869
+ "epoch": 1.6230248306997743,
870
+ "grad_norm": 0.894801914691925,
871
+ "learning_rate": 2.1697758958957448e-05,
872
+ "loss": 0.8817,
873
+ "num_input_tokens_seen": 627176,
874
+ "step": 540
875
+ },
876
+ {
877
+ "epoch": 1.6380737396538751,
878
+ "grad_norm": 1.1703195571899414,
879
+ "learning_rate": 2.1307364378199005e-05,
880
+ "loss": 0.777,
881
+ "num_input_tokens_seen": 633248,
882
+ "step": 545
883
+ },
884
+ {
885
+ "epoch": 1.653122648607976,
886
+ "grad_norm": 1.0596733093261719,
887
+ "learning_rate": 2.0917888232662196e-05,
888
+ "loss": 0.798,
889
+ "num_input_tokens_seen": 639000,
890
+ "step": 550
891
+ },
892
+ {
893
+ "epoch": 1.6681715575620768,
894
+ "grad_norm": 1.0426228046417236,
895
+ "learning_rate": 2.0529427393158705e-05,
896
+ "loss": 0.9104,
897
+ "num_input_tokens_seen": 645280,
898
+ "step": 555
899
+ },
900
+ {
901
+ "epoch": 1.6832204665161776,
902
+ "grad_norm": 1.3300392627716064,
903
+ "learning_rate": 2.014207847797256e-05,
904
+ "loss": 0.8293,
905
+ "num_input_tokens_seen": 651760,
906
+ "step": 560
907
+ },
908
+ {
909
+ "epoch": 1.6982693754702785,
910
+ "grad_norm": 1.2664028406143188,
911
+ "learning_rate": 1.9755937828829067e-05,
912
+ "loss": 0.8821,
913
+ "num_input_tokens_seen": 657272,
914
+ "step": 565
915
+ },
916
+ {
917
+ "epoch": 1.7133182844243793,
918
+ "grad_norm": 0.9889734983444214,
919
+ "learning_rate": 1.937110148693265e-05,
920
+ "loss": 0.8253,
921
+ "num_input_tokens_seen": 663336,
922
+ "step": 570
923
+ },
924
+ {
925
+ "epoch": 1.72836719337848,
926
+ "grad_norm": 1.0789241790771484,
927
+ "learning_rate": 1.8987665169079454e-05,
928
+ "loss": 0.9391,
929
+ "num_input_tokens_seen": 668936,
930
+ "step": 575
931
+ },
932
+ {
933
+ "epoch": 1.743416102332581,
934
+ "grad_norm": 1.2337504625320435,
935
+ "learning_rate": 1.8605724243850502e-05,
936
+ "loss": 0.8711,
937
+ "num_input_tokens_seen": 675000,
938
+ "step": 580
939
+ },
940
+ {
941
+ "epoch": 1.7584650112866818,
942
+ "grad_norm": 0.905838668346405,
943
+ "learning_rate": 1.822537370789163e-05,
944
+ "loss": 0.8346,
945
+ "num_input_tokens_seen": 680584,
946
+ "step": 585
947
+ },
948
+ {
949
+ "epoch": 1.7735139202407826,
950
+ "grad_norm": 1.1633321046829224,
951
+ "learning_rate": 1.7846708162285785e-05,
952
+ "loss": 0.8275,
953
+ "num_input_tokens_seen": 686416,
954
+ "step": 590
955
+ },
956
+ {
957
+ "epoch": 1.7885628291948834,
958
+ "grad_norm": 0.9946597814559937,
959
+ "learning_rate": 1.7469821789023815e-05,
960
+ "loss": 0.9435,
961
+ "num_input_tokens_seen": 692016,
962
+ "step": 595
963
+ },
964
+ {
965
+ "epoch": 1.8036117381489842,
966
+ "grad_norm": 1.0259568691253662,
967
+ "learning_rate": 1.70948083275794e-05,
968
+ "loss": 0.8584,
969
+ "num_input_tokens_seen": 697984,
970
+ "step": 600
971
+ },
972
+ {
973
+ "epoch": 1.818660647103085,
974
+ "grad_norm": 1.0644334554672241,
975
+ "learning_rate": 1.672176105159417e-05,
976
+ "loss": 0.88,
977
+ "num_input_tokens_seen": 704056,
978
+ "step": 605
979
+ },
980
+ {
981
+ "epoch": 1.8337095560571859,
982
+ "grad_norm": 1.0443474054336548,
983
+ "learning_rate": 1.635077274567854e-05,
984
+ "loss": 0.8825,
985
+ "num_input_tokens_seen": 709760,
986
+ "step": 610
987
+ },
988
+ {
989
+ "epoch": 1.8487584650112867,
990
+ "grad_norm": 1.0267105102539062,
991
+ "learning_rate": 1.5981935682334264e-05,
992
+ "loss": 0.9978,
993
+ "num_input_tokens_seen": 715872,
994
+ "step": 615
995
+ },
996
+ {
997
+ "epoch": 1.8638073739653875,
998
+ "grad_norm": 1.3127869367599487,
999
+ "learning_rate": 1.561534159900441e-05,
1000
+ "loss": 0.9626,
1001
+ "num_input_tokens_seen": 722184,
1002
+ "step": 620
1003
+ },
1004
+ {
1005
+ "epoch": 1.8788562829194884,
1006
+ "grad_norm": 1.2093840837478638,
1007
+ "learning_rate": 1.525108167525624e-05,
1008
+ "loss": 0.9308,
1009
+ "num_input_tokens_seen": 727776,
1010
+ "step": 625
1011
+ },
1012
+ {
1013
+ "epoch": 1.8939051918735892,
1014
+ "grad_norm": 0.982764482498169,
1015
+ "learning_rate": 1.4889246510103077e-05,
1016
+ "loss": 0.9757,
1017
+ "num_input_tokens_seen": 733760,
1018
+ "step": 630
1019
+ },
1020
+ {
1021
+ "epoch": 1.90895410082769,
1022
+ "grad_norm": 1.111680507659912,
1023
+ "learning_rate": 1.4529926099470348e-05,
1024
+ "loss": 0.767,
1025
+ "num_input_tokens_seen": 740024,
1026
+ "step": 635
1027
+ },
1028
+ {
1029
+ "epoch": 1.9240030097817908,
1030
+ "grad_norm": 1.218017578125,
1031
+ "learning_rate": 1.4173209813811788e-05,
1032
+ "loss": 0.9272,
1033
+ "num_input_tokens_seen": 745480,
1034
+ "step": 640
1035
+ },
1036
+ {
1037
+ "epoch": 1.9390519187358917,
1038
+ "grad_norm": 1.3443623781204224,
1039
+ "learning_rate": 1.381918637588112e-05,
1040
+ "loss": 0.7941,
1041
+ "num_input_tokens_seen": 751384,
1042
+ "step": 645
1043
+ },
1044
+ {
1045
+ "epoch": 1.9541008276899925,
1046
+ "grad_norm": 0.9702039361000061,
1047
+ "learning_rate": 1.3467943838664863e-05,
1048
+ "loss": 0.8408,
1049
+ "num_input_tokens_seen": 756920,
1050
+ "step": 650
1051
+ },
1052
+ {
1053
+ "epoch": 1.9691497366440933,
1054
+ "grad_norm": 1.1215064525604248,
1055
+ "learning_rate": 1.311956956348177e-05,
1056
+ "loss": 0.8459,
1057
+ "num_input_tokens_seen": 762424,
1058
+ "step": 655
1059
+ },
1060
+ {
1061
+ "epoch": 1.9841986455981941,
1062
+ "grad_norm": 1.3830626010894775,
1063
+ "learning_rate": 1.277415019825417e-05,
1064
+ "loss": 1.0117,
1065
+ "num_input_tokens_seen": 768224,
1066
+ "step": 660
1067
+ },
1068
+ {
1069
+ "epoch": 1.999247554552295,
1070
+ "grad_norm": 1.028895616531372,
1071
+ "learning_rate": 1.2431771655956925e-05,
1072
+ "loss": 0.9665,
1073
+ "num_input_tokens_seen": 773568,
1074
+ "step": 665
1075
+ },
1076
+ {
1077
+ "epoch": 2.0120391271632805,
1078
+ "grad_norm": 1.1555911302566528,
1079
+ "learning_rate": 1.2092519093248988e-05,
1080
+ "loss": 0.7625,
1081
+ "num_input_tokens_seen": 778672,
1082
+ "step": 670
1083
+ },
1084
+ {
1085
+ "epoch": 2.0270880361173815,
1086
+ "grad_norm": 1.037429690361023,
1087
+ "learning_rate": 1.1756476889293269e-05,
1088
+ "loss": 0.8667,
1089
+ "num_input_tokens_seen": 784488,
1090
+ "step": 675
1091
+ },
1092
+ {
1093
+ "epoch": 2.042136945071482,
1094
+ "grad_norm": 1.053051471710205,
1095
+ "learning_rate": 1.1423728624769695e-05,
1096
+ "loss": 0.8297,
1097
+ "num_input_tokens_seen": 790304,
1098
+ "step": 680
1099
+ },
1100
+ {
1101
+ "epoch": 2.057185854025583,
1102
+ "grad_norm": 1.0523649454116821,
1103
+ "learning_rate": 1.1094357061087033e-05,
1104
+ "loss": 0.8774,
1105
+ "num_input_tokens_seen": 796192,
1106
+ "step": 685
1107
+ },
1108
+ {
1109
+ "epoch": 2.072234762979684,
1110
+ "grad_norm": 1.0367976427078247,
1111
+ "learning_rate": 1.0768444119798357e-05,
1112
+ "loss": 0.8476,
1113
+ "num_input_tokens_seen": 802144,
1114
+ "step": 690
1115
+ },
1116
+ {
1117
+ "epoch": 2.087283671933785,
1118
+ "grad_norm": 1.4130756855010986,
1119
+ "learning_rate": 1.0446070862225463e-05,
1120
+ "loss": 0.8641,
1121
+ "num_input_tokens_seen": 807768,
1122
+ "step": 695
1123
+ },
1124
+ {
1125
+ "epoch": 2.1023325808878854,
1126
+ "grad_norm": 1.1584120988845825,
1127
+ "learning_rate": 1.0127317469297277e-05,
1128
+ "loss": 0.8383,
1129
+ "num_input_tokens_seen": 813712,
1130
+ "step": 700
1131
+ },
1132
+ {
1133
+ "epoch": 2.1173814898419865,
1134
+ "grad_norm": 1.2318339347839355,
1135
+ "learning_rate": 9.812263221607112e-06,
1136
+ "loss": 0.9123,
1137
+ "num_input_tokens_seen": 819360,
1138
+ "step": 705
1139
+ },
1140
+ {
1141
+ "epoch": 2.132430398796087,
1142
+ "grad_norm": 1.6237512826919556,
1143
+ "learning_rate": 9.500986479694036e-06,
1144
+ "loss": 0.9635,
1145
+ "num_input_tokens_seen": 824584,
1146
+ "step": 710
1147
+ },
1148
+ {
1149
+ "epoch": 2.147479307750188,
1150
+ "grad_norm": 1.106604814529419,
1151
+ "learning_rate": 9.19356466455287e-06,
1152
+ "loss": 0.9221,
1153
+ "num_input_tokens_seen": 830600,
1154
+ "step": 715
1155
+ },
1156
+ {
1157
+ "epoch": 2.1625282167042887,
1158
+ "grad_norm": 0.8615310788154602,
1159
+ "learning_rate": 8.890074238378074e-06,
1160
+ "loss": 0.8757,
1161
+ "num_input_tokens_seen": 836856,
1162
+ "step": 720
1163
+ },
1164
+ {
1165
+ "epoch": 2.17757712565839,
1166
+ "grad_norm": 0.8537486791610718,
1167
+ "learning_rate": 8.590590685545946e-06,
1168
+ "loss": 0.7958,
1169
+ "num_input_tokens_seen": 842872,
1170
+ "step": 725
1171
+ },
1172
+ {
1173
+ "epoch": 2.1926260346124904,
1174
+ "grad_norm": 0.8556107878684998,
1175
+ "learning_rate": 8.295188493840104e-06,
1176
+ "loss": 0.7993,
1177
+ "num_input_tokens_seen": 848664,
1178
+ "step": 730
1179
+ },
1180
+ {
1181
+ "epoch": 2.2076749435665914,
1182
+ "grad_norm": 1.093944787979126,
1183
+ "learning_rate": 8.003941135924858e-06,
1184
+ "loss": 0.8436,
1185
+ "num_input_tokens_seen": 854712,
1186
+ "step": 735
1187
+ },
1188
+ {
1189
+ "epoch": 2.222723852520692,
1190
+ "grad_norm": 1.2639975547790527,
1191
+ "learning_rate": 7.71692105107098e-06,
1192
+ "loss": 0.896,
1193
+ "num_input_tokens_seen": 860648,
1194
+ "step": 740
1195
+ },
1196
+ {
1197
+ "epoch": 2.237772761474793,
1198
+ "grad_norm": 1.177778720855713,
1199
+ "learning_rate": 7.434199627138602e-06,
1200
+ "loss": 0.8948,
1201
+ "num_input_tokens_seen": 866080,
1202
+ "step": 745
1203
+ },
1204
+ {
1205
+ "epoch": 2.2528216704288937,
1206
+ "grad_norm": 0.9701932668685913,
1207
+ "learning_rate": 7.155847182821523e-06,
1208
+ "loss": 0.8546,
1209
+ "num_input_tokens_seen": 871560,
1210
+ "step": 750
1211
+ },
1212
+ {
1213
+ "epoch": 2.2678705793829947,
1214
+ "grad_norm": 1.0232161283493042,
1215
+ "learning_rate": 6.881932950157538e-06,
1216
+ "loss": 0.8494,
1217
+ "num_input_tokens_seen": 877568,
1218
+ "step": 755
1219
+ },
1220
+ {
1221
+ "epoch": 2.282919488337096,
1222
+ "grad_norm": 1.119441270828247,
1223
+ "learning_rate": 6.612525057308949e-06,
1224
+ "loss": 0.7723,
1225
+ "num_input_tokens_seen": 883808,
1226
+ "step": 760
1227
+ },
1228
+ {
1229
+ "epoch": 2.2979683972911964,
1230
+ "grad_norm": 1.5488731861114502,
1231
+ "learning_rate": 6.347690511617693e-06,
1232
+ "loss": 0.9168,
1233
+ "num_input_tokens_seen": 889296,
1234
+ "step": 765
1235
+ },
1236
+ {
1237
+ "epoch": 2.313017306245297,
1238
+ "grad_norm": 1.2143895626068115,
1239
+ "learning_rate": 6.0874951829392234e-06,
1240
+ "loss": 0.8831,
1241
+ "num_input_tokens_seen": 895120,
1242
+ "step": 770
1243
+ },
1244
+ {
1245
+ "epoch": 2.328066215199398,
1246
+ "grad_norm": 1.157663106918335,
1247
+ "learning_rate": 5.832003787259327e-06,
1248
+ "loss": 0.854,
1249
+ "num_input_tokens_seen": 900320,
1250
+ "step": 775
1251
+ },
1252
+ {
1253
+ "epoch": 2.343115124153499,
1254
+ "grad_norm": 1.4496403932571411,
1255
+ "learning_rate": 5.581279870597867e-06,
1256
+ "loss": 0.8843,
1257
+ "num_input_tokens_seen": 905928,
1258
+ "step": 780
1259
+ },
1260
+ {
1261
+ "epoch": 2.3581640331075997,
1262
+ "grad_norm": 0.8820686936378479,
1263
+ "learning_rate": 5.335385793203604e-06,
1264
+ "loss": 0.862,
1265
+ "num_input_tokens_seen": 911976,
1266
+ "step": 785
1267
+ },
1268
+ {
1269
+ "epoch": 2.3732129420617003,
1270
+ "grad_norm": 1.622916579246521,
1271
+ "learning_rate": 5.094382714043907e-06,
1272
+ "loss": 0.985,
1273
+ "num_input_tokens_seen": 917840,
1274
+ "step": 790
1275
+ },
1276
+ {
1277
+ "epoch": 2.3882618510158014,
1278
+ "grad_norm": 1.0603710412979126,
1279
+ "learning_rate": 4.85833057559322e-06,
1280
+ "loss": 0.7679,
1281
+ "num_input_tokens_seen": 923168,
1282
+ "step": 795
1283
+ },
1284
+ {
1285
+ "epoch": 2.4033107599699024,
1286
+ "grad_norm": 1.0989526510238647,
1287
+ "learning_rate": 4.627288088924156e-06,
1288
+ "loss": 0.8198,
1289
+ "num_input_tokens_seen": 928720,
1290
+ "step": 800
1291
+ },
1292
+ {
1293
+ "epoch": 2.418359668924003,
1294
+ "grad_norm": 0.9745952486991882,
1295
+ "learning_rate": 4.401312719104802e-06,
1296
+ "loss": 0.7773,
1297
+ "num_input_tokens_seen": 934568,
1298
+ "step": 805
1299
+ },
1300
+ {
1301
+ "epoch": 2.4334085778781036,
1302
+ "grad_norm": 1.529707670211792,
1303
+ "learning_rate": 4.180460670905978e-06,
1304
+ "loss": 0.9312,
1305
+ "num_input_tokens_seen": 940264,
1306
+ "step": 810
1307
+ },
1308
+ {
1309
+ "epoch": 2.4484574868322047,
1310
+ "grad_norm": 1.2537649869918823,
1311
+ "learning_rate": 3.964786874821955e-06,
1312
+ "loss": 0.8497,
1313
+ "num_input_tokens_seen": 946128,
1314
+ "step": 815
1315
+ },
1316
+ {
1317
+ "epoch": 2.4635063957863057,
1318
+ "grad_norm": 1.0871232748031616,
1319
+ "learning_rate": 3.754344973408064e-06,
1320
+ "loss": 0.782,
1321
+ "num_input_tokens_seen": 952032,
1322
+ "step": 820
1323
+ },
1324
+ {
1325
+ "epoch": 2.4785553047404063,
1326
+ "grad_norm": 1.2940268516540527,
1327
+ "learning_rate": 3.5491873079387256e-06,
1328
+ "loss": 0.8937,
1329
+ "num_input_tokens_seen": 957960,
1330
+ "step": 825
1331
+ },
1332
+ {
1333
+ "epoch": 2.493604213694507,
1334
+ "grad_norm": 1.2327598333358765,
1335
+ "learning_rate": 3.3493649053890326e-06,
1336
+ "loss": 0.7039,
1337
+ "num_input_tokens_seen": 964336,
1338
+ "step": 830
1339
+ },
1340
+ {
1341
+ "epoch": 2.508653122648608,
1342
+ "grad_norm": 1.516093373298645,
1343
+ "learning_rate": 3.1549274657433375e-06,
1344
+ "loss": 0.9265,
1345
+ "num_input_tokens_seen": 970168,
1346
+ "step": 835
1347
+ },
1348
+ {
1349
+ "epoch": 2.523702031602709,
1350
+ "grad_norm": 1.1418204307556152,
1351
+ "learning_rate": 2.9659233496337786e-06,
1352
+ "loss": 0.8669,
1353
+ "num_input_tokens_seen": 975752,
1354
+ "step": 840
1355
+ },
1356
+ {
1357
+ "epoch": 2.5387509405568096,
1358
+ "grad_norm": 1.3584462404251099,
1359
+ "learning_rate": 2.7823995663120327e-06,
1360
+ "loss": 0.9174,
1361
+ "num_input_tokens_seen": 981672,
1362
+ "step": 845
1363
+ },
1364
+ {
1365
+ "epoch": 2.55379984951091,
1366
+ "grad_norm": 1.1911269426345825,
1367
+ "learning_rate": 2.6044017619571065e-06,
1368
+ "loss": 0.8718,
1369
+ "num_input_tokens_seen": 987560,
1370
+ "step": 850
1371
+ },
1372
+ {
1373
+ "epoch": 2.5688487584650113,
1374
+ "grad_norm": 1.3048710823059082,
1375
+ "learning_rate": 2.431974208322191e-06,
1376
+ "loss": 0.8634,
1377
+ "num_input_tokens_seen": 993200,
1378
+ "step": 855
1379
+ },
1380
+ {
1381
+ "epoch": 2.5838976674191123,
1382
+ "grad_norm": 1.1356749534606934,
1383
+ "learning_rate": 2.265159791723373e-06,
1384
+ "loss": 0.845,
1385
+ "num_input_tokens_seen": 999192,
1386
+ "step": 860
1387
+ },
1388
+ {
1389
+ "epoch": 2.598946576373213,
1390
+ "grad_norm": 1.2655149698257446,
1391
+ "learning_rate": 2.104000002372886e-06,
1392
+ "loss": 0.8008,
1393
+ "num_input_tokens_seen": 1004576,
1394
+ "step": 865
1395
+ },
1396
+ {
1397
+ "epoch": 2.6139954853273135,
1398
+ "grad_norm": 1.354706048965454,
1399
+ "learning_rate": 1.9485349240596613e-06,
1400
+ "loss": 0.8797,
1401
+ "num_input_tokens_seen": 1010352,
1402
+ "step": 870
1403
+ },
1404
+ {
1405
+ "epoch": 2.6290443942814146,
1406
+ "grad_norm": 1.0957777500152588,
1407
+ "learning_rate": 1.7988032241796376e-06,
1408
+ "loss": 0.946,
1409
+ "num_input_tokens_seen": 1016272,
1410
+ "step": 875
1411
+ },
1412
+ {
1413
+ "epoch": 2.6440933032355156,
1414
+ "grad_norm": 1.3322904109954834,
1415
+ "learning_rate": 1.6548421441183875e-06,
1416
+ "loss": 0.8032,
1417
+ "num_input_tokens_seen": 1021896,
1418
+ "step": 880
1419
+ },
1420
+ {
1421
+ "epoch": 2.659142212189616,
1422
+ "grad_norm": 1.1363080739974976,
1423
+ "learning_rate": 1.5166874899884053e-06,
1424
+ "loss": 0.8892,
1425
+ "num_input_tokens_seen": 1027704,
1426
+ "step": 885
1427
+ },
1428
+ {
1429
+ "epoch": 2.674191121143717,
1430
+ "grad_norm": 1.2706754207611084,
1431
+ "learning_rate": 1.3843736237233784e-06,
1432
+ "loss": 0.856,
1433
+ "num_input_tokens_seen": 1033800,
1434
+ "step": 890
1435
+ },
1436
+ {
1437
+ "epoch": 2.689240030097818,
1438
+ "grad_norm": 1.1934438943862915,
1439
+ "learning_rate": 1.2579334545316733e-06,
1440
+ "loss": 0.8617,
1441
+ "num_input_tokens_seen": 1040008,
1442
+ "step": 895
1443
+ },
1444
+ {
1445
+ "epoch": 2.704288939051919,
1446
+ "grad_norm": 1.4581674337387085,
1447
+ "learning_rate": 1.137398430711123e-06,
1448
+ "loss": 0.9117,
1449
+ "num_input_tokens_seen": 1046272,
1450
+ "step": 900
1451
+ },
1452
+ {
1453
+ "epoch": 2.7193378480060195,
1454
+ "grad_norm": 1.080992579460144,
1455
+ "learning_rate": 1.0227985318271682e-06,
1456
+ "loss": 0.7855,
1457
+ "num_input_tokens_seen": 1052032,
1458
+ "step": 905
1459
+ },
1460
+ {
1461
+ "epoch": 2.73438675696012,
1462
+ "grad_norm": 1.0012861490249634,
1463
+ "learning_rate": 9.141622612563571e-07,
1464
+ "loss": 0.8212,
1465
+ "num_input_tokens_seen": 1057584,
1466
+ "step": 910
1467
+ },
1468
+ {
1469
+ "epoch": 2.749435665914221,
1470
+ "grad_norm": 1.1472314596176147,
1471
+ "learning_rate": 8.115166390969125e-07,
1472
+ "loss": 0.8404,
1473
+ "num_input_tokens_seen": 1063760,
1474
+ "step": 915
1475
+ },
1476
+ {
1477
+ "epoch": 2.764484574868322,
1478
+ "grad_norm": 1.2558523416519165,
1479
+ "learning_rate": 7.148871954483105e-07,
1480
+ "loss": 0.7782,
1481
+ "num_input_tokens_seen": 1069544,
1482
+ "step": 920
1483
+ },
1484
+ {
1485
+ "epoch": 2.779533483822423,
1486
+ "grad_norm": 1.1380338668823242,
1487
+ "learning_rate": 6.242979640613933e-07,
1488
+ "loss": 0.7847,
1489
+ "num_input_tokens_seen": 1075472,
1490
+ "step": 925
1491
+ },
1492
+ {
1493
+ "epoch": 2.7945823927765234,
1494
+ "grad_norm": 0.972878098487854,
1495
+ "learning_rate": 5.397714763606843e-07,
1496
+ "loss": 0.8857,
1497
+ "num_input_tokens_seen": 1081464,
1498
+ "step": 930
1499
+ },
1500
+ {
1501
+ "epoch": 2.8096313017306245,
1502
+ "grad_norm": 1.2546579837799072,
1503
+ "learning_rate": 4.613287558403512e-07,
1504
+ "loss": 0.8029,
1505
+ "num_input_tokens_seen": 1087464,
1506
+ "step": 935
1507
+ },
1508
+ {
1509
+ "epoch": 2.8246802106847255,
1510
+ "grad_norm": 1.1165034770965576,
1511
+ "learning_rate": 3.8898931283523344e-07,
1512
+ "loss": 0.8154,
1513
+ "num_input_tokens_seen": 1092888,
1514
+ "step": 940
1515
+ },
1516
+ {
1517
+ "epoch": 2.839729119638826,
1518
+ "grad_norm": 1.3924362659454346,
1519
+ "learning_rate": 3.227711396682015e-07,
1520
+ "loss": 0.8791,
1521
+ "num_input_tokens_seen": 1098808,
1522
+ "step": 945
1523
+ },
1524
+ {
1525
+ "epoch": 2.854778028592927,
1526
+ "grad_norm": 1.021448016166687,
1527
+ "learning_rate": 2.626907061751116e-07,
1528
+ "loss": 0.787,
1529
+ "num_input_tokens_seen": 1104688,
1530
+ "step": 950
1531
+ },
1532
+ {
1533
+ "epoch": 2.869826937547028,
1534
+ "grad_norm": 1.3344382047653198,
1535
+ "learning_rate": 2.0876295560839364e-07,
1536
+ "loss": 0.8831,
1537
+ "num_input_tokens_seen": 1110960,
1538
+ "step": 955
1539
+ },
1540
+ {
1541
+ "epoch": 2.884875846501129,
1542
+ "grad_norm": 1.3956490755081177,
1543
+ "learning_rate": 1.6100130092037703e-07,
1544
+ "loss": 0.7677,
1545
+ "num_input_tokens_seen": 1116800,
1546
+ "step": 960
1547
+ },
1548
+ {
1549
+ "epoch": 2.8999247554552294,
1550
+ "grad_norm": 1.1644206047058105,
1551
+ "learning_rate": 1.194176214271897e-07,
1552
+ "loss": 0.7567,
1553
+ "num_input_tokens_seen": 1122248,
1554
+ "step": 965
1555
+ },
1556
+ {
1557
+ "epoch": 2.9149736644093305,
1558
+ "grad_norm": 1.2540746927261353,
1559
+ "learning_rate": 8.402225985413848e-08,
1560
+ "loss": 0.8944,
1561
+ "num_input_tokens_seen": 1127928,
1562
+ "step": 970
1563
+ },
1564
+ {
1565
+ "epoch": 2.930022573363431,
1566
+ "grad_norm": 1.1684881448745728,
1567
+ "learning_rate": 5.4824019763252685e-08,
1568
+ "loss": 0.9737,
1569
+ "num_input_tokens_seen": 1133336,
1570
+ "step": 975
1571
+ },
1572
+ {
1573
+ "epoch": 2.945071482317532,
1574
+ "grad_norm": 1.072198510169983,
1575
+ "learning_rate": 3.1830163363655296e-08,
1576
+ "loss": 0.8965,
1577
+ "num_input_tokens_seen": 1139048,
1578
+ "step": 980
1579
+ },
1580
+ {
1581
+ "epoch": 2.9601203912716327,
1582
+ "grad_norm": 1.7171086072921753,
1583
+ "learning_rate": 1.504640970531046e-08,
1584
+ "loss": 0.837,
1585
+ "num_input_tokens_seen": 1144456,
1586
+ "step": 985
1587
+ },
1588
+ {
1589
+ "epoch": 2.975169300225734,
1590
+ "grad_norm": 1.4984806776046753,
1591
+ "learning_rate": 4.4769332565558485e-09,
1592
+ "loss": 0.7812,
1593
+ "num_input_tokens_seen": 1150160,
1594
+ "step": 990
1595
+ },
1596
+ {
1597
+ "epoch": 2.9902182091798344,
1598
+ "grad_norm": 1.2322272062301636,
1599
+ "learning_rate": 1.2436286584982527e-10,
1600
+ "loss": 0.8613,
1601
+ "num_input_tokens_seen": 1156704,
1602
+ "step": 995
1603
+ },
1604
+ {
1605
+ "epoch": 2.9932279909706545,
1606
+ "num_input_tokens_seen": 1157808,
1607
+ "step": 996,
1608
+ "total_flos": 1.3788411572404224e+16,
1609
+ "train_loss": 0.939127180590687,
1610
+ "train_runtime": 10484.6402,
1611
+ "train_samples_per_second": 0.761,
1612
+ "train_steps_per_second": 0.095
1613
+ }
1614
+ ],
1615
+ "logging_steps": 5,
1616
+ "max_steps": 996,
1617
+ "num_input_tokens_seen": 1157808,
1618
+ "num_train_epochs": 3,
1619
+ "save_steps": 100,
1620
+ "stateful_callbacks": {
1621
+ "TrainerControl": {
1622
+ "args": {
1623
+ "should_epoch_stop": false,
1624
+ "should_evaluate": false,
1625
+ "should_log": false,
1626
+ "should_save": true,
1627
+ "should_training_stop": true
1628
+ },
1629
+ "attributes": {}
1630
+ }
1631
+ },
1632
+ "total_flos": 1.3788411572404224e+16,
1633
+ "train_batch_size": 1,
1634
+ "trial_name": null,
1635
+ "trial_params": null
1636
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19877188370cf3d74bf7f601a975694ade04d8236ac4f1d0937bf61a4ca990d0
3
+ size 5688
training_args.yaml ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ bf16: true
2
+ cutoff_len: 2048
3
+ dataset: OCR_Finetuning_Dataset
4
+ dataset_dir: /content/drive/MyDrive
5
+ ddp_timeout: 180000000
6
+ do_train: true
7
+ finetuning_type: lora
8
+ flash_attn: auto
9
+ gradient_accumulation_steps: 8
10
+ include_num_input_tokens_seen: true
11
+ learning_rate: 5.0e-05
12
+ logging_steps: 5
13
+ lora_alpha: 16
14
+ lora_dropout: 0
15
+ lora_rank: 8
16
+ lora_target: all
17
+ lr_scheduler_type: cosine
18
+ max_grad_norm: 1.0
19
+ max_samples: 100000
20
+ model_name_or_path: prithivMLmods/Qwen2-VL-OCR-2B-Instruct
21
+ num_train_epochs: 3.0
22
+ optim: adamw_torch
23
+ output_dir: saves/Custom/lora/train_2025-04-01-09-06-36
24
+ packing: false
25
+ per_device_train_batch_size: 1
26
+ plot_loss: true
27
+ preprocessing_num_workers: 16
28
+ report_to: none
29
+ save_steps: 100
30
+ stage: sft
31
+ template: default
32
+ trust_remote_code: true
33
+ warmup_steps: 0
training_loss.png ADDED
vocab.json ADDED
The diff for this file is too large to render. See raw diff