Upload 16 files

- .gitattributes +1 -0
- README.md +58 -3
- added_tokens.json +16 -0
- all_results.json +9 -0
- llamaboard_config.yaml +78 -0
- merges.txt +0 -0
- running_log.txt +1492 -0
- special_tokens_map.json +23 -0
- tokenizer.json +3 -0
- tokenizer_config.json +128 -0
- train_results.json +9 -0
- trainer_log.jsonl +200 -0
- trainer_state.json +1636 -0
- training_args.bin +3 -0
- training_args.yaml +33 -0
- training_loss.png +0 -0
- vocab.json +0 -0
.gitattributes
CHANGED

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED

@@ -1,3 +1,58 @@
----
-
-
+---
+library_name: peft
+license: other
+base_model: prithivMLmods/Qwen2-VL-OCR-2B-Instruct
+tags:
+- llama-factory
+- lora
+- generated_from_trainer
+model-index:
+- name: train_2025-04-01-09-06-36
+  results: []
+---
+
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+
+# train_2025-04-01-09-06-36
+
+This model is a fine-tuned version of [prithivMLmods/Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct) on the OCR_Finetuning_Dataset dataset.
+
+## Model description
+
+More information needed
+
+## Intended uses & limitations
+
+More information needed
+
+## Training and evaluation data
+
+More information needed
+
+## Training procedure
+
+### Training hyperparameters
+
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 1
+- eval_batch_size: 8
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 8
+- optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- num_epochs: 3.0
+
+### Training results
+
+
+
+### Framework versions
+
+- PEFT 0.15.0
+- Transformers 4.50.0
+- Pytorch 2.6.0+cu124
+- Datasets 3.4.1
+- Tokenizers 0.21.0
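The card stops short of usage instructions. Under the versions listed above (PEFT 0.15, Transformers 4.50), a minimal inference sketch might look like the following; `ADAPTER_PATH` is a placeholder for wherever this adapter's files live, not a repo id taken from the card:

```python
# Minimal sketch: load the base model and attach this LoRA adapter.
# ADAPTER_PATH is hypothetical -- point it at the directory or repo holding
# adapter_model.safetensors / adapter_config.json from this training run.
import torch
from peft import PeftModel
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

BASE = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
ADAPTER_PATH = "path/to/train_2025-04-01-09-06-36"  # placeholder

model = Qwen2VLForConditionalGeneration.from_pretrained(
    BASE, torch_dtype=torch.bfloat16, device_map="auto"
)
model = PeftModel.from_pretrained(model, ADAPTER_PATH)  # attach LoRA weights
processor = AutoProcessor.from_pretrained(BASE)
model.eval()
```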
added_tokens.json
ADDED

{
  "<|box_end|>": 151649,
  "<|box_start|>": 151648,
  "<|endoftext|>": 151643,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644,
  "<|image_pad|>": 151655,
  "<|object_ref_end|>": 151647,
  "<|object_ref_start|>": 151646,
  "<|quad_end|>": 151651,
  "<|quad_start|>": 151650,
  "<|video_pad|>": 151656,
  "<|vision_end|>": 151653,
  "<|vision_pad|>": 151654,
  "<|vision_start|>": 151652
}
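These entries mirror the base model's special tokens. A quick sanity check that the ids match the published tokenizer (a sketch, assuming network access to the base repo):

```python
# The special-token ids above should agree with the base tokenizer's vocabulary.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("prithivMLmods/Qwen2-VL-OCR-2B-Instruct")
assert tok.convert_tokens_to_ids("<|image_pad|>") == 151655
assert tok.convert_tokens_to_ids("<|im_start|>") == 151644
```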
all_results.json
ADDED

{
  "epoch": 2.9932279909706545,
  "num_input_tokens_seen": 1157808,
  "total_flos": 1.3788411572404224e+16,
  "train_loss": 0.939127180590687,
  "train_runtime": 10484.6402,
  "train_samples_per_second": 0.761,
  "train_steps_per_second": 0.095
}
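The aggregate numbers are internally consistent with the run details reported in running_log.txt below (2,658 examples, 3 epochs, 996 optimization steps). A quick check, assuming the Trainer's usual definitions (total samples = examples × configured epochs, steps = total optimization steps):

```python
# Cross-check the derived rates against the raw counts in the log.
runtime_s = 10484.6402

print(2658 * 3 / runtime_s)  # 0.7605... -> reported as 0.761
print(996 / runtime_s)       # 0.0950... -> reported as 0.095
```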
llamaboard_config.yaml
ADDED

top.booster: auto
top.checkpoint_path: []
top.finetuning_type: lora
top.model_name: Custom
top.quantization_bit: none
top.quantization_method: bitsandbytes
top.rope_scaling: none
top.template: default
train.additional_target: ''
train.apollo_rank: 16
train.apollo_scale: 32
train.apollo_target: all
train.apollo_update_interval: 200
train.badam_mode: layer
train.badam_switch_interval: 50
train.badam_switch_mode: ascending
train.badam_update_ratio: 0.05
train.batch_size: 1
train.compute_type: bf16
train.create_new_adapter: false
train.cutoff_len: 2048
train.dataset:
- OCR_Finetuning_Dataset
train.dataset_dir: /content/drive/MyDrive
train.ds_offload: false
train.ds_stage: none
train.extra_args: '{"optim": "adamw_torch"}'
train.freeze_extra_modules: ''
train.freeze_trainable_layers: 2
train.freeze_trainable_modules: all
train.galore_rank: 16
train.galore_scale: 2
train.galore_target: all
train.galore_update_interval: 200
train.gradient_accumulation_steps: 8
train.learning_rate: 5e-5
train.logging_steps: 5
train.lora_alpha: 16
train.lora_dropout: 0
train.lora_rank: 8
train.lora_target: ''
train.loraplus_lr_ratio: 0
train.lr_scheduler_type: cosine
train.mask_history: false
train.max_grad_norm: '1.0'
train.max_samples: '100000'
train.neat_packing: false
train.neftune_alpha: 0
train.num_train_epochs: '3.0'
train.packing: false
train.ppo_score_norm: false
train.ppo_whiten_rewards: false
train.pref_beta: 0.1
train.pref_ftx: 0
train.pref_loss: sigmoid
train.report_to:
- none
train.resize_vocab: false
train.reward_model: []
train.save_steps: 100
train.swanlab_api_key: ''
train.swanlab_link: ''
train.swanlab_mode: cloud
train.swanlab_project: llamafactory
train.swanlab_run_name: ''
train.swanlab_workspace: ''
train.train_on_prompt: false
train.training_stage: Supervised Fine-Tuning
train.use_apollo: false
train.use_badam: false
train.use_dora: false
train.use_galore: false
train.use_llama_pro: false
train.use_pissa: false
train.use_rslora: false
train.use_swanlab: false
train.val_size: 0
train.warmup_steps: 0
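With `train.batch_size: 1` and 8 gradient-accumulation steps, the effective batch is 8; against the 2,658 training examples reported in running_log.txt below, that reproduces the trainer's 996 optimization steps. A sketch of the arithmetic, assuming the Trainer's floor-division step count:

```python
# Effective batch = micro-batch x gradient accumulation steps.
micro_batch, grad_accum, epochs = 1, 8, 3
num_examples = 2658  # "Num examples" from running_log.txt

effective_batch = micro_batch * grad_accum         # 8
steps_per_epoch = num_examples // effective_batch  # 332 (floor division)
print(steps_per_epoch * epochs)                    # 996, matching the log
```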
merges.txt
ADDED

The diff for this file is too large to render.
running_log.txt
ADDED
[INFO|2025-04-01 09:07:44] tokenization_auto.py:759 >> Could not locate the tokenizer configuration file, will try to use the model config instead.

[INFO|2025-04-01 09:07:44] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json

[INFO|2025-04-01 09:07:44] configuration_utils.py:771 >> Model config Qwen2VLConfig {
  "architectures": [
    "Qwen2VLForConditionalGeneration"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "hidden_act": "silu",
  "hidden_size": 1536,
  "image_token_id": 151655,
  "initializer_range": 0.02,
  "intermediate_size": 8960,
  "max_position_embeddings": 32768,
  "max_window_layers": 28,
  "model_type": "qwen2_vl",
  "num_attention_heads": 12,
  "num_hidden_layers": 28,
  "num_key_value_heads": 2,
  "pad_token_id": 151654,
  "rms_norm_eps": 1e-06,
  "rope_scaling": {
    "mrope_section": [
      16,
      24,
      24
    ],
    "rope_type": "default",
    "type": "default"
  },
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.0",
  "use_cache": true,
  "use_sliding_window": false,
  "video_token_id": 151656,
  "vision_config": {
    "depth": 32,
    "embed_dim": 1280,
    "hidden_act": "quick_gelu",
    "hidden_size": 1536,
    "in_channels": 3,
    "in_chans": 3,
    "mlp_ratio": 4,
    "model_type": "qwen2_vl",
    "num_heads": 16,
    "patch_size": 14,
    "spatial_merge_size": 2,
    "spatial_patch_size": 14,
    "temporal_patch_size": 2
  },
  "vision_end_token_id": 151653,
  "vision_start_token_id": 151652,
  "vision_token_id": 151654,
  "vocab_size": 151936
}
[INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file vocab.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/vocab.json

[INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file merges.txt from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/merges.txt

[INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file tokenizer.json from cache at None

[INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file added_tokens.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/added_tokens.json

[INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file special_tokens_map.json from cache at None

[INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file tokenizer_config.json from cache at None

[INFO|2025-04-01 09:07:44] tokenization_utils_base.py:2060 >> loading file chat_template.jinja from cache at None

[INFO|2025-04-01 09:07:44] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 09:07:45] tokenization_utils_base.py:2323 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|2025-04-01 09:07:45] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 09:07:45] tokenization_utils_base.py:2323 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.

[INFO|2025-04-01 09:07:46] image_processing_base.py:381 >> loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/preprocessor_config.json

[INFO|2025-04-01 09:07:46] image_processing_base.py:381 >> loading configuration file preprocessor_config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/preprocessor_config.json

[WARNING|2025-04-01 09:07:46] logging.py:329 >> Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.

[INFO|2025-04-01 09:07:46] logging.py:143 >> Loading dataset /content/drive/MyDrive/dataset.jsonl...
[INFO|2025-04-01 09:07:47] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 09:07:47] modeling_utils.py:1154 >> loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/model.safetensors

[INFO|2025-04-01 09:07:47] modeling_utils.py:2170 >> Instantiating Qwen2VLForConditionalGeneration model under default dtype torch.bfloat16.

[INFO|2025-04-01 09:07:47] configuration_utils.py:1139 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "pad_token_id": 151654
}

[INFO|2025-04-01 09:07:47] modeling_utils.py:2170 >> Instantiating Qwen2VisionTransformerPretrainedModel model under default dtype torch.bfloat16.

[INFO|2025-04-01 09:07:50] modeling_utils.py:4987 >> All model checkpoint weights were used when initializing Qwen2VLForConditionalGeneration.

[INFO|2025-04-01 09:07:50] modeling_utils.py:4995 >> All the weights of Qwen2VLForConditionalGeneration were initialized from the model checkpoint at prithivMLmods/Qwen2-VL-OCR-2B-Instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Qwen2VLForConditionalGeneration for predictions without further training.

[INFO|2025-04-01 09:07:50] configuration_utils.py:1094 >> loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/generation_config.json

[INFO|2025-04-01 09:07:50] configuration_utils.py:1139 >> Generate config GenerationConfig {
  "bos_token_id": 151643,
  "do_sample": true,
  "eos_token_id": [
    151645,
    151643
  ],
  "max_length": 32768,
  "pad_token_id": 151654,
  "temperature": 0.01,
  "top_k": 1,
  "top_p": 0.001
}

[INFO|2025-04-01 09:07:50] logging.py:143 >> Gradient checkpointing enabled.

[INFO|2025-04-01 09:07:50] logging.py:143 >> Using torch SDPA for faster training and inference.

[INFO|2025-04-01 09:07:50] logging.py:143 >> Upcasting trainable params to float32.

[INFO|2025-04-01 09:07:50] logging.py:143 >> Fine-tuning method: LoRA

[INFO|2025-04-01 09:07:50] logging.py:143 >> Found linear modules: q_proj,v_proj,gate_proj,down_proj,up_proj,o_proj,k_proj

[INFO|2025-04-01 09:07:50] logging.py:143 >> Set vision model not trainable: ['visual.patch_embed', 'visual.blocks'].

[INFO|2025-04-01 09:07:50] logging.py:143 >> Set multi model projector not trainable: visual.merger.

[INFO|2025-04-01 09:07:51] logging.py:143 >> trainable params: 9,232,384 || all params: 2,218,217,984 || trainable%: 0.4162

[INFO|2025-04-01 09:07:51] trainer.py:748 >> Using auto half precision backend

[WARNING|2025-04-01 09:07:51] trainer.py:783 >> No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.

[INFO|2025-04-01 09:07:51] trainer.py:2409 >> ***** Running training *****
[INFO|2025-04-01 09:07:51] trainer.py:2410 >> Num examples = 2,658
[INFO|2025-04-01 09:07:51] trainer.py:2411 >> Num Epochs = 3
[INFO|2025-04-01 09:07:51] trainer.py:2412 >> Instantaneous batch size per device = 1
[INFO|2025-04-01 09:07:51] trainer.py:2415 >> Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|2025-04-01 09:07:51] trainer.py:2416 >> Gradient Accumulation steps = 8
[INFO|2025-04-01 09:07:51] trainer.py:2417 >> Total optimization steps = 996
[INFO|2025-04-01 09:07:51] trainer.py:2418 >> Number of trainable parameters = 9,232,384
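The "trainable params" figure follows from rank-8 LoRA over the seven listed projections in each of the 28 decoder layers. A sketch of the arithmetic, using the shapes implied by the Qwen2VLConfig above (head_dim 128 and 2 KV heads, so k_proj/v_proj map 1536 to 256):

```python
# Rank-8 LoRA adds two matrices per wrapped Linear: A (r x in) and B (out x r).
# Shapes are taken from the Qwen2VLConfig dump earlier in this log.
r, layers = 8, 28
hidden, inter = 1536, 8960
head_dim = hidden // 12   # 12 attention heads -> 128
kv_out = 2 * head_dim     # 2 key/value heads  -> 256

def lora_params(in_features, out_features):
    return r * in_features + out_features * r

per_layer = (
    lora_params(hidden, hidden)    # q_proj
    + lora_params(hidden, kv_out)  # k_proj
    + lora_params(hidden, kv_out)  # v_proj
    + lora_params(hidden, hidden)  # o_proj
    + lora_params(hidden, inter)   # gate_proj
    + lora_params(hidden, inter)   # up_proj
    + lora_params(inter, hidden)   # down_proj
)
print(per_layer * layers)  # 9232384 -- matches "trainable params: 9,232,384"
```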
[INFO|2025-04-01 09:08:45] logging.py:143 >> {'loss': 2.4707, 'learning_rate': 4.9997e-05, 'epoch': 0.02, 'throughput': 108.07}
[INFO|2025-04-01 09:09:39] logging.py:143 >> {'loss': 2.2509, 'learning_rate': 4.9988e-05, 'epoch': 0.03, 'throughput': 106.09}
[INFO|2025-04-01 09:10:31] logging.py:143 >> {'loss': 1.6895, 'learning_rate': 4.9972e-05, 'epoch': 0.05, 'throughput': 106.20}
[INFO|2025-04-01 09:11:24] logging.py:143 >> {'loss': 1.4876, 'learning_rate': 4.9950e-05, 'epoch': 0.06, 'throughput': 107.12}
[INFO|2025-04-01 09:12:19] logging.py:143 >> {'loss': 1.4812, 'learning_rate': 4.9922e-05, 'epoch': 0.08, 'throughput': 108.37}
[INFO|2025-04-01 09:13:11] logging.py:143 >> {'loss': 1.3642, 'learning_rate': 4.9888e-05, 'epoch': 0.09, 'throughput': 108.89}
[INFO|2025-04-01 09:14:06] logging.py:143 >> {'loss': 1.3651, 'learning_rate': 4.9848e-05, 'epoch': 0.11, 'throughput': 109.89}
[INFO|2025-04-01 09:15:00] logging.py:143 >> {'loss': 1.1321, 'learning_rate': 4.9801e-05, 'epoch': 0.12, 'throughput': 110.22}
[INFO|2025-04-01 09:15:53] logging.py:143 >> {'loss': 1.3012, 'learning_rate': 4.9749e-05, 'epoch': 0.14, 'throughput': 110.28}
[INFO|2025-04-01 09:16:46] logging.py:143 >> {'loss': 0.9827, 'learning_rate': 4.9690e-05, 'epoch': 0.15, 'throughput': 110.24}
[INFO|2025-04-01 09:17:40] logging.py:143 >> {'loss': 1.2313, 'learning_rate': 4.9625e-05, 'epoch': 0.17, 'throughput': 110.02}
[INFO|2025-04-01 09:18:35] logging.py:143 >> {'loss': 1.0347, 'learning_rate': 4.9554e-05, 'epoch': 0.18, 'throughput': 110.10}
[INFO|2025-04-01 09:19:28] logging.py:143 >> {'loss': 1.0422, 'learning_rate': 4.9476e-05, 'epoch': 0.20, 'throughput': 110.31}
[INFO|2025-04-01 09:20:22] logging.py:143 >> {'loss': 0.9996, 'learning_rate': 4.9393e-05, 'epoch': 0.21, 'throughput': 110.36}
[INFO|2025-04-01 09:21:16] logging.py:143 >> {'loss': 1.0755, 'learning_rate': 4.9304e-05, 'epoch': 0.23, 'throughput': 110.41}
[INFO|2025-04-01 09:22:07] logging.py:143 >> {'loss': 1.0260, 'learning_rate': 4.9208e-05, 'epoch': 0.24, 'throughput': 110.15}
[INFO|2025-04-01 09:22:59] logging.py:143 >> {'loss': 1.1307, 'learning_rate': 4.9107e-05, 'epoch': 0.26, 'throughput': 110.04}
[INFO|2025-04-01 09:23:52] logging.py:143 >> {'loss': 1.0221, 'learning_rate': 4.8999e-05, 'epoch': 0.27, 'throughput': 109.93}
[INFO|2025-04-01 09:24:43] logging.py:143 >> {'loss': 1.0120, 'learning_rate': 4.8886e-05, 'epoch': 0.29, 'throughput': 109.96}
[INFO|2025-04-01 09:25:34] logging.py:143 >> {'loss': 1.0151, 'learning_rate': 4.8767e-05, 'epoch': 0.30, 'throughput': 109.77}

[INFO|2025-04-01 09:25:34] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-100

[INFO|2025-04-01 09:25:34] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 09:25:35] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-100/tokenizer_config.json
[INFO|2025-04-01 09:25:35] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-100/special_tokens_map.json

[INFO|2025-04-01 09:26:26] logging.py:143 >> {'loss': 1.0028, 'learning_rate': 4.8641e-05, 'epoch': 0.32, 'throughput': 109.30}
[INFO|2025-04-01 09:27:18] logging.py:143 >> {'loss': 1.1430, 'learning_rate': 4.8510e-05, 'epoch': 0.33, 'throughput': 109.20}
[INFO|2025-04-01 09:28:09] logging.py:143 >> {'loss': 0.9695, 'learning_rate': 4.8373e-05, 'epoch': 0.35, 'throughput': 109.18}
[INFO|2025-04-01 09:29:04] logging.py:143 >> {'loss': 0.9017, 'learning_rate': 4.8230e-05, 'epoch': 0.36, 'throughput': 109.34}
[INFO|2025-04-01 09:29:56] logging.py:143 >> {'loss': 1.0350, 'learning_rate': 4.8082e-05, 'epoch': 0.38, 'throughput': 109.29}
[INFO|2025-04-01 09:30:48] logging.py:143 >> {'loss': 1.0128, 'learning_rate': 4.7928e-05, 'epoch': 0.39, 'throughput': 109.25}
[INFO|2025-04-01 09:31:41] logging.py:143 >> {'loss': 0.9432, 'learning_rate': 4.7768e-05, 'epoch': 0.41, 'throughput': 109.32}
[INFO|2025-04-01 09:32:35] logging.py:143 >> {'loss': 1.0344, 'learning_rate': 4.7602e-05, 'epoch': 0.42, 'throughput': 109.50}
[INFO|2025-04-01 09:33:27] logging.py:143 >> {'loss': 0.9452, 'learning_rate': 4.7431e-05, 'epoch': 0.44, 'throughput': 109.55}
[INFO|2025-04-01 09:34:19] logging.py:143 >> {'loss': 0.9559, 'learning_rate': 4.7254e-05, 'epoch': 0.45, 'throughput': 109.53}
[INFO|2025-04-01 09:35:12] logging.py:143 >> {'loss': 0.9726, 'learning_rate': 4.7071e-05, 'epoch': 0.47, 'throughput': 109.60}
[INFO|2025-04-01 09:36:04] logging.py:143 >> {'loss': 0.9344, 'learning_rate': 4.6883e-05, 'epoch': 0.48, 'throughput': 109.48}
[INFO|2025-04-01 09:36:56] logging.py:143 >> {'loss': 0.9497, 'learning_rate': 4.6690e-05, 'epoch': 0.50, 'throughput': 109.40}
[INFO|2025-04-01 09:37:52] logging.py:143 >> {'loss': 1.0570, 'learning_rate': 4.6491e-05, 'epoch': 0.51, 'throughput': 109.59}
[INFO|2025-04-01 09:38:44] logging.py:143 >> {'loss': 0.9847, 'learning_rate': 4.6287e-05, 'epoch': 0.53, 'throughput': 109.65}
[INFO|2025-04-01 09:39:36] logging.py:143 >> {'loss': 1.0010, 'learning_rate': 4.6078e-05, 'epoch': 0.54, 'throughput': 109.52}
[INFO|2025-04-01 09:40:29] logging.py:143 >> {'loss': 0.9384, 'learning_rate': 4.5863e-05, 'epoch': 0.56, 'throughput': 109.56}
[INFO|2025-04-01 09:41:23] logging.py:143 >> {'loss': 1.0312, 'learning_rate': 4.5643e-05, 'epoch': 0.57, 'throughput': 109.68}
[INFO|2025-04-01 09:42:16] logging.py:143 >> {'loss': 0.9112, 'learning_rate': 4.5418e-05, 'epoch': 0.59, 'throughput': 109.70}
[INFO|2025-04-01 09:43:08] logging.py:143 >> {'loss': 0.9967, 'learning_rate': 4.5188e-05, 'epoch': 0.60, 'throughput': 109.66}

[INFO|2025-04-01 09:43:08] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-200

[INFO|2025-04-01 09:43:08] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 09:43:09] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-200/tokenizer_config.json
[INFO|2025-04-01 09:43:09] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-200/special_tokens_map.json

[INFO|2025-04-01 09:44:02] logging.py:143 >> {'loss': 1.0905, 'learning_rate': 4.4953e-05, 'epoch': 0.62, 'throughput': 109.64}
[INFO|2025-04-01 09:44:55] logging.py:143 >> {'loss': 0.9487, 'learning_rate': 4.4713e-05, 'epoch': 0.63, 'throughput': 109.56}
[INFO|2025-04-01 09:45:47] logging.py:143 >> {'loss': 0.8675, 'learning_rate': 4.4468e-05, 'epoch': 0.65, 'throughput': 109.56}
[INFO|2025-04-01 09:46:40] logging.py:143 >> {'loss': 0.8624, 'learning_rate': 4.4219e-05, 'epoch': 0.66, 'throughput': 109.61}
[INFO|2025-04-01 09:47:35] logging.py:143 >> {'loss': 1.0489, 'learning_rate': 4.3964e-05, 'epoch': 0.68, 'throughput': 109.74}
[INFO|2025-04-01 09:48:27] logging.py:143 >> {'loss': 0.9139, 'learning_rate': 4.3705e-05, 'epoch': 0.69, 'throughput': 109.68}
[INFO|2025-04-01 09:49:19] logging.py:143 >> {'loss': 0.9905, 'learning_rate': 4.3441e-05, 'epoch': 0.71, 'throughput': 109.62}
[INFO|2025-04-01 09:50:13] logging.py:143 >> {'loss': 0.8974, 'learning_rate': 4.3172e-05, 'epoch': 0.72, 'throughput': 109.66}
[INFO|2025-04-01 09:51:06] logging.py:143 >> {'loss': 0.9990, 'learning_rate': 4.2899e-05, 'epoch': 0.74, 'throughput': 109.68}
[INFO|2025-04-01 09:51:59] logging.py:143 >> {'loss': 0.9916, 'learning_rate': 4.2622e-05, 'epoch': 0.75, 'throughput': 109.66}
[INFO|2025-04-01 09:52:51] logging.py:143 >> {'loss': 0.9242, 'learning_rate': 4.2340e-05, 'epoch': 0.77, 'throughput': 109.54}
[INFO|2025-04-01 09:53:45] logging.py:143 >> {'loss': 1.0426, 'learning_rate': 4.2054e-05, 'epoch': 0.78, 'throughput': 109.56}
[INFO|2025-04-01 09:54:37] logging.py:143 >> {'loss': 0.8625, 'learning_rate': 4.1763e-05, 'epoch': 0.80, 'throughput': 109.50}
[INFO|2025-04-01 09:55:30] logging.py:143 >> {'loss': 0.9959, 'learning_rate': 4.1469e-05, 'epoch': 0.81, 'throughput': 109.51}
[INFO|2025-04-01 09:56:23] logging.py:143 >> {'loss': 0.9390, 'learning_rate': 4.1170e-05, 'epoch': 0.83, 'throughput': 109.57}
[INFO|2025-04-01 09:57:16] logging.py:143 >> {'loss': 0.9741, 'learning_rate': 4.0867e-05, 'epoch': 0.84, 'throughput': 109.61}
[INFO|2025-04-01 09:58:10] logging.py:143 >> {'loss': 0.9800, 'learning_rate': 4.0561e-05, 'epoch': 0.86, 'throughput': 109.62}
[INFO|2025-04-01 09:59:02] logging.py:143 >> {'loss': 0.8898, 'learning_rate': 4.0250e-05, 'epoch': 0.87, 'throughput': 109.54}
[INFO|2025-04-01 09:59:57] logging.py:143 >> {'loss': 0.9530, 'learning_rate': 3.9936e-05, 'epoch': 0.89, 'throughput': 109.67}
[INFO|2025-04-01 10:00:50] logging.py:143 >> {'loss': 0.9311, 'learning_rate': 3.9618e-05, 'epoch': 0.90, 'throughput': 109.75}

[INFO|2025-04-01 10:00:50] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-300

[INFO|2025-04-01 10:00:51] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 10:00:51] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-300/tokenizer_config.json
[INFO|2025-04-01 10:00:51] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-300/special_tokens_map.json

[INFO|2025-04-01 10:01:43] logging.py:143 >> {'loss': 0.9114, 'learning_rate': 3.9296e-05, 'epoch': 0.92, 'throughput': 109.73}
[INFO|2025-04-01 10:02:37] logging.py:143 >> {'loss': 0.9674, 'learning_rate': 3.8971e-05, 'epoch': 0.93, 'throughput': 109.85}
[INFO|2025-04-01 10:03:30] logging.py:143 >> {'loss': 0.9582, 'learning_rate': 3.8642e-05, 'epoch': 0.95, 'throughput': 109.86}
[INFO|2025-04-01 10:04:22] logging.py:143 >> {'loss': 0.9863, 'learning_rate': 3.8310e-05, 'epoch': 0.96, 'throughput': 109.84}
[INFO|2025-04-01 10:05:16] logging.py:143 >> {'loss': 0.9060, 'learning_rate': 3.7975e-05, 'epoch': 0.98, 'throughput': 109.89}
[INFO|2025-04-01 10:06:09] logging.py:143 >> {'loss': 0.8958, 'learning_rate': 3.7636e-05, 'epoch': 0.99, 'throughput': 109.87}
[INFO|2025-04-01 10:06:54] logging.py:143 >> {'loss': 0.8349, 'learning_rate': 3.7295e-05, 'epoch': 1.01, 'throughput': 109.89}
[INFO|2025-04-01 10:07:46] logging.py:143 >> {'loss': 0.8507, 'learning_rate': 3.6950e-05, 'epoch': 1.02, 'throughput': 109.86}
[INFO|2025-04-01 10:08:39] logging.py:143 >> {'loss': 0.9287, 'learning_rate': 3.6602e-05, 'epoch': 1.04, 'throughput': 109.88}
[INFO|2025-04-01 10:09:32] logging.py:143 >> {'loss': 0.9107, 'learning_rate': 3.6251e-05, 'epoch': 1.05, 'throughput': 109.91}
[INFO|2025-04-01 10:10:25] logging.py:143 >> {'loss': 0.9520, 'learning_rate': 3.5898e-05, 'epoch': 1.07, 'throughput': 109.93}
[INFO|2025-04-01 10:11:18] logging.py:143 >> {'loss': 0.9526, 'learning_rate': 3.5542e-05, 'epoch': 1.08, 'throughput': 109.92}
[INFO|2025-04-01 10:12:11] logging.py:143 >> {'loss': 0.8775, 'learning_rate': 3.5183e-05, 'epoch': 1.10, 'throughput': 109.90}
[INFO|2025-04-01 10:13:05] logging.py:143 >> {'loss': 0.8830, 'learning_rate': 3.4821e-05, 'epoch': 1.11, 'throughput': 109.94}
[INFO|2025-04-01 10:13:58] logging.py:143 >> {'loss': 1.0032, 'learning_rate': 3.4458e-05, 'epoch': 1.13, 'throughput': 109.93}
[INFO|2025-04-01 10:14:51] logging.py:143 >> {'loss': 0.9430, 'learning_rate': 3.4092e-05, 'epoch': 1.14, 'throughput': 109.94}
[INFO|2025-04-01 10:15:43] logging.py:143 >> {'loss': 0.8010, 'learning_rate': 3.3723e-05, 'epoch': 1.16, 'throughput': 109.91}
[INFO|2025-04-01 10:16:35] logging.py:143 >> {'loss': 0.9294, 'learning_rate': 3.3353e-05, 'epoch': 1.17, 'throughput': 109.91}
[INFO|2025-04-01 10:17:28] logging.py:143 >> {'loss': 0.9528, 'learning_rate': 3.2980e-05, 'epoch': 1.19, 'throughput': 109.96}
[INFO|2025-04-01 10:18:20] logging.py:143 >> {'loss': 0.8981, 'learning_rate': 3.2605e-05, 'epoch': 1.20, 'throughput': 109.93}

[INFO|2025-04-01 10:18:21] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-400

[INFO|2025-04-01 10:18:21] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 10:18:21] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-400/tokenizer_config.json
[INFO|2025-04-01 10:18:21] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-400/special_tokens_map.json

[INFO|2025-04-01 10:19:15] logging.py:143 >> {'loss': 0.9823, 'learning_rate': 3.2229e-05, 'epoch': 1.22, 'throughput': 109.94}
[INFO|2025-04-01 10:20:06] logging.py:143 >> {'loss': 0.9047, 'learning_rate': 3.1850e-05, 'epoch': 1.23, 'throughput': 109.86}
[INFO|2025-04-01 10:20:58] logging.py:143 >> {'loss': 0.8582, 'learning_rate': 3.1470e-05, 'epoch': 1.25, 'throughput': 109.84}
[INFO|2025-04-01 10:21:50] logging.py:143 >> {'loss': 0.8787, 'learning_rate': 3.1089e-05, 'epoch': 1.26, 'throughput': 109.85}
[INFO|2025-04-01 10:22:42] logging.py:143 >> {'loss': 0.8729, 'learning_rate': 3.0706e-05, 'epoch': 1.28, 'throughput': 109.85}
[INFO|2025-04-01 10:23:37] logging.py:143 >> {'loss': 0.8772, 'learning_rate': 3.0321e-05, 'epoch': 1.29, 'throughput': 109.95}
[INFO|2025-04-01 10:24:28] logging.py:143 >> {'loss': 0.9451, 'learning_rate': 2.9935e-05, 'epoch': 1.31, 'throughput': 109.94}
[INFO|2025-04-01 10:25:22] logging.py:143 >> {'loss': 0.8202, 'learning_rate': 2.9548e-05, 'epoch': 1.32, 'throughput': 110.03}
[INFO|2025-04-01 10:26:14] logging.py:143 >> {'loss': 0.9773, 'learning_rate': 2.9160e-05, 'epoch': 1.34, 'throughput': 110.05}
[INFO|2025-04-01 10:27:04] logging.py:143 >> {'loss': 0.9101, 'learning_rate': 2.8771e-05, 'epoch': 1.35, 'throughput': 109.98}
[INFO|2025-04-01 10:27:57] logging.py:143 >> {'loss': 0.9633, 'learning_rate': 2.8380e-05, 'epoch': 1.37, 'throughput': 110.01}
[INFO|2025-04-01 10:28:47] logging.py:143 >> {'loss': 0.8886, 'learning_rate': 2.7989e-05, 'epoch': 1.38, 'throughput': 109.96}
[INFO|2025-04-01 10:29:39] logging.py:143 >> {'loss': 0.9258, 'learning_rate': 2.7598e-05, 'epoch': 1.40, 'throughput': 109.96}
[INFO|2025-04-01 10:30:30] logging.py:143 >> {'loss': 0.9039, 'learning_rate': 2.7205e-05, 'epoch': 1.41, 'throughput': 109.93}
[INFO|2025-04-01 10:31:24] logging.py:143 >> {'loss': 1.0116, 'learning_rate': 2.6812e-05, 'epoch': 1.43, 'throughput': 109.99}
[INFO|2025-04-01 10:32:14] logging.py:143 >> {'loss': 0.8218, 'learning_rate': 2.6419e-05, 'epoch': 1.44, 'throughput': 109.97}
[INFO|2025-04-01 10:33:07] logging.py:143 >> {'loss': 0.8604, 'learning_rate': 2.6025e-05, 'epoch': 1.46, 'throughput': 109.96}
[INFO|2025-04-01 10:33:59] logging.py:143 >> {'loss': 0.8044, 'learning_rate': 2.5631e-05, 'epoch': 1.47, 'throughput': 110.00}
[INFO|2025-04-01 10:34:51] logging.py:143 >> {'loss': 0.9198, 'learning_rate': 2.5237e-05, 'epoch': 1.49, 'throughput': 109.98}
[INFO|2025-04-01 10:35:45] logging.py:143 >> {'loss': 0.9181, 'learning_rate': 2.4842e-05, 'epoch': 1.50, 'throughput': 109.99}
| 813 |
+
[INFO|2025-04-01 10:35:45] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-500
|
| 814 |
+
|
| 815 |
+
[INFO|2025-04-01 10:35:45] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
|
| 816 |
+
|
| 817 |
+
[INFO|2025-04-01 10:35:45] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-500/tokenizer_config.json
[INFO|2025-04-01 10:35:45] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-500/special_tokens_map.json
[INFO|2025-04-01 10:36:39] logging.py:143 >> {'loss': 0.8644, 'learning_rate': 2.4448e-05, 'epoch': 1.52, 'throughput': 110.01}
[INFO|2025-04-01 10:37:33] logging.py:143 >> {'loss': 1.0127, 'learning_rate': 2.4054e-05, 'epoch': 1.53, 'throughput': 110.05}
[INFO|2025-04-01 10:38:24] logging.py:143 >> {'loss': 0.7937, 'learning_rate': 2.3660e-05, 'epoch': 1.55, 'throughput': 110.02}
[INFO|2025-04-01 10:39:15] logging.py:143 >> {'loss': 0.9806, 'learning_rate': 2.3267e-05, 'epoch': 1.56, 'throughput': 110.01}
[INFO|2025-04-01 10:40:09] logging.py:143 >> {'loss': 0.9340, 'learning_rate': 2.2873e-05, 'epoch': 1.58, 'throughput': 110.08}
[INFO|2025-04-01 10:41:02] logging.py:143 >> {'loss': 0.9288, 'learning_rate': 2.2481e-05, 'epoch': 1.59, 'throughput': 110.09}
[INFO|2025-04-01 10:41:55] logging.py:143 >> {'loss': 0.8597, 'learning_rate': 2.2089e-05, 'epoch': 1.61, 'throughput': 110.07}
[INFO|2025-04-01 10:42:48] logging.py:143 >> {'loss': 0.8817, 'learning_rate': 2.1698e-05, 'epoch': 1.62, 'throughput': 110.10}
[INFO|2025-04-01 10:43:41] logging.py:143 >> {'loss': 0.7770, 'learning_rate': 2.1307e-05, 'epoch': 1.64, 'throughput': 110.13}
[INFO|2025-04-01 10:44:34] logging.py:143 >> {'loss': 0.7980, 'learning_rate': 2.0918e-05, 'epoch': 1.65, 'throughput': 110.11}
[INFO|2025-04-01 10:45:28] logging.py:143 >> {'loss': 0.9104, 'learning_rate': 2.0529e-05, 'epoch': 1.67, 'throughput': 110.17}
[INFO|2025-04-01 10:46:22] logging.py:143 >> {'loss': 0.8293, 'learning_rate': 2.0142e-05, 'epoch': 1.68, 'throughput': 110.26}
[INFO|2025-04-01 10:47:14] logging.py:143 >> {'loss': 0.8821, 'learning_rate': 1.9756e-05, 'epoch': 1.70, 'throughput': 110.23}
[INFO|2025-04-01 10:48:07] logging.py:143 >> {'loss': 0.8253, 'learning_rate': 1.9371e-05, 'epoch': 1.71, 'throughput': 110.27}
[INFO|2025-04-01 10:48:59] logging.py:143 >> {'loss': 0.9391, 'learning_rate': 1.8988e-05, 'epoch': 1.73, 'throughput': 110.25}
[INFO|2025-04-01 10:49:52] logging.py:143 >> {'loss': 0.8711, 'learning_rate': 1.8606e-05, 'epoch': 1.74, 'throughput': 110.28}
[INFO|2025-04-01 10:50:44] logging.py:143 >> {'loss': 0.8346, 'learning_rate': 1.8225e-05, 'epoch': 1.76, 'throughput': 110.26}
[INFO|2025-04-01 10:51:36] logging.py:143 >> {'loss': 0.8275, 'learning_rate': 1.7847e-05, 'epoch': 1.77, 'throughput': 110.27}
[INFO|2025-04-01 10:52:27] logging.py:143 >> {'loss': 0.9435, 'learning_rate': 1.7470e-05, 'epoch': 1.79, 'throughput': 110.27}
[INFO|2025-04-01 10:53:20] logging.py:143 >> {'loss': 0.8584, 'learning_rate': 1.7095e-05, 'epoch': 1.80, 'throughput': 110.29}
[INFO|2025-04-01 10:53:20] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-600
[INFO|2025-04-01 10:53:20] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 10:53:21] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-600/tokenizer_config.json
[INFO|2025-04-01 10:53:21] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-600/special_tokens_map.json
[INFO|2025-04-01 10:54:14] logging.py:143 >> {'loss': 0.8800, 'learning_rate': 1.6722e-05, 'epoch': 1.82, 'throughput': 110.30}
[INFO|2025-04-01 10:55:06] logging.py:143 >> {'loss': 0.8825, 'learning_rate': 1.6351e-05, 'epoch': 1.83, 'throughput': 110.30}
[INFO|2025-04-01 10:56:00] logging.py:143 >> {'loss': 0.9978, 'learning_rate': 1.5982e-05, 'epoch': 1.85, 'throughput': 110.33}
[INFO|2025-04-01 10:56:54] logging.py:143 >> {'loss': 0.9626, 'learning_rate': 1.5615e-05, 'epoch': 1.86, 'throughput': 110.38}
[INFO|2025-04-01 10:57:45] logging.py:143 >> {'loss': 0.9308, 'learning_rate': 1.5251e-05, 'epoch': 1.88, 'throughput': 110.37}
[INFO|2025-04-01 10:58:39] logging.py:143 >> {'loss': 0.9757, 'learning_rate': 1.4889e-05, 'epoch': 1.89, 'throughput': 110.38}
[INFO|2025-04-01 10:59:33] logging.py:143 >> {'loss': 0.7670, 'learning_rate': 1.4530e-05, 'epoch': 1.91, 'throughput': 110.43}
[INFO|2025-04-01 11:00:23] logging.py:143 >> {'loss': 0.9272, 'learning_rate': 1.4173e-05, 'epoch': 1.92, 'throughput': 110.40}
[INFO|2025-04-01 11:01:16] logging.py:143 >> {'loss': 0.7941, 'learning_rate': 1.3819e-05, 'epoch': 1.94, 'throughput': 110.41}
[INFO|2025-04-01 11:02:08] logging.py:143 >> {'loss': 0.8408, 'learning_rate': 1.3468e-05, 'epoch': 1.95, 'throughput': 110.38}
[INFO|2025-04-01 11:03:00] logging.py:143 >> {'loss': 0.8459, 'learning_rate': 1.3120e-05, 'epoch': 1.97, 'throughput': 110.35}
[INFO|2025-04-01 11:03:52] logging.py:143 >> {'loss': 1.0117, 'learning_rate': 1.2774e-05, 'epoch': 1.98, 'throughput': 110.36}
[INFO|2025-04-01 11:04:43] logging.py:143 >> {'loss': 0.9665, 'learning_rate': 1.2432e-05, 'epoch': 2.00, 'throughput': 110.32}
[INFO|2025-04-01 11:05:28] logging.py:143 >> {'loss': 0.7625, 'learning_rate': 1.2093e-05, 'epoch': 2.01, 'throughput': 110.34}
[INFO|2025-04-01 11:06:21] logging.py:143 >> {'loss': 0.8667, 'learning_rate': 1.1756e-05, 'epoch': 2.03, 'throughput': 110.33}
[INFO|2025-04-01 11:07:14] logging.py:143 >> {'loss': 0.8297, 'learning_rate': 1.1424e-05, 'epoch': 2.04, 'throughput': 110.34}
[INFO|2025-04-01 11:08:06] logging.py:143 >> {'loss': 0.8774, 'learning_rate': 1.1094e-05, 'epoch': 2.06, 'throughput': 110.35}
[INFO|2025-04-01 11:08:59] logging.py:143 >> {'loss': 0.8476, 'learning_rate': 1.0768e-05, 'epoch': 2.07, 'throughput': 110.37}
[INFO|2025-04-01 11:09:51] logging.py:143 >> {'loss': 0.8641, 'learning_rate': 1.0446e-05, 'epoch': 2.09, 'throughput': 110.35}
[INFO|2025-04-01 11:10:44] logging.py:143 >> {'loss': 0.8383, 'learning_rate': 1.0127e-05, 'epoch': 2.10, 'throughput': 110.36}
[INFO|2025-04-01 11:10:44] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-700
[INFO|2025-04-01 11:10:45] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 11:10:45] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-700/tokenizer_config.json
[INFO|2025-04-01 11:10:45] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-700/special_tokens_map.json
[INFO|2025-04-01 11:11:38] logging.py:143 >> {'loss': 0.9123, 'learning_rate': 9.8123e-06, 'epoch': 2.12, 'throughput': 110.32}
[INFO|2025-04-01 11:12:28] logging.py:143 >> {'loss': 0.9635, 'learning_rate': 9.5010e-06, 'epoch': 2.13, 'throughput': 110.28}
[INFO|2025-04-01 11:13:21] logging.py:143 >> {'loss': 0.9221, 'learning_rate': 9.1936e-06, 'epoch': 2.15, 'throughput': 110.30}
[INFO|2025-04-01 11:14:16] logging.py:143 >> {'loss': 0.8757, 'learning_rate': 8.8901e-06, 'epoch': 2.16, 'throughput': 110.34}
[INFO|2025-04-01 11:15:08] logging.py:143 >> {'loss': 0.7958, 'learning_rate': 8.5906e-06, 'epoch': 2.18, 'throughput': 110.36}
[INFO|2025-04-01 11:16:01] logging.py:143 >> {'loss': 0.7993, 'learning_rate': 8.2952e-06, 'epoch': 2.19, 'throughput': 110.36}
[INFO|2025-04-01 11:16:55] logging.py:143 >> {'loss': 0.8436, 'learning_rate': 8.0039e-06, 'epoch': 2.21, 'throughput': 110.37}
[INFO|2025-04-01 11:17:48] logging.py:143 >> {'loss': 0.8960, 'learning_rate': 7.7169e-06, 'epoch': 2.22, 'throughput': 110.38}
[INFO|2025-04-01 11:18:39] logging.py:143 >> {'loss': 0.8948, 'learning_rate': 7.4342e-06, 'epoch': 2.24, 'throughput': 110.35}
[INFO|2025-04-01 11:19:31] logging.py:143 >> {'loss': 0.8546, 'learning_rate': 7.1558e-06, 'epoch': 2.25, 'throughput': 110.33}
[INFO|2025-04-01 11:20:23] logging.py:143 >> {'loss': 0.8494, 'learning_rate': 6.8819e-06, 'epoch': 2.27, 'throughput': 110.35}
[INFO|2025-04-01 11:21:17] logging.py:143 >> {'loss': 0.7723, 'learning_rate': 6.6125e-06, 'epoch': 2.28, 'throughput': 110.40}
[INFO|2025-04-01 11:22:08] logging.py:143 >> {'loss': 0.9168, 'learning_rate': 6.3477e-06, 'epoch': 2.30, 'throughput': 110.37}
[INFO|2025-04-01 11:23:01] logging.py:143 >> {'loss': 0.8831, 'learning_rate': 6.0875e-06, 'epoch': 2.31, 'throughput': 110.38}
[INFO|2025-04-01 11:23:52] logging.py:143 >> {'loss': 0.8540, 'learning_rate': 5.8320e-06, 'epoch': 2.33, 'throughput': 110.32}
[INFO|2025-04-01 11:24:43] logging.py:143 >> {'loss': 0.8843, 'learning_rate': 5.5813e-06, 'epoch': 2.34, 'throughput': 110.32}
[INFO|2025-04-01 11:25:36] logging.py:143 >> {'loss': 0.8620, 'learning_rate': 5.3354e-06, 'epoch': 2.36, 'throughput': 110.34}
[INFO|2025-04-01 11:26:29] logging.py:143 >> {'loss': 0.9850, 'learning_rate': 5.0944e-06, 'epoch': 2.37, 'throughput': 110.34}
[INFO|2025-04-01 11:27:20] logging.py:143 >> {'loss': 0.7679, 'learning_rate': 4.8583e-06, 'epoch': 2.39, 'throughput': 110.31}
[INFO|2025-04-01 11:28:11] logging.py:143 >> {'loss': 0.8198, 'learning_rate': 4.6273e-06, 'epoch': 2.40, 'throughput': 110.30}
[INFO|2025-04-01 11:28:11] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-800
[INFO|2025-04-01 11:28:12] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 11:28:12] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-800/tokenizer_config.json
[INFO|2025-04-01 11:28:12] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-800/special_tokens_map.json
[INFO|2025-04-01 11:29:06] logging.py:143 >> {'loss': 0.7773, 'learning_rate': 4.4013e-06, 'epoch': 2.42, 'throughput': 110.28}
[INFO|2025-04-01 11:29:58] logging.py:143 >> {'loss': 0.9312, 'learning_rate': 4.1805e-06, 'epoch': 2.43, 'throughput': 110.27}
[INFO|2025-04-01 11:30:50] logging.py:143 >> {'loss': 0.8497, 'learning_rate': 3.9648e-06, 'epoch': 2.45, 'throughput': 110.28}
[INFO|2025-04-01 11:31:43] logging.py:143 >> {'loss': 0.7820, 'learning_rate': 3.7543e-06, 'epoch': 2.46, 'throughput': 110.29}
[INFO|2025-04-01 11:32:36] logging.py:143 >> {'loss': 0.8937, 'learning_rate': 3.5492e-06, 'epoch': 2.48, 'throughput': 110.31}
[INFO|2025-04-01 11:33:29] logging.py:143 >> {'loss': 0.7039, 'learning_rate': 3.3494e-06, 'epoch': 2.49, 'throughput': 110.36}
[INFO|2025-04-01 11:34:22] logging.py:143 >> {'loss': 0.9265, 'learning_rate': 3.1549e-06, 'epoch': 2.51, 'throughput': 110.36}
[INFO|2025-04-01 11:35:13] logging.py:143 >> {'loss': 0.8669, 'learning_rate': 2.9659e-06, 'epoch': 2.52, 'throughput': 110.36}
[INFO|2025-04-01 11:36:05] logging.py:143 >> {'loss': 0.9174, 'learning_rate': 2.7824e-06, 'epoch': 2.54, 'throughput': 110.37}
[INFO|2025-04-01 11:36:58] logging.py:143 >> {'loss': 0.8718, 'learning_rate': 2.6044e-06, 'epoch': 2.55, 'throughput': 110.38}
[INFO|2025-04-01 11:37:50] logging.py:143 >> {'loss': 0.8634, 'learning_rate': 2.4320e-06, 'epoch': 2.57, 'throughput': 110.37}
[INFO|2025-04-01 11:38:43] logging.py:143 >> {'loss': 0.8450, 'learning_rate': 2.2652e-06, 'epoch': 2.58, 'throughput': 110.38}
[INFO|2025-04-01 11:39:34] logging.py:143 >> {'loss': 0.8008, 'learning_rate': 2.1040e-06, 'epoch': 2.60, 'throughput': 110.36}
[INFO|2025-04-01 11:40:26] logging.py:143 >> {'loss': 0.8797, 'learning_rate': 1.9485e-06, 'epoch': 2.61, 'throughput': 110.36}
[INFO|2025-04-01 11:41:19] logging.py:143 >> {'loss': 0.9460, 'learning_rate': 1.7988e-06, 'epoch': 2.63, 'throughput': 110.37}
[INFO|2025-04-01 11:42:10] logging.py:143 >> {'loss': 0.8032, 'learning_rate': 1.6548e-06, 'epoch': 2.64, 'throughput': 110.36}
[INFO|2025-04-01 11:43:02] logging.py:143 >> {'loss': 0.8892, 'learning_rate': 1.5167e-06, 'epoch': 2.66, 'throughput': 110.37}
[INFO|2025-04-01 11:43:56] logging.py:143 >> {'loss': 0.8560, 'learning_rate': 1.3844e-06, 'epoch': 2.67, 'throughput': 110.39}
[INFO|2025-04-01 11:44:49] logging.py:143 >> {'loss': 0.8617, 'learning_rate': 1.2579e-06, 'epoch': 2.69, 'throughput': 110.43}
[INFO|2025-04-01 11:45:43] logging.py:143 >> {'loss': 0.9117, 'learning_rate': 1.1374e-06, 'epoch': 2.70, 'throughput': 110.46}
[INFO|2025-04-01 11:45:43] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-900
[INFO|2025-04-01 11:45:44] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 11:45:44] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-900/tokenizer_config.json
[INFO|2025-04-01 11:45:44] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-900/special_tokens_map.json
[INFO|2025-04-01 11:46:37] logging.py:143 >> {'loss': 0.7855, 'learning_rate': 1.0228e-06, 'epoch': 2.72, 'throughput': 110.44}
[INFO|2025-04-01 11:47:29] logging.py:143 >> {'loss': 0.8212, 'learning_rate': 9.1416e-07, 'epoch': 2.73, 'throughput': 110.42}
[INFO|2025-04-01 11:48:22] logging.py:143 >> {'loss': 0.8404, 'learning_rate': 8.1152e-07, 'epoch': 2.75, 'throughput': 110.45}
[INFO|2025-04-01 11:49:15] logging.py:143 >> {'loss': 0.7782, 'learning_rate': 7.1489e-07, 'epoch': 2.76, 'throughput': 110.45}
[INFO|2025-04-01 11:50:08] logging.py:143 >> {'loss': 0.7847, 'learning_rate': 6.2430e-07, 'epoch': 2.78, 'throughput': 110.45}
[INFO|2025-04-01 11:51:01] logging.py:143 >> {'loss': 0.8857, 'learning_rate': 5.3977e-07, 'epoch': 2.79, 'throughput': 110.46}
[INFO|2025-04-01 11:51:54] logging.py:143 >> {'loss': 0.8029, 'learning_rate': 4.6133e-07, 'epoch': 2.81, 'throughput': 110.48}
[INFO|2025-04-01 11:52:46] logging.py:143 >> {'loss': 0.8154, 'learning_rate': 3.8899e-07, 'epoch': 2.82, 'throughput': 110.45}
[INFO|2025-04-01 11:53:39] logging.py:143 >> {'loss': 0.8791, 'learning_rate': 3.2277e-07, 'epoch': 2.84, 'throughput': 110.45}
[INFO|2025-04-01 11:54:32] logging.py:143 >> {'loss': 0.7870, 'learning_rate': 2.6269e-07, 'epoch': 2.85, 'throughput': 110.46}
[INFO|2025-04-01 11:55:26] logging.py:143 >> {'loss': 0.8831, 'learning_rate': 2.0876e-07, 'epoch': 2.87, 'throughput': 110.49}
[INFO|2025-04-01 11:56:19] logging.py:143 >> {'loss': 0.7677, 'learning_rate': 1.6100e-07, 'epoch': 2.88, 'throughput': 110.49}
[INFO|2025-04-01 11:57:10] logging.py:143 >> {'loss': 0.7567, 'learning_rate': 1.1942e-07, 'epoch': 2.90, 'throughput': 110.47}
[INFO|2025-04-01 11:58:02] logging.py:143 >> {'loss': 0.8944, 'learning_rate': 8.4022e-08, 'epoch': 2.91, 'throughput': 110.46}
[INFO|2025-04-01 11:58:53] logging.py:143 >> {'loss': 0.9737, 'learning_rate': 5.4824e-08, 'epoch': 2.93, 'throughput': 110.44}
[INFO|2025-04-01 11:59:45] logging.py:143 >> {'loss': 0.8965, 'learning_rate': 3.1830e-08, 'epoch': 2.95, 'throughput': 110.44}
[INFO|2025-04-01 12:00:37] logging.py:143 >> {'loss': 0.8370, 'learning_rate': 1.5046e-08, 'epoch': 2.96, 'throughput': 110.41}
[INFO|2025-04-01 12:01:29] logging.py:143 >> {'loss': 0.7812, 'learning_rate': 4.4769e-09, 'epoch': 2.98, 'throughput': 110.40}
[INFO|2025-04-01 12:02:24] logging.py:143 >> {'loss': 0.8613, 'learning_rate': 1.2436e-10, 'epoch': 2.99, 'throughput': 110.45}
[INFO|2025-04-01 12:02:35] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-996
[INFO|2025-04-01 12:02:35] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 12:02:35] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-996/tokenizer_config.json
[INFO|2025-04-01 12:02:35] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/checkpoint-996/special_tokens_map.json
[INFO|2025-04-01 12:02:36] trainer.py:2665 >>

Training completed. Do not forget to share your model on huggingface.co/models =)

[INFO|2025-04-01 12:02:36] trainer.py:3966 >> Saving model checkpoint to saves/Custom/lora/train_2025-04-01-09-06-36
[INFO|2025-04-01 12:02:37] configuration_utils.py:699 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--prithivMLmods--Qwen2-VL-OCR-2B-Instruct/snapshots/a54254d5cc9f82e1c362db82adede275d20bbc6b/config.json
[INFO|2025-04-01 12:02:37] tokenization_utils_base.py:2510 >> tokenizer config file saved in saves/Custom/lora/train_2025-04-01-09-06-36/tokenizer_config.json
[INFO|2025-04-01 12:02:37] tokenization_utils_base.py:2519 >> Special tokens file saved in saves/Custom/lora/train_2025-04-01-09-06-36/special_tokens_map.json
[WARNING|2025-04-01 12:02:37] logging.py:148 >> No metric eval_loss to plot.
[WARNING|2025-04-01 12:02:37] logging.py:148 >> No metric eval_accuracy to plot.
[INFO|2025-04-01 12:02:37] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
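The log above ends with the trained LoRA adapter written to saves/Custom/lora/train_2025-04-01-09-06-36. A minimal sketch (not part of this upload) of how such an adapter could be loaded back onto the base model with PEFT; the local path and dtype here are illustrative:

    from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
    from peft import PeftModel

    # Load the frozen base model the run started from.
    base = Qwen2VLForConditionalGeneration.from_pretrained(
        "prithivMLmods/Qwen2-VL-OCR-2B-Instruct", torch_dtype="bfloat16"
    )
    processor = AutoProcessor.from_pretrained("prithivMLmods/Qwen2-VL-OCR-2B-Instruct")

    # Attach the trained LoRA weights on top of the base model.
    model = PeftModel.from_pretrained(base, "saves/Custom/lora/train_2025-04-01-09-06-36")
    # Optionally fold the adapter into the base weights for standalone inference.
    model = model.merge_and_unload()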
special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
{
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7d044cddc0af2b81635b0de71dba0a4a4d494dc953a5febbf525672df5af2e23
size 11420365
tokenizer_config.json
ADDED
@@ -0,0 +1,128 @@
{
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151646": {
      "content": "<|object_ref_start|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151647": {
      "content": "<|object_ref_end|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151648": {
      "content": "<|box_start|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151649": {
      "content": "<|box_end|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151650": {
      "content": "<|quad_start|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151651": {
      "content": "<|quad_end|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151652": {
      "content": "<|vision_start|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151653": {
      "content": "<|vision_end|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151654": {
      "content": "<|vision_pad|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151655": {
      "content": "<|image_pad|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    },
    "151656": {
      "content": "<|video_pad|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": false
    }
  },
  "bos_token": null,
  "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}{% if system_message is defined %}{{ 'System: ' + system_message + '<|endoftext|>' + '\n' }}{% endif %}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ 'Human: ' + content + '<|endoftext|>' + '\nAssistant:' }}{% elif message['role'] == 'assistant' %}{{ content + '<|endoftext|>' + '\n' }}{% endif %}{% endfor %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "extra_special_tokens": {},
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<|endoftext|>",
  "padding_side": "right",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": "<|endoftext|>"
}
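The chat_template above appears to be a plain "System:/Human:/Assistant:" Jinja format delimited by <|endoftext|> rather than the <|im_start|> style the added tokens would suggest. A short sketch of how it renders (the adapter path and message texts are illustrative):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("saves/Custom/lora/train_2025-04-01-09-06-36")
    messages = [
        {"role": "system", "content": "You are an OCR assistant."},
        {"role": "user", "content": "Transcribe the text in this image."},
    ]
    # Per the template, this should print:
    #   System: You are an OCR assistant.<|endoftext|>
    #   Human: Transcribe the text in this image.<|endoftext|>
    #   Assistant:
    print(tok.apply_chat_template(messages, tokenize=False))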
train_results.json
ADDED
@@ -0,0 +1,9 @@
{
  "epoch": 2.9932279909706545,
  "num_input_tokens_seen": 1157808,
  "total_flos": 1.3788411572404224e+16,
  "train_loss": 0.939127180590687,
  "train_runtime": 10484.6402,
  "train_samples_per_second": 0.761,
  "train_steps_per_second": 0.095
}
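As a quick arithmetic check, the summary above is internally consistent: the ratio of samples per second to steps per second implies an effective batch of about 8 samples per optimizer step, and 996 total steps (from trainer_log.jsonl below) over the reported runtime reproduces both throughput figures:

    # Consistency check; 996 total steps is taken from trainer_log.jsonl below.
    runtime = 10484.6402
    print(0.761 / 0.095)      # ~8 samples per optimizer step
    print(996 * 8 / runtime)  # ~0.76, matches train_samples_per_second
    print(996 / runtime)      # ~0.095, matches train_steps_per_second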
trainer_log.jsonl
ADDED
@@ -0,0 +1,200 @@
{"current_steps": 5, "total_steps": 996, "loss": 2.4707, "lr": 4.9996890990217804e-05, "epoch": 0.015048908954100828, "percentage": 0.5, "elapsed_time": "0:00:54", "remaining_time": "2:59:14", "throughput": 108.07, "total_tokens": 5864}
{"current_steps": 10, "total_steps": 996, "loss": 2.2509, "lr": 4.9987564734146566e-05, "epoch": 0.030097817908201655, "percentage": 1.0, "elapsed_time": "0:01:47", "remaining_time": "2:57:04", "throughput": 106.09, "total_tokens": 11432}
{"current_steps": 15, "total_steps": 996, "loss": 1.6895, "lr": 4.997202355141999e-05, "epoch": 0.045146726862302484, "percentage": 1.51, "elapsed_time": "0:02:40", "remaining_time": "2:54:28", "throughput": 106.2, "total_tokens": 17000}
{"current_steps": 20, "total_steps": 996, "loss": 1.4876, "lr": 4.995027130745321e-05, "epoch": 0.06019563581640331, "percentage": 2.01, "elapsed_time": "0:03:33", "remaining_time": "2:53:25", "throughput": 107.12, "total_tokens": 22840}
{"current_steps": 25, "total_steps": 996, "loss": 1.4812, "lr": 4.992231341248137e-05, "epoch": 0.07524454477050414, "percentage": 2.51, "elapsed_time": "0:04:27", "remaining_time": "2:53:07", "throughput": 108.37, "total_tokens": 28984}
{"current_steps": 30, "total_steps": 996, "loss": 1.3642, "lr": 4.9888156820213974e-05, "epoch": 0.09029345372460497, "percentage": 3.01, "elapsed_time": "0:05:20", "remaining_time": "2:51:47", "throughput": 108.89, "total_tokens": 34856}
{"current_steps": 35, "total_steps": 996, "loss": 1.3651, "lr": 4.9847810026105394e-05, "epoch": 0.1053423626787058, "percentage": 3.51, "elapsed_time": "0:06:15", "remaining_time": "2:51:37", "throughput": 109.89, "total_tokens": 41216}
{"current_steps": 40, "total_steps": 996, "loss": 1.1321, "lr": 4.980128306524183e-05, "epoch": 0.12039127163280662, "percentage": 4.02, "elapsed_time": "0:07:09", "remaining_time": "2:50:57", "throughput": 110.22, "total_tokens": 47304}
{"current_steps": 45, "total_steps": 996, "loss": 1.3012, "lr": 4.97485875098454e-05, "epoch": 0.13544018058690746, "percentage": 4.52, "elapsed_time": "0:08:02", "remaining_time": "2:49:52", "throughput": 110.28, "total_tokens": 53184}
{"current_steps": 50, "total_steps": 996, "loss": 0.9827, "lr": 4.968973646639589e-05, "epoch": 0.1504890895410083, "percentage": 5.02, "elapsed_time": "0:08:55", "remaining_time": "2:48:50", "throughput": 110.24, "total_tokens": 59024}
{"current_steps": 55, "total_steps": 996, "loss": 1.2313, "lr": 4.9624744572370865e-05, "epoch": 0.1655379984951091, "percentage": 5.52, "elapsed_time": "0:09:49", "remaining_time": "2:47:59", "throughput": 110.02, "total_tokens": 64816}
{"current_steps": 60, "total_steps": 996, "loss": 1.0347, "lr": 4.9553627992605066e-05, "epoch": 0.18058690744920994, "percentage": 6.02, "elapsed_time": "0:10:43", "remaining_time": "2:47:18", "throughput": 110.1, "total_tokens": 70848}
{"current_steps": 65, "total_steps": 996, "loss": 1.0422, "lr": 4.947640441526989e-05, "epoch": 0.19563581640331076, "percentage": 6.53, "elapsed_time": "0:11:37", "remaining_time": "2:46:23", "throughput": 110.31, "total_tokens": 76888}
{"current_steps": 70, "total_steps": 996, "loss": 0.9996, "lr": 4.939309304747391e-05, "epoch": 0.2106847253574116, "percentage": 7.03, "elapsed_time": "0:12:30", "remaining_time": "2:45:29", "throughput": 110.36, "total_tokens": 82840}
{"current_steps": 75, "total_steps": 996, "loss": 1.0755, "lr": 4.930371461048571e-05, "epoch": 0.22573363431151242, "percentage": 7.53, "elapsed_time": "0:13:24", "remaining_time": "2:44:38", "throughput": 110.41, "total_tokens": 88824}
{"current_steps": 80, "total_steps": 996, "loss": 1.026, "lr": 4.9208291334580104e-05, "epoch": 0.24078254326561324, "percentage": 8.03, "elapsed_time": "0:14:15", "remaining_time": "2:43:18", "throughput": 110.15, "total_tokens": 94264}
{"current_steps": 85, "total_steps": 996, "loss": 1.1307, "lr": 4.910684695350895e-05, "epoch": 0.2558314522197141, "percentage": 8.53, "elapsed_time": "0:15:07", "remaining_time": "2:42:09", "throughput": 110.04, "total_tokens": 99896}
{"current_steps": 90, "total_steps": 996, "loss": 1.0221, "lr": 4.8999406698598074e-05, "epoch": 0.2708803611738149, "percentage": 9.04, "elapsed_time": "0:16:00", "remaining_time": "2:41:13", "throughput": 109.93, "total_tokens": 105640}
{"current_steps": 95, "total_steps": 996, "loss": 1.012, "lr": 4.8885997292471774e-05, "epoch": 0.28592927012791575, "percentage": 9.54, "elapsed_time": "0:16:52", "remaining_time": "2:39:58", "throughput": 109.96, "total_tokens": 111280}
{"current_steps": 100, "total_steps": 996, "loss": 1.0151, "lr": 4.87666469424063e-05, "epoch": 0.3009781790820166, "percentage": 10.04, "elapsed_time": "0:17:42", "remaining_time": "2:38:41", "throughput": 109.77, "total_tokens": 116640}
{"current_steps": 105, "total_steps": 996, "loss": 1.0028, "lr": 4.86413853333141e-05, "epoch": 0.3160270880361174, "percentage": 10.54, "elapsed_time": "0:18:34", "remaining_time": "2:37:40", "throughput": 109.3, "total_tokens": 121864}
{"current_steps": 110, "total_steps": 996, "loss": 1.143, "lr": 4.851024362036064e-05, "epoch": 0.3310759969902182, "percentage": 11.04, "elapsed_time": "0:19:26", "remaining_time": "2:36:35", "throughput": 109.2, "total_tokens": 127384}
{"current_steps": 115, "total_steps": 996, "loss": 0.9695, "lr": 4.837325442121538e-05, "epoch": 0.34612490594431905, "percentage": 11.55, "elapsed_time": "0:20:18", "remaining_time": "2:35:32", "throughput": 109.18, "total_tokens": 133008}
{"current_steps": 120, "total_steps": 996, "loss": 0.9017, "lr": 4.8230451807939135e-05, "epoch": 0.3611738148984199, "percentage": 12.05, "elapsed_time": "0:21:12", "remaining_time": "2:34:49", "throughput": 109.34, "total_tokens": 139144}
{"current_steps": 125, "total_steps": 996, "loss": 1.035, "lr": 4.808187129850963e-05, "epoch": 0.3762227238525207, "percentage": 12.55, "elapsed_time": "0:22:05", "remaining_time": "2:33:55", "throughput": 109.29, "total_tokens": 144848}
{"current_steps": 130, "total_steps": 996, "loss": 1.0128, "lr": 4.792754984798745e-05, "epoch": 0.3912716328066215, "percentage": 13.05, "elapsed_time": "0:22:57", "remaining_time": "2:32:55", "throughput": 109.25, "total_tokens": 150480}
{"current_steps": 135, "total_steps": 996, "loss": 0.9432, "lr": 4.776752583932454e-05, "epoch": 0.40632054176072235, "percentage": 13.55, "elapsed_time": "0:23:50", "remaining_time": "2:32:00", "throughput": 109.32, "total_tokens": 156336}
{"current_steps": 140, "total_steps": 996, "loss": 1.0344, "lr": 4.760183907381757e-05, "epoch": 0.4213694507148232, "percentage": 14.06, "elapsed_time": "0:24:43", "remaining_time": "2:31:10", "throughput": 109.5, "total_tokens": 162440}
{"current_steps": 145, "total_steps": 996, "loss": 0.9452, "lr": 4.7430530761208494e-05, "epoch": 0.436418359668924, "percentage": 14.56, "elapsed_time": "0:25:36", "remaining_time": "2:30:16", "throughput": 109.55, "total_tokens": 168304}
{"current_steps": 150, "total_steps": 996, "loss": 0.9559, "lr": 4.725364350943492e-05, "epoch": 0.45146726862302483, "percentage": 15.06, "elapsed_time": "0:26:28", "remaining_time": "2:29:18", "throughput": 109.53, "total_tokens": 173984}
{"current_steps": 155, "total_steps": 996, "loss": 0.9726, "lr": 4.707122131403251e-05, "epoch": 0.46651617757712566, "percentage": 15.56, "elapsed_time": "0:27:21", "remaining_time": "2:28:25", "throughput": 109.6, "total_tokens": 179896}
{"current_steps": 160, "total_steps": 996, "loss": 0.9344, "lr": 4.6883309547192476e-05, "epoch": 0.4815650865312265, "percentage": 16.06, "elapsed_time": "0:28:12", "remaining_time": "2:27:23", "throughput": 109.48, "total_tokens": 185296}
{"current_steps": 165, "total_steps": 996, "loss": 0.9497, "lr": 4.668995494647653e-05, "epoch": 0.4966139954853273, "percentage": 16.57, "elapsed_time": "0:29:05", "remaining_time": "2:26:29", "throughput": 109.4, "total_tokens": 190928}
{"current_steps": 170, "total_steps": 996, "loss": 1.057, "lr": 4.649120560319225e-05, "epoch": 0.5116629044394282, "percentage": 17.07, "elapsed_time": "0:30:00", "remaining_time": "2:25:49", "throughput": 109.59, "total_tokens": 197352}
{"current_steps": 175, "total_steps": 996, "loss": 0.9847, "lr": 4.6287110950431865e-05, "epoch": 0.526711813393529, "percentage": 17.57, "elapsed_time": "0:30:53", "remaining_time": "2:24:54", "throughput": 109.65, "total_tokens": 203216}
{"current_steps": 180, "total_steps": 996, "loss": 1.001, "lr": 4.607772175077711e-05, "epoch": 0.5417607223476298, "percentage": 18.07, "elapsed_time": "0:31:44", "remaining_time": "2:23:55", "throughput": 109.52, "total_tokens": 208624}
{"current_steps": 185, "total_steps": 996, "loss": 0.9384, "lr": 4.586309008367359e-05, "epoch": 0.5568096313017307, "percentage": 18.57, "elapsed_time": "0:32:38", "remaining_time": "2:23:04", "throughput": 109.56, "total_tokens": 214552}
{"current_steps": 190, "total_steps": 996, "loss": 1.0312, "lr": 4.564326933247752e-05, "epoch": 0.5718585402558315, "percentage": 19.08, "elapsed_time": "0:33:32", "remaining_time": "2:22:16", "throughput": 109.68, "total_tokens": 220704}
{"current_steps": 195, "total_steps": 996, "loss": 0.9112, "lr": 4.541831417117815e-05, "epoch": 0.5869074492099323, "percentage": 19.58, "elapsed_time": "0:34:24", "remaining_time": "2:21:20", "throughput": 109.7, "total_tokens": 226480}
{"current_steps": 200, "total_steps": 996, "loss": 0.9967, "lr": 4.518828055079925e-05, "epoch": 0.6019563581640331, "percentage": 20.08, "elapsed_time": "0:35:16", "remaining_time": "2:20:25", "throughput": 109.66, "total_tokens": 232136}
{"current_steps": 205, "total_steps": 996, "loss": 1.0905, "lr": 4.4953225685482904e-05, "epoch": 0.617005267118134, "percentage": 20.58, "elapsed_time": "0:36:11", "remaining_time": "2:19:38", "throughput": 109.64, "total_tokens": 238072}
{"current_steps": 210, "total_steps": 996, "loss": 0.9487, "lr": 4.471320803825915e-05, "epoch": 0.6320541760722348, "percentage": 21.08, "elapsed_time": "0:37:04", "remaining_time": "2:18:44", "throughput": 109.56, "total_tokens": 243680}
{"current_steps": 215, "total_steps": 996, "loss": 0.8675, "lr": 4.4468287306505045e-05, "epoch": 0.6471030850263356, "percentage": 21.59, "elapsed_time": "0:37:56", "remaining_time": "2:17:48", "throughput": 109.56, "total_tokens": 249376}
|
| 44 |
+
{"current_steps": 220, "total_steps": 996, "loss": 0.8624, "lr": 4.421852440709666e-05, "epoch": 0.6621519939804364, "percentage": 22.09, "elapsed_time": "0:38:48", "remaining_time": "2:16:54", "throughput": 109.61, "total_tokens": 255288}
|
| 45 |
+
{"current_steps": 225, "total_steps": 996, "loss": 1.0489, "lr": 4.39639814612578e-05, "epoch": 0.6772009029345373, "percentage": 22.59, "elapsed_time": "0:39:43", "remaining_time": "2:16:08", "throughput": 109.74, "total_tokens": 261592}
|
| 46 |
+
{"current_steps": 230, "total_steps": 996, "loss": 0.9139, "lr": 4.370472177910914e-05, "epoch": 0.6922498118886381, "percentage": 23.09, "elapsed_time": "0:40:36", "remaining_time": "2:15:13", "throughput": 109.68, "total_tokens": 267192}
|
| 47 |
+
{"current_steps": 235, "total_steps": 996, "loss": 0.9905, "lr": 4.3440809843921725e-05, "epoch": 0.7072987208427389, "percentage": 23.59, "elapsed_time": "0:41:27", "remaining_time": "2:14:16", "throughput": 109.62, "total_tokens": 272712}
|
| 48 |
+
{"current_steps": 240, "total_steps": 996, "loss": 0.8974, "lr": 4.3172311296078595e-05, "epoch": 0.7223476297968398, "percentage": 24.1, "elapsed_time": "0:42:21", "remaining_time": "2:13:25", "throughput": 109.66, "total_tokens": 278720}
|
| 49 |
+
{"current_steps": 245, "total_steps": 996, "loss": 0.999, "lr": 4.28992929167487e-05, "epoch": 0.7373965387509406, "percentage": 24.6, "elapsed_time": "0:43:14", "remaining_time": "2:12:33", "throughput": 109.68, "total_tokens": 284584}
|
| 50 |
+
{"current_steps": 250, "total_steps": 996, "loss": 0.9916, "lr": 4.2621822611277e-05, "epoch": 0.7524454477050414, "percentage": 25.1, "elapsed_time": "0:44:08", "remaining_time": "2:11:42", "throughput": 109.66, "total_tokens": 290408}
|
| 51 |
+
{"current_steps": 255, "total_steps": 996, "loss": 0.9242, "lr": 4.233996939229502e-05, "epoch": 0.7674943566591422, "percentage": 25.6, "elapsed_time": "0:45:00", "remaining_time": "2:10:46", "throughput": 109.54, "total_tokens": 295776}
|
| 52 |
+
{"current_steps": 260, "total_steps": 996, "loss": 1.0426, "lr": 4.205380336255594e-05, "epoch": 0.782543265613243, "percentage": 26.1, "elapsed_time": "0:45:54", "remaining_time": "2:09:55", "throughput": 109.56, "total_tokens": 301736}
|
| 53 |
+
{"current_steps": 265, "total_steps": 996, "loss": 0.8625, "lr": 4.176339569749865e-05, "epoch": 0.7975921745673439, "percentage": 26.61, "elapsed_time": "0:46:45", "remaining_time": "2:08:59", "throughput": 109.5, "total_tokens": 307224}
|
| 54 |
+
{"current_steps": 270, "total_steps": 996, "loss": 0.9959, "lr": 4.1468818627544845e-05, "epoch": 0.8126410835214447, "percentage": 27.11, "elapsed_time": "0:47:38", "remaining_time": "2:08:06", "throughput": 109.51, "total_tokens": 313040}
|
| 55 |
+
{"current_steps": 275, "total_steps": 996, "loss": 0.939, "lr": 4.11701454201339e-05, "epoch": 0.8276899924755455, "percentage": 27.61, "elapsed_time": "0:48:32", "remaining_time": "2:07:15", "throughput": 109.57, "total_tokens": 319112}
|
| 56 |
+
{"current_steps": 280, "total_steps": 996, "loss": 0.9741, "lr": 4.08674503614997e-05, "epoch": 0.8427389014296464, "percentage": 28.11, "elapsed_time": "0:49:25", "remaining_time": "2:06:22", "throughput": 109.61, "total_tokens": 325040}
|
| 57 |
+
{"current_steps": 285, "total_steps": 996, "loss": 0.98, "lr": 4.0560808738194114e-05, "epoch": 0.8577878103837472, "percentage": 28.61, "elapsed_time": "0:50:18", "remaining_time": "2:05:30", "throughput": 109.62, "total_tokens": 330904}
|
| 58 |
+
{"current_steps": 290, "total_steps": 996, "loss": 0.8898, "lr": 4.0250296818361647e-05, "epoch": 0.872836719337848, "percentage": 29.12, "elapsed_time": "0:51:10", "remaining_time": "2:04:35", "throughput": 109.54, "total_tokens": 336392}
|
| 59 |
+
{"current_steps": 295, "total_steps": 996, "loss": 0.953, "lr": 3.993599183277001e-05, "epoch": 0.8878856282919488, "percentage": 29.62, "elapsed_time": "0:52:06", "remaining_time": "2:03:48", "throughput": 109.67, "total_tokens": 342832}
|
| 60 |
+
{"current_steps": 300, "total_steps": 996, "loss": 0.9311, "lr": 3.961797195560118e-05, "epoch": 0.9029345372460497, "percentage": 30.12, "elapsed_time": "0:52:59", "remaining_time": "2:02:56", "throughput": 109.75, "total_tokens": 348944}
|
| 61 |
+
{"current_steps": 305, "total_steps": 996, "loss": 0.9114, "lr": 3.9296316285007887e-05, "epoch": 0.9179834462001505, "percentage": 30.62, "elapsed_time": "0:53:52", "remaining_time": "2:02:03", "throughput": 109.73, "total_tokens": 354680}
|
| 62 |
+
{"current_steps": 310, "total_steps": 996, "loss": 0.9674, "lr": 3.897110482344024e-05, "epoch": 0.9330323551542513, "percentage": 31.12, "elapsed_time": "0:54:46", "remaining_time": "2:01:12", "throughput": 109.85, "total_tokens": 361008}
|
| 63 |
+
{"current_steps": 315, "total_steps": 996, "loss": 0.9582, "lr": 3.864241845774746e-05, "epoch": 0.9480812641083521, "percentage": 31.63, "elapsed_time": "0:55:38", "remaining_time": "2:00:17", "throughput": 109.86, "total_tokens": 366760}
|
| 64 |
+
{"current_steps": 320, "total_steps": 996, "loss": 0.9863, "lr": 3.8310338939059644e-05, "epoch": 0.963130173062453, "percentage": 32.13, "elapsed_time": "0:56:30", "remaining_time": "1:59:23", "throughput": 109.84, "total_tokens": 372448}
|
| 65 |
+
{"current_steps": 325, "total_steps": 996, "loss": 0.906, "lr": 3.797494886245456e-05, "epoch": 0.9781790820165538, "percentage": 32.63, "elapsed_time": "0:57:24", "remaining_time": "1:58:31", "throughput": 109.89, "total_tokens": 378520}
|
| 66 |
+
{"current_steps": 330, "total_steps": 996, "loss": 0.8958, "lr": 3.7636331646414524e-05, "epoch": 0.9932279909706546, "percentage": 33.13, "elapsed_time": "0:58:17", "remaining_time": "1:57:38", "throughput": 109.87, "total_tokens": 384272}
|
| 67 |
+
{"current_steps": 335, "total_steps": 996, "loss": 0.8349, "lr": 3.7294571512078506e-05, "epoch": 1.0060195635816402, "percentage": 33.63, "elapsed_time": "0:59:02", "remaining_time": "1:56:30", "throughput": 109.89, "total_tokens": 389280}
|
| 68 |
+
{"current_steps": 340, "total_steps": 996, "loss": 0.8507, "lr": 3.694975346229458e-05, "epoch": 1.021068472535741, "percentage": 34.14, "elapsed_time": "0:59:54", "remaining_time": "1:55:36", "throughput": 109.86, "total_tokens": 394944}
|
| 69 |
+
{"current_steps": 345, "total_steps": 996, "loss": 0.9287, "lr": 3.6601963260477924e-05, "epoch": 1.036117381489842, "percentage": 34.64, "elapsed_time": "1:00:47", "remaining_time": "1:54:42", "throughput": 109.88, "total_tokens": 400800}
|
| 70 |
+
{"current_steps": 350, "total_steps": 996, "loss": 0.9107, "lr": 3.625128740927971e-05, "epoch": 1.0511662904439427, "percentage": 35.14, "elapsed_time": "1:01:40", "remaining_time": "1:53:50", "throughput": 109.91, "total_tokens": 406728}
|
| 71 |
+
{"current_steps": 355, "total_steps": 996, "loss": 0.952, "lr": 3.589781312907207e-05, "epoch": 1.0662151993980435, "percentage": 35.64, "elapsed_time": "1:02:33", "remaining_time": "1:52:57", "throughput": 109.93, "total_tokens": 412656}
|
| 72 |
+
{"current_steps": 360, "total_steps": 996, "loss": 0.9526, "lr": 3.55416283362546e-05, "epoch": 1.0812641083521444, "percentage": 36.14, "elapsed_time": "1:03:27", "remaining_time": "1:52:05", "throughput": 109.92, "total_tokens": 418488}
|
| 73 |
+
{"current_steps": 365, "total_steps": 996, "loss": 0.8775, "lr": 3.518282162138772e-05, "epoch": 1.0963130173062452, "percentage": 36.65, "elapsed_time": "1:04:19", "remaining_time": "1:51:12", "throughput": 109.9, "total_tokens": 424192}
|
| 74 |
+
{"current_steps": 370, "total_steps": 996, "loss": 0.883, "lr": 3.482148222715835e-05, "epoch": 1.111361926260346, "percentage": 37.15, "elapsed_time": "1:05:14", "remaining_time": "1:50:22", "throughput": 109.94, "total_tokens": 430312}
|
| 75 |
+
{"current_steps": 375, "total_steps": 996, "loss": 1.0032, "lr": 3.4457700026183374e-05, "epoch": 1.1264108352144468, "percentage": 37.65, "elapsed_time": "1:06:07", "remaining_time": "1:49:29", "throughput": 109.93, "total_tokens": 436128}
|
| 76 |
+
{"current_steps": 380, "total_steps": 996, "loss": 0.943, "lr": 3.409156549865654e-05, "epoch": 1.141459744168548, "percentage": 38.15, "elapsed_time": "1:06:59", "remaining_time": "1:48:36", "throughput": 109.94, "total_tokens": 441928}
|
| 77 |
+
{"current_steps": 385, "total_steps": 996, "loss": 0.801, "lr": 3.3723169709844026e-05, "epoch": 1.1565086531226485, "percentage": 38.65, "elapsed_time": "1:07:51", "remaining_time": "1:47:42", "throughput": 109.91, "total_tokens": 447560}
|
| 78 |
+
{"current_steps": 390, "total_steps": 996, "loss": 0.9294, "lr": 3.335260428743475e-05, "epoch": 1.1715575620767495, "percentage": 39.16, "elapsed_time": "1:08:44", "remaining_time": "1:46:48", "throughput": 109.91, "total_tokens": 453296}
|
| 79 |
+
{"current_steps": 395, "total_steps": 996, "loss": 0.9528, "lr": 3.297996139875055e-05, "epoch": 1.1866064710308502, "percentage": 39.66, "elapsed_time": "1:09:37", "remaining_time": "1:45:56", "throughput": 109.96, "total_tokens": 459336}
|
| 80 |
+
{"current_steps": 400, "total_steps": 996, "loss": 0.8981, "lr": 3.260533372782234e-05, "epoch": 1.2016553799849512, "percentage": 40.16, "elapsed_time": "1:10:29", "remaining_time": "1:45:01", "throughput": 109.93, "total_tokens": 464944}
|
| 81 |
+
{"current_steps": 405, "total_steps": 996, "loss": 0.9823, "lr": 3.222881445233759e-05, "epoch": 1.2167042889390518, "percentage": 40.66, "elapsed_time": "1:11:24", "remaining_time": "1:44:11", "throughput": 109.94, "total_tokens": 470992}
|
| 82 |
+
{"current_steps": 410, "total_steps": 996, "loss": 0.9047, "lr": 3.185049722046516e-05, "epoch": 1.2317531978931529, "percentage": 41.16, "elapsed_time": "1:12:14", "remaining_time": "1:43:15", "throughput": 109.86, "total_tokens": 476216}
|
| 83 |
+
{"current_steps": 415, "total_steps": 996, "loss": 0.8582, "lr": 3.147047612756302e-05, "epoch": 1.2468021068472535, "percentage": 41.67, "elapsed_time": "1:13:06", "remaining_time": "1:42:21", "throughput": 109.84, "total_tokens": 481824}
|
| 84 |
+
{"current_steps": 420, "total_steps": 996, "loss": 0.8787, "lr": 3.10888456927748e-05, "epoch": 1.2618510158013545, "percentage": 42.17, "elapsed_time": "1:13:58", "remaining_time": "1:41:27", "throughput": 109.85, "total_tokens": 487576}
|
| 85 |
+
{"current_steps": 425, "total_steps": 996, "loss": 0.8729, "lr": 3.0705700835520895e-05, "epoch": 1.276899924755455, "percentage": 42.67, "elapsed_time": "1:14:50", "remaining_time": "1:40:33", "throughput": 109.85, "total_tokens": 493336}
|
| 86 |
+
{"current_steps": 430, "total_steps": 996, "loss": 0.8772, "lr": 3.0321136851890036e-05, "epoch": 1.2919488337095562, "percentage": 43.17, "elapsed_time": "1:15:45", "remaining_time": "1:39:43", "throughput": 109.95, "total_tokens": 499760}
|
| 87 |
+
{"current_steps": 435, "total_steps": 996, "loss": 0.9451, "lr": 2.9935249390937183e-05, "epoch": 1.3069977426636568, "percentage": 43.67, "elapsed_time": "1:16:37", "remaining_time": "1:38:48", "throughput": 109.94, "total_tokens": 505400}
|
| 88 |
+
{"current_steps": 440, "total_steps": 996, "loss": 0.8202, "lr": 2.9548134430893604e-05, "epoch": 1.3220466516177578, "percentage": 44.18, "elapsed_time": "1:17:31", "remaining_time": "1:37:57", "throughput": 110.03, "total_tokens": 511760}
|
| 89 |
+
{"current_steps": 445, "total_steps": 996, "loss": 0.9773, "lr": 2.9159888255295116e-05, "epoch": 1.3370955605718584, "percentage": 44.68, "elapsed_time": "1:18:23", "remaining_time": "1:37:03", "throughput": 110.05, "total_tokens": 517616}
|
| 90 |
+
{"current_steps": 450, "total_steps": 996, "loss": 0.9101, "lr": 2.8770607429034352e-05, "epoch": 1.3521444695259595, "percentage": 45.18, "elapsed_time": "1:19:13", "remaining_time": "1:36:07", "throughput": 109.98, "total_tokens": 522744}
|
| 91 |
+
{"current_steps": 455, "total_steps": 996, "loss": 0.9633, "lr": 2.8380388774343047e-05, "epoch": 1.36719337848006, "percentage": 45.68, "elapsed_time": "1:20:05", "remaining_time": "1:35:13", "throughput": 110.01, "total_tokens": 528648}
|
| 92 |
+
{"current_steps": 460, "total_steps": 996, "loss": 0.8886, "lr": 2.7989329346710375e-05, "epoch": 1.382242287434161, "percentage": 46.18, "elapsed_time": "1:20:56", "remaining_time": "1:34:18", "throughput": 109.96, "total_tokens": 534000}
|
| 93 |
+
{"current_steps": 465, "total_steps": 996, "loss": 0.9258, "lr": 2.759752641074322e-05, "epoch": 1.3972911963882617, "percentage": 46.69, "elapsed_time": "1:21:48", "remaining_time": "1:33:24", "throughput": 109.96, "total_tokens": 539688}
|
| 94 |
+
{"current_steps": 470, "total_steps": 996, "loss": 0.9039, "lr": 2.7205077415974416e-05, "epoch": 1.4123401053423628, "percentage": 47.19, "elapsed_time": "1:22:38", "remaining_time": "1:32:29", "throughput": 109.93, "total_tokens": 545112}
|
| 95 |
+
{"current_steps": 475, "total_steps": 996, "loss": 1.0116, "lr": 2.6812079972625077e-05, "epoch": 1.4273890142964636, "percentage": 47.69, "elapsed_time": "1:23:32", "remaining_time": "1:31:37", "throughput": 109.99, "total_tokens": 551328}
|
| 96 |
+
{"current_steps": 480, "total_steps": 996, "loss": 0.8218, "lr": 2.6418631827326857e-05, "epoch": 1.4424379232505644, "percentage": 48.19, "elapsed_time": "1:24:23", "remaining_time": "1:30:42", "throughput": 109.97, "total_tokens": 556816}
|
| 97 |
+
{"current_steps": 485, "total_steps": 996, "loss": 0.8604, "lr": 2.602483083881035e-05, "epoch": 1.4574868322046652, "percentage": 48.69, "elapsed_time": "1:25:15", "remaining_time": "1:29:50", "throughput": 109.96, "total_tokens": 562552}
|
| 98 |
+
{"current_steps": 490, "total_steps": 996, "loss": 0.8044, "lr": 2.563077495356561e-05, "epoch": 1.472535741158766, "percentage": 49.2, "elapsed_time": "1:26:08", "remaining_time": "1:28:56", "throughput": 110.0, "total_tokens": 568480}
|
| 99 |
+
{"current_steps": 495, "total_steps": 996, "loss": 0.9198, "lr": 2.5236562181480794e-05, "epoch": 1.487584650112867, "percentage": 49.7, "elapsed_time": "1:26:59", "remaining_time": "1:28:03", "throughput": 109.98, "total_tokens": 574072}
|
| 100 |
+
{"current_steps": 500, "total_steps": 996, "loss": 0.9181, "lr": 2.484229057146507e-05, "epoch": 1.5026335590669677, "percentage": 50.2, "elapsed_time": "1:27:53", "remaining_time": "1:27:11", "throughput": 109.99, "total_tokens": 580040}
|
| 101 |
+
{"current_steps": 505, "total_steps": 996, "loss": 0.8644, "lr": 2.4448058187061835e-05, "epoch": 1.5176824680210683, "percentage": 50.7, "elapsed_time": "1:28:48", "remaining_time": "1:26:20", "throughput": 110.01, "total_tokens": 586128}
|
| 102 |
+
{"current_steps": 510, "total_steps": 996, "loss": 1.0127, "lr": 2.4053963082058244e-05, "epoch": 1.5327313769751694, "percentage": 51.2, "elapsed_time": "1:29:41", "remaining_time": "1:25:28", "throughput": 110.05, "total_tokens": 592256}
|
| 103 |
+
{"current_steps": 515, "total_steps": 996, "loss": 0.7937, "lr": 2.3660103276097232e-05, "epoch": 1.54778028592927, "percentage": 51.71, "elapsed_time": "1:30:32", "remaining_time": "1:24:33", "throughput": 110.02, "total_tokens": 597704}
|
| 104 |
+
{"current_steps": 520, "total_steps": 996, "loss": 0.9806, "lr": 2.3266576730297956e-05, "epoch": 1.562829194883371, "percentage": 52.21, "elapsed_time": "1:31:23", "remaining_time": "1:23:39", "throughput": 110.01, "total_tokens": 603240}
|
| 105 |
+
{"current_steps": 525, "total_steps": 996, "loss": 0.934, "lr": 2.2873481322890862e-05, "epoch": 1.5778781038374716, "percentage": 52.71, "elapsed_time": "1:32:18", "remaining_time": "1:22:48", "throughput": 110.08, "total_tokens": 609616}
|
| 106 |
+
{"current_steps": 530, "total_steps": 996, "loss": 0.9288, "lr": 2.2480914824873297e-05, "epoch": 1.5929270127915727, "percentage": 53.21, "elapsed_time": "1:33:11", "remaining_time": "1:21:56", "throughput": 110.09, "total_tokens": 615520}
|
| 107 |
+
{"current_steps": 535, "total_steps": 996, "loss": 0.8597, "lr": 2.2088974875691863e-05, "epoch": 1.6079759217456733, "percentage": 53.71, "elapsed_time": "1:34:03", "remaining_time": "1:21:03", "throughput": 110.07, "total_tokens": 621208}
|
| 108 |
+
{"current_steps": 540, "total_steps": 996, "loss": 0.8817, "lr": 2.1697758958957448e-05, "epoch": 1.6230248306997743, "percentage": 54.22, "elapsed_time": "1:34:56", "remaining_time": "1:20:10", "throughput": 110.1, "total_tokens": 627176}
|
| 109 |
+
{"current_steps": 545, "total_steps": 996, "loss": 0.777, "lr": 2.1307364378199005e-05, "epoch": 1.6380737396538751, "percentage": 54.72, "elapsed_time": "1:35:50", "remaining_time": "1:19:18", "throughput": 110.13, "total_tokens": 633248}
|
| 110 |
+
{"current_steps": 550, "total_steps": 996, "loss": 0.798, "lr": 2.0917888232662196e-05, "epoch": 1.653122648607976, "percentage": 55.22, "elapsed_time": "1:36:43", "remaining_time": "1:18:25", "throughput": 110.11, "total_tokens": 639000}
|
| 111 |
+
{"current_steps": 555, "total_steps": 996, "loss": 0.9104, "lr": 2.0529427393158705e-05, "epoch": 1.6681715575620768, "percentage": 55.72, "elapsed_time": "1:37:37", "remaining_time": "1:17:33", "throughput": 110.17, "total_tokens": 645280}
|
| 112 |
+
{"current_steps": 560, "total_steps": 996, "loss": 0.8293, "lr": 2.014207847797256e-05, "epoch": 1.6832204665161776, "percentage": 56.22, "elapsed_time": "1:38:31", "remaining_time": "1:16:42", "throughput": 110.26, "total_tokens": 651760}
|
| 113 |
+
{"current_steps": 565, "total_steps": 996, "loss": 0.8821, "lr": 1.9755937828829067e-05, "epoch": 1.6982693754702785, "percentage": 56.73, "elapsed_time": "1:39:22", "remaining_time": "1:15:48", "throughput": 110.23, "total_tokens": 657272}
|
| 114 |
+
{"current_steps": 570, "total_steps": 996, "loss": 0.8253, "lr": 1.937110148693265e-05, "epoch": 1.7133182844243793, "percentage": 57.23, "elapsed_time": "1:40:15", "remaining_time": "1:14:56", "throughput": 110.27, "total_tokens": 663336}
|
| 115 |
+
{"current_steps": 575, "total_steps": 996, "loss": 0.9391, "lr": 1.8987665169079454e-05, "epoch": 1.72836719337848, "percentage": 57.73, "elapsed_time": "1:41:07", "remaining_time": "1:14:02", "throughput": 110.25, "total_tokens": 668936}
|
| 116 |
+
{"current_steps": 580, "total_steps": 996, "loss": 0.8711, "lr": 1.8605724243850502e-05, "epoch": 1.743416102332581, "percentage": 58.23, "elapsed_time": "1:42:00", "remaining_time": "1:13:10", "throughput": 110.28, "total_tokens": 675000}
|
| 117 |
+
{"current_steps": 585, "total_steps": 996, "loss": 0.8346, "lr": 1.822537370789163e-05, "epoch": 1.7584650112866818, "percentage": 58.73, "elapsed_time": "1:42:52", "remaining_time": "1:12:16", "throughput": 110.26, "total_tokens": 680584}
|
| 118 |
+
{"current_steps": 590, "total_steps": 996, "loss": 0.8275, "lr": 1.7846708162285785e-05, "epoch": 1.7735139202407826, "percentage": 59.24, "elapsed_time": "1:43:44", "remaining_time": "1:11:23", "throughput": 110.27, "total_tokens": 686416}
|
| 119 |
+
{"current_steps": 595, "total_steps": 996, "loss": 0.9435, "lr": 1.7469821789023815e-05, "epoch": 1.7885628291948834, "percentage": 59.74, "elapsed_time": "1:44:35", "remaining_time": "1:10:29", "throughput": 110.27, "total_tokens": 692016}
|
| 120 |
+
{"current_steps": 600, "total_steps": 996, "loss": 0.8584, "lr": 1.70948083275794e-05, "epoch": 1.8036117381489842, "percentage": 60.24, "elapsed_time": "1:45:28", "remaining_time": "1:09:36", "throughput": 110.29, "total_tokens": 697984}
|
| 121 |
+
{"current_steps": 605, "total_steps": 996, "loss": 0.88, "lr": 1.672176105159417e-05, "epoch": 1.818660647103085, "percentage": 60.74, "elapsed_time": "1:46:23", "remaining_time": "1:08:45", "throughput": 110.3, "total_tokens": 704056}
|
| 122 |
+
{"current_steps": 610, "total_steps": 996, "loss": 0.8825, "lr": 1.635077274567854e-05, "epoch": 1.8337095560571859, "percentage": 61.24, "elapsed_time": "1:47:15", "remaining_time": "1:07:52", "throughput": 110.3, "total_tokens": 709760}
|
| 123 |
+
{"current_steps": 615, "total_steps": 996, "loss": 0.9978, "lr": 1.5981935682334264e-05, "epoch": 1.8487584650112867, "percentage": 61.75, "elapsed_time": "1:48:08", "remaining_time": "1:06:59", "throughput": 110.33, "total_tokens": 715872}
|
| 124 |
+
{"current_steps": 620, "total_steps": 996, "loss": 0.9626, "lr": 1.561534159900441e-05, "epoch": 1.8638073739653875, "percentage": 62.25, "elapsed_time": "1:49:02", "remaining_time": "1:06:07", "throughput": 110.38, "total_tokens": 722184}
|
| 125 |
+
{"current_steps": 625, "total_steps": 996, "loss": 0.9308, "lr": 1.525108167525624e-05, "epoch": 1.8788562829194884, "percentage": 62.75, "elapsed_time": "1:49:54", "remaining_time": "1:05:14", "throughput": 110.37, "total_tokens": 727776}
|
| 126 |
+
{"current_steps": 630, "total_steps": 996, "loss": 0.9757, "lr": 1.4889246510103077e-05, "epoch": 1.8939051918735892, "percentage": 63.25, "elapsed_time": "1:50:47", "remaining_time": "1:04:21", "throughput": 110.38, "total_tokens": 733760}
|
| 127 |
+
{"current_steps": 635, "total_steps": 996, "loss": 0.767, "lr": 1.4529926099470348e-05, "epoch": 1.90895410082769, "percentage": 63.76, "elapsed_time": "1:51:41", "remaining_time": "1:03:29", "throughput": 110.43, "total_tokens": 740024}
|
| 128 |
+
{"current_steps": 640, "total_steps": 996, "loss": 0.9272, "lr": 1.4173209813811788e-05, "epoch": 1.9240030097817908, "percentage": 64.26, "elapsed_time": "1:52:32", "remaining_time": "1:02:36", "throughput": 110.4, "total_tokens": 745480}
|
| 129 |
+
{"current_steps": 645, "total_steps": 996, "loss": 0.7941, "lr": 1.381918637588112e-05, "epoch": 1.9390519187358917, "percentage": 64.76, "elapsed_time": "1:53:25", "remaining_time": "1:01:43", "throughput": 110.41, "total_tokens": 751384}
|
| 130 |
+
{"current_steps": 650, "total_steps": 996, "loss": 0.8408, "lr": 1.3467943838664863e-05, "epoch": 1.9541008276899925, "percentage": 65.26, "elapsed_time": "1:54:17", "remaining_time": "1:00:50", "throughput": 110.38, "total_tokens": 756920}
|
| 131 |
+
{"current_steps": 655, "total_steps": 996, "loss": 0.8459, "lr": 1.311956956348177e-05, "epoch": 1.9691497366440933, "percentage": 65.76, "elapsed_time": "1:55:08", "remaining_time": "0:59:56", "throughput": 110.35, "total_tokens": 762424}
|
| 132 |
+
{"current_steps": 660, "total_steps": 996, "loss": 1.0117, "lr": 1.277415019825417e-05, "epoch": 1.9841986455981941, "percentage": 66.27, "elapsed_time": "1:56:01", "remaining_time": "0:59:03", "throughput": 110.36, "total_tokens": 768224}
|
| 133 |
+
{"current_steps": 665, "total_steps": 996, "loss": 0.9665, "lr": 1.2431771655956925e-05, "epoch": 1.999247554552295, "percentage": 66.77, "elapsed_time": "1:56:52", "remaining_time": "0:58:10", "throughput": 110.32, "total_tokens": 773568}
|
| 134 |
+
{"current_steps": 670, "total_steps": 996, "loss": 0.7625, "lr": 1.2092519093248988e-05, "epoch": 2.0120391271632805, "percentage": 67.27, "elapsed_time": "1:57:37", "remaining_time": "0:57:13", "throughput": 110.34, "total_tokens": 778672}
|
| 135 |
+
{"current_steps": 675, "total_steps": 996, "loss": 0.8667, "lr": 1.1756476889293269e-05, "epoch": 2.0270880361173815, "percentage": 67.77, "elapsed_time": "1:58:30", "remaining_time": "0:56:21", "throughput": 110.33, "total_tokens": 784488}
|
| 136 |
+
{"current_steps": 680, "total_steps": 996, "loss": 0.8297, "lr": 1.1423728624769695e-05, "epoch": 2.042136945071482, "percentage": 68.27, "elapsed_time": "1:59:22", "remaining_time": "0:55:28", "throughput": 110.34, "total_tokens": 790304}
|
| 137 |
+
{"current_steps": 685, "total_steps": 996, "loss": 0.8774, "lr": 1.1094357061087033e-05, "epoch": 2.057185854025583, "percentage": 68.78, "elapsed_time": "2:00:15", "remaining_time": "0:54:35", "throughput": 110.35, "total_tokens": 796192}
|
| 138 |
+
{"current_steps": 690, "total_steps": 996, "loss": 0.8476, "lr": 1.0768444119798357e-05, "epoch": 2.072234762979684, "percentage": 69.28, "elapsed_time": "2:01:07", "remaining_time": "0:53:43", "throughput": 110.37, "total_tokens": 802144}
|
| 139 |
+
{"current_steps": 695, "total_steps": 996, "loss": 0.8641, "lr": 1.0446070862225463e-05, "epoch": 2.087283671933785, "percentage": 69.78, "elapsed_time": "2:02:00", "remaining_time": "0:52:50", "throughput": 110.35, "total_tokens": 807768}
|
| 140 |
+
{"current_steps": 700, "total_steps": 996, "loss": 0.8383, "lr": 1.0127317469297277e-05, "epoch": 2.1023325808878854, "percentage": 70.28, "elapsed_time": "2:02:53", "remaining_time": "0:51:57", "throughput": 110.36, "total_tokens": 813712}
|
| 141 |
+
{"current_steps": 705, "total_steps": 996, "loss": 0.9123, "lr": 9.812263221607112e-06, "epoch": 2.1173814898419865, "percentage": 70.78, "elapsed_time": "2:03:46", "remaining_time": "0:51:05", "throughput": 110.32, "total_tokens": 819360}
|
| 142 |
+
{"current_steps": 710, "total_steps": 996, "loss": 0.9635, "lr": 9.500986479694036e-06, "epoch": 2.132430398796087, "percentage": 71.29, "elapsed_time": "2:04:37", "remaining_time": "0:50:11", "throughput": 110.28, "total_tokens": 824584}
|
| 143 |
+
{"current_steps": 715, "total_steps": 996, "loss": 0.9221, "lr": 9.19356466455287e-06, "epoch": 2.147479307750188, "percentage": 71.79, "elapsed_time": "2:05:30", "remaining_time": "0:49:19", "throughput": 110.3, "total_tokens": 830600}
|
| 144 |
+
{"current_steps": 720, "total_steps": 996, "loss": 0.8757, "lr": 8.890074238378074e-06, "epoch": 2.1625282167042887, "percentage": 72.29, "elapsed_time": "2:06:24", "remaining_time": "0:48:27", "throughput": 110.34, "total_tokens": 836856}
|
| 145 |
+
{"current_steps": 725, "total_steps": 996, "loss": 0.7958, "lr": 8.590590685545946e-06, "epoch": 2.17757712565839, "percentage": 72.79, "elapsed_time": "2:07:17", "remaining_time": "0:47:34", "throughput": 110.36, "total_tokens": 842872}
|
| 146 |
+
{"current_steps": 730, "total_steps": 996, "loss": 0.7993, "lr": 8.295188493840104e-06, "epoch": 2.1926260346124904, "percentage": 73.29, "elapsed_time": "2:08:10", "remaining_time": "0:46:42", "throughput": 110.36, "total_tokens": 848664}
|
| 147 |
+
{"current_steps": 735, "total_steps": 996, "loss": 0.8436, "lr": 8.003941135924858e-06, "epoch": 2.2076749435665914, "percentage": 73.8, "elapsed_time": "2:09:04", "remaining_time": "0:45:49", "throughput": 110.37, "total_tokens": 854712}
|
| 148 |
+
{"current_steps": 740, "total_steps": 996, "loss": 0.896, "lr": 7.71692105107098e-06, "epoch": 2.222723852520692, "percentage": 74.3, "elapsed_time": "2:09:56", "remaining_time": "0:44:57", "throughput": 110.38, "total_tokens": 860648}
|
| 149 |
+
{"current_steps": 745, "total_steps": 996, "loss": 0.8948, "lr": 7.434199627138602e-06, "epoch": 2.237772761474793, "percentage": 74.8, "elapsed_time": "2:10:48", "remaining_time": "0:44:04", "throughput": 110.35, "total_tokens": 866080}
|
| 150 |
+
{"current_steps": 750, "total_steps": 996, "loss": 0.8546, "lr": 7.155847182821523e-06, "epoch": 2.2528216704288937, "percentage": 75.3, "elapsed_time": "2:11:39", "remaining_time": "0:43:11", "throughput": 110.33, "total_tokens": 871560}
|
| 151 |
+
{"current_steps": 755, "total_steps": 996, "loss": 0.8494, "lr": 6.881932950157538e-06, "epoch": 2.2678705793829947, "percentage": 75.8, "elapsed_time": "2:12:32", "remaining_time": "0:42:18", "throughput": 110.35, "total_tokens": 877568}
|
| 152 |
+
{"current_steps": 760, "total_steps": 996, "loss": 0.7723, "lr": 6.612525057308949e-06, "epoch": 2.282919488337096, "percentage": 76.31, "elapsed_time": "2:13:25", "remaining_time": "0:41:26", "throughput": 110.4, "total_tokens": 883808}
|
| 153 |
+
{"current_steps": 765, "total_steps": 996, "loss": 0.9168, "lr": 6.347690511617693e-06, "epoch": 2.2979683972911964, "percentage": 76.81, "elapsed_time": "2:14:17", "remaining_time": "0:40:32", "throughput": 110.37, "total_tokens": 889296}
|
| 154 |
+
{"current_steps": 770, "total_steps": 996, "loss": 0.8831, "lr": 6.0874951829392234e-06, "epoch": 2.313017306245297, "percentage": 77.31, "elapsed_time": "2:15:09", "remaining_time": "0:39:40", "throughput": 110.38, "total_tokens": 895120}
|
| 155 |
+
{"current_steps": 775, "total_steps": 996, "loss": 0.854, "lr": 5.832003787259327e-06, "epoch": 2.328066215199398, "percentage": 77.81, "elapsed_time": "2:16:00", "remaining_time": "0:38:47", "throughput": 110.32, "total_tokens": 900320}
|
| 156 |
+
{"current_steps": 780, "total_steps": 996, "loss": 0.8843, "lr": 5.581279870597867e-06, "epoch": 2.343115124153499, "percentage": 78.31, "elapsed_time": "2:16:52", "remaining_time": "0:37:54", "throughput": 110.32, "total_tokens": 905928}
|
| 157 |
+
{"current_steps": 785, "total_steps": 996, "loss": 0.862, "lr": 5.335385793203604e-06, "epoch": 2.3581640331075997, "percentage": 78.82, "elapsed_time": "2:17:44", "remaining_time": "0:37:01", "throughput": 110.34, "total_tokens": 911976}
|
| 158 |
+
{"current_steps": 790, "total_steps": 996, "loss": 0.985, "lr": 5.094382714043907e-06, "epoch": 2.3732129420617003, "percentage": 79.32, "elapsed_time": "2:18:37", "remaining_time": "0:36:08", "throughput": 110.34, "total_tokens": 917840}
|
| 159 |
+
{"current_steps": 795, "total_steps": 996, "loss": 0.7679, "lr": 4.85833057559322e-06, "epoch": 2.3882618510158014, "percentage": 79.82, "elapsed_time": "2:19:28", "remaining_time": "0:35:15", "throughput": 110.31, "total_tokens": 923168}
|
| 160 |
+
{"current_steps": 800, "total_steps": 996, "loss": 0.8198, "lr": 4.627288088924156e-06, "epoch": 2.4033107599699024, "percentage": 80.32, "elapsed_time": "2:20:20", "remaining_time": "0:34:22", "throughput": 110.3, "total_tokens": 928720}
|
| 161 |
+
{"current_steps": 805, "total_steps": 996, "loss": 0.7773, "lr": 4.401312719104802e-06, "epoch": 2.418359668924003, "percentage": 80.82, "elapsed_time": "2:21:14", "remaining_time": "0:33:30", "throughput": 110.28, "total_tokens": 934568}
|
| 162 |
+
{"current_steps": 810, "total_steps": 996, "loss": 0.9312, "lr": 4.180460670905978e-06, "epoch": 2.4334085778781036, "percentage": 81.33, "elapsed_time": "2:22:06", "remaining_time": "0:32:38", "throughput": 110.27, "total_tokens": 940264}
|
| 163 |
+
{"current_steps": 815, "total_steps": 996, "loss": 0.8497, "lr": 3.964786874821955e-06, "epoch": 2.4484574868322047, "percentage": 81.83, "elapsed_time": "2:22:59", "remaining_time": "0:31:45", "throughput": 110.28, "total_tokens": 946128}
|
| 164 |
+
{"current_steps": 820, "total_steps": 996, "loss": 0.782, "lr": 3.754344973408064e-06, "epoch": 2.4635063957863057, "percentage": 82.33, "elapsed_time": "2:23:52", "remaining_time": "0:30:52", "throughput": 110.29, "total_tokens": 952032}
|
| 165 |
+
{"current_steps": 825, "total_steps": 996, "loss": 0.8937, "lr": 3.5491873079387256e-06, "epoch": 2.4785553047404063, "percentage": 82.83, "elapsed_time": "2:24:44", "remaining_time": "0:30:00", "throughput": 110.31, "total_tokens": 957960}
|
| 166 |
+
{"current_steps": 830, "total_steps": 996, "loss": 0.7039, "lr": 3.3493649053890326e-06, "epoch": 2.493604213694507, "percentage": 83.33, "elapsed_time": "2:25:38", "remaining_time": "0:29:07", "throughput": 110.36, "total_tokens": 964336}
|
| 167 |
+
{"current_steps": 835, "total_steps": 996, "loss": 0.9265, "lr": 3.1549274657433375e-06, "epoch": 2.508653122648608, "percentage": 83.84, "elapsed_time": "2:26:30", "remaining_time": "0:28:15", "throughput": 110.36, "total_tokens": 970168}
|
| 168 |
+
{"current_steps": 840, "total_steps": 996, "loss": 0.8669, "lr": 2.9659233496337786e-06, "epoch": 2.523702031602709, "percentage": 84.34, "elapsed_time": "2:27:21", "remaining_time": "0:27:22", "throughput": 110.36, "total_tokens": 975752}
|
| 169 |
+
{"current_steps": 845, "total_steps": 996, "loss": 0.9174, "lr": 2.7823995663120327e-06, "epoch": 2.5387509405568096, "percentage": 84.84, "elapsed_time": "2:28:14", "remaining_time": "0:26:29", "throughput": 110.37, "total_tokens": 981672}
|
| 170 |
+
{"current_steps": 850, "total_steps": 996, "loss": 0.8718, "lr": 2.6044017619571065e-06, "epoch": 2.55379984951091, "percentage": 85.34, "elapsed_time": "2:29:06", "remaining_time": "0:25:36", "throughput": 110.38, "total_tokens": 987560}
|
| 171 |
+
{"current_steps": 855, "total_steps": 996, "loss": 0.8634, "lr": 2.431974208322191e-06, "epoch": 2.5688487584650113, "percentage": 85.84, "elapsed_time": "2:29:58", "remaining_time": "0:24:44", "throughput": 110.37, "total_tokens": 993200}
|
| 172 |
+
{"current_steps": 860, "total_steps": 996, "loss": 0.845, "lr": 2.265159791723373e-06, "epoch": 2.5838976674191123, "percentage": 86.35, "elapsed_time": "2:30:52", "remaining_time": "0:23:51", "throughput": 110.38, "total_tokens": 999192}
|
| 173 |
+
{"current_steps": 865, "total_steps": 996, "loss": 0.8008, "lr": 2.104000002372886e-06, "epoch": 2.598946576373213, "percentage": 86.85, "elapsed_time": "2:31:42", "remaining_time": "0:22:58", "throughput": 110.36, "total_tokens": 1004576}
|
| 174 |
+
{"current_steps": 870, "total_steps": 996, "loss": 0.8797, "lr": 1.9485349240596613e-06, "epoch": 2.6139954853273135, "percentage": 87.35, "elapsed_time": "2:32:34", "remaining_time": "0:22:05", "throughput": 110.36, "total_tokens": 1010352}
|
| 175 |
+
{"current_steps": 875, "total_steps": 996, "loss": 0.946, "lr": 1.7988032241796376e-06, "epoch": 2.6290443942814146, "percentage": 87.85, "elapsed_time": "2:33:27", "remaining_time": "0:21:13", "throughput": 110.37, "total_tokens": 1016272}
|
| 176 |
+
{"current_steps": 880, "total_steps": 996, "loss": 0.8032, "lr": 1.6548421441183875e-06, "epoch": 2.6440933032355156, "percentage": 88.35, "elapsed_time": "2:34:19", "remaining_time": "0:20:20", "throughput": 110.36, "total_tokens": 1021896}
|
| 177 |
+
{"current_steps": 885, "total_steps": 996, "loss": 0.8892, "lr": 1.5166874899884053e-06, "epoch": 2.659142212189616, "percentage": 88.86, "elapsed_time": "2:35:11", "remaining_time": "0:19:27", "throughput": 110.37, "total_tokens": 1027704}
|
| 178 |
+
{"current_steps": 890, "total_steps": 996, "loss": 0.856, "lr": 1.3843736237233784e-06, "epoch": 2.674191121143717, "percentage": 89.36, "elapsed_time": "2:36:04", "remaining_time": "0:18:35", "throughput": 110.39, "total_tokens": 1033800}
|
| 179 |
+
{"current_steps": 895, "total_steps": 996, "loss": 0.8617, "lr": 1.2579334545316733e-06, "epoch": 2.689240030097818, "percentage": 89.86, "elapsed_time": "2:36:58", "remaining_time": "0:17:42", "throughput": 110.43, "total_tokens": 1040008}
|
| 180 |
+
{"current_steps": 900, "total_steps": 996, "loss": 0.9117, "lr": 1.137398430711123e-06, "epoch": 2.704288939051919, "percentage": 90.36, "elapsed_time": "2:37:52", "remaining_time": "0:16:50", "throughput": 110.46, "total_tokens": 1046272}
|
| 181 |
+
{"current_steps": 905, "total_steps": 996, "loss": 0.7855, "lr": 1.0227985318271682e-06, "epoch": 2.7193378480060195, "percentage": 90.86, "elapsed_time": "2:38:45", "remaining_time": "0:15:57", "throughput": 110.44, "total_tokens": 1052032}
|
| 182 |
+
{"current_steps": 910, "total_steps": 996, "loss": 0.8212, "lr": 9.141622612563571e-07, "epoch": 2.73438675696012, "percentage": 91.37, "elapsed_time": "2:39:37", "remaining_time": "0:15:05", "throughput": 110.42, "total_tokens": 1057584}
|
| 183 |
+
{"current_steps": 915, "total_steps": 996, "loss": 0.8404, "lr": 8.115166390969125e-07, "epoch": 2.749435665914221, "percentage": 91.87, "elapsed_time": "2:40:31", "remaining_time": "0:14:12", "throughput": 110.45, "total_tokens": 1063760}
|
| 184 |
+
{"current_steps": 920, "total_steps": 996, "loss": 0.7782, "lr": 7.148871954483105e-07, "epoch": 2.764484574868322, "percentage": 92.37, "elapsed_time": "2:41:23", "remaining_time": "0:13:19", "throughput": 110.45, "total_tokens": 1069544}
|
| 185 |
+
{"current_steps": 925, "total_steps": 996, "loss": 0.7847, "lr": 6.242979640613933e-07, "epoch": 2.779533483822423, "percentage": 92.87, "elapsed_time": "2:42:16", "remaining_time": "0:12:27", "throughput": 110.45, "total_tokens": 1075472}
|
| 186 |
+
{"current_steps": 930, "total_steps": 996, "loss": 0.8857, "lr": 5.397714763606843e-07, "epoch": 2.7945823927765234, "percentage": 93.37, "elapsed_time": "2:43:10", "remaining_time": "0:11:34", "throughput": 110.46, "total_tokens": 1081464}
|
| 187 |
+
{"current_steps": 935, "total_steps": 996, "loss": 0.8029, "lr": 4.613287558403512e-07, "epoch": 2.8096313017306245, "percentage": 93.88, "elapsed_time": "2:44:03", "remaining_time": "0:10:42", "throughput": 110.48, "total_tokens": 1087464}
|
| 188 |
+
{"current_steps": 940, "total_steps": 996, "loss": 0.8154, "lr": 3.8898931283523344e-07, "epoch": 2.8246802106847255, "percentage": 94.38, "elapsed_time": "2:44:54", "remaining_time": "0:09:49", "throughput": 110.45, "total_tokens": 1092888}
|
| 189 |
+
{"current_steps": 945, "total_steps": 996, "loss": 0.8791, "lr": 3.227711396682015e-07, "epoch": 2.839729119638826, "percentage": 94.88, "elapsed_time": "2:45:48", "remaining_time": "0:08:56", "throughput": 110.45, "total_tokens": 1098808}
|
| 190 |
+
{"current_steps": 950, "total_steps": 996, "loss": 0.787, "lr": 2.626907061751116e-07, "epoch": 2.854778028592927, "percentage": 95.38, "elapsed_time": "2:46:40", "remaining_time": "0:08:04", "throughput": 110.46, "total_tokens": 1104688}
|
| 191 |
+
{"current_steps": 955, "total_steps": 996, "loss": 0.8831, "lr": 2.0876295560839364e-07, "epoch": 2.869826937547028, "percentage": 95.88, "elapsed_time": "2:47:34", "remaining_time": "0:07:11", "throughput": 110.49, "total_tokens": 1110960}
|
| 192 |
+
{"current_steps": 960, "total_steps": 996, "loss": 0.7677, "lr": 1.6100130092037703e-07, "epoch": 2.884875846501129, "percentage": 96.39, "elapsed_time": "2:48:27", "remaining_time": "0:06:19", "throughput": 110.49, "total_tokens": 1116800}
|
| 193 |
+
{"current_steps": 965, "total_steps": 996, "loss": 0.7567, "lr": 1.194176214271897e-07, "epoch": 2.8999247554552294, "percentage": 96.89, "elapsed_time": "2:49:18", "remaining_time": "0:05:26", "throughput": 110.47, "total_tokens": 1122248}
|
| 194 |
+
{"current_steps": 970, "total_steps": 996, "loss": 0.8944, "lr": 8.402225985413848e-08, "epoch": 2.9149736644093305, "percentage": 97.39, "elapsed_time": "2:50:10", "remaining_time": "0:04:33", "throughput": 110.46, "total_tokens": 1127928}
|
| 195 |
+
{"current_steps": 975, "total_steps": 996, "loss": 0.9737, "lr": 5.4824019763252685e-08, "epoch": 2.930022573363431, "percentage": 97.89, "elapsed_time": "2:51:01", "remaining_time": "0:03:41", "throughput": 110.44, "total_tokens": 1133336}
|
| 196 |
+
{"current_steps": 980, "total_steps": 996, "loss": 0.8965, "lr": 3.1830163363655296e-08, "epoch": 2.945071482317532, "percentage": 98.39, "elapsed_time": "2:51:54", "remaining_time": "0:02:48", "throughput": 110.44, "total_tokens": 1139048}
|
| 197 |
+
{"current_steps": 985, "total_steps": 996, "loss": 0.837, "lr": 1.504640970531046e-08, "epoch": 2.9601203912716327, "percentage": 98.9, "elapsed_time": "2:52:45", "remaining_time": "0:01:55", "throughput": 110.41, "total_tokens": 1144456}
|
| 198 |
+
{"current_steps": 990, "total_steps": 996, "loss": 0.7812, "lr": 4.4769332565558485e-09, "epoch": 2.975169300225734, "percentage": 99.4, "elapsed_time": "2:53:37", "remaining_time": "0:01:03", "throughput": 110.4, "total_tokens": 1150160}
|
| 199 |
+
{"current_steps": 995, "total_steps": 996, "loss": 0.8613, "lr": 1.2436286584982527e-10, "epoch": 2.9902182091798344, "percentage": 99.9, "elapsed_time": "2:54:33", "remaining_time": "0:00:10", "throughput": 110.45, "total_tokens": 1156704}
|
| 200 |
+
{"current_steps": 996, "total_steps": 996, "epoch": 2.9932279909706545, "percentage": 100.0, "elapsed_time": "2:54:44", "remaining_time": "0:00:00", "throughput": 110.43, "total_tokens": 1157808}
|
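
The JSONL log above (trainer_log.jsonl) records one object per logging step: current_steps, loss, lr, epoch, elapsed/remaining time, throughput, and the cumulative token count; the final entry at step 996 is a summary and carries no loss or lr. The curve shipped as training_loss.png can be reproduced from this file. A minimal sketch in Python, assuming the log sits in the working directory and matplotlib is installed (the output filename is illustrative, not part of the repo):

import json

import matplotlib.pyplot as plt

# Each line of trainer_log.jsonl is one JSON object; the final summary
# entry (step 996) has no "loss" key, so it is filtered out below.
steps, losses, lrs = [], [], []
with open("trainer_log.jsonl", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        if "loss" in entry:
            steps.append(entry["current_steps"])
            losses.append(entry["loss"])
            lrs.append(entry["lr"])

fig, ax1 = plt.subplots()
ax1.plot(steps, losses, label="training loss")
ax1.set_xlabel("step")
ax1.set_ylabel("loss")

# Second y-axis for the cosine learning-rate schedule logged alongside the loss.
ax2 = ax1.twinx()
ax2.plot(steps, lrs, color="tab:orange", label="learning rate")
ax2.set_ylabel("learning rate")

fig.tight_layout()
fig.savefig("training_loss_reproduced.png")  # hypothetical output name

The same series also appears in trainer_state.json below under "log_history", keyed by "step" and "num_input_tokens_seen" rather than "current_steps" and "total_tokens".
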
trainer_state.json
ADDED
|
@@ -0,0 +1,1636 @@
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.9932279909706545,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 996,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.015048908954100828,
|
| 14 |
+
"grad_norm": 1.2988319396972656,
|
| 15 |
+
"learning_rate": 4.9996890990217804e-05,
|
| 16 |
+
"loss": 2.4707,
|
| 17 |
+
"num_input_tokens_seen": 5864,
|
| 18 |
+
"step": 5
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"epoch": 0.030097817908201655,
|
| 22 |
+
"grad_norm": 1.8058427572250366,
|
| 23 |
+
"learning_rate": 4.9987564734146566e-05,
|
| 24 |
+
"loss": 2.2509,
|
| 25 |
+
"num_input_tokens_seen": 11432,
|
| 26 |
+
"step": 10
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"epoch": 0.045146726862302484,
|
| 30 |
+
"grad_norm": 0.8231738209724426,
|
| 31 |
+
"learning_rate": 4.997202355141999e-05,
|
| 32 |
+
"loss": 1.6895,
|
| 33 |
+
"num_input_tokens_seen": 17000,
|
| 34 |
+
"step": 15
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"epoch": 0.06019563581640331,
|
| 38 |
+
"grad_norm": 0.7266705632209778,
|
| 39 |
+
"learning_rate": 4.995027130745321e-05,
|
| 40 |
+
"loss": 1.4876,
|
| 41 |
+
"num_input_tokens_seen": 22840,
|
| 42 |
+
"step": 20
|
| 43 |
+
},
|
| 44 |
+
{
|
| 45 |
+
"epoch": 0.07524454477050414,
|
| 46 |
+
"grad_norm": 1.1722582578659058,
|
| 47 |
+
"learning_rate": 4.992231341248137e-05,
|
| 48 |
+
"loss": 1.4812,
|
| 49 |
+
"num_input_tokens_seen": 28984,
|
| 50 |
+
"step": 25
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"epoch": 0.09029345372460497,
|
| 54 |
+
"grad_norm": 0.9262341260910034,
|
| 55 |
+
"learning_rate": 4.9888156820213974e-05,
|
| 56 |
+
"loss": 1.3642,
|
| 57 |
+
"num_input_tokens_seen": 34856,
|
| 58 |
+
"step": 30
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"epoch": 0.1053423626787058,
|
| 62 |
+
"grad_norm": 0.8832902908325195,
|
| 63 |
+
"learning_rate": 4.9847810026105394e-05,
|
| 64 |
+
"loss": 1.3651,
|
| 65 |
+
"num_input_tokens_seen": 41216,
|
| 66 |
+
"step": 35
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.12039127163280662,
|
| 70 |
+
"grad_norm": 0.8503655791282654,
|
| 71 |
+
"learning_rate": 4.980128306524183e-05,
|
| 72 |
+
"loss": 1.1321,
|
| 73 |
+
"num_input_tokens_seen": 47304,
|
| 74 |
+
"step": 40
|
| 75 |
+
},
|
| 76 |
+
{
|
| 77 |
+
"epoch": 0.13544018058690746,
|
| 78 |
+
"grad_norm": 1.348948359489441,
|
| 79 |
+
"learning_rate": 4.97485875098454e-05,
|
| 80 |
+
"loss": 1.3012,
|
| 81 |
+
"num_input_tokens_seen": 53184,
|
| 82 |
+
"step": 45
|
| 83 |
+
},
|
| 84 |
+
{
|
| 85 |
+
"epoch": 0.1504890895410083,
|
| 86 |
+
"grad_norm": 0.7177269458770752,
|
| 87 |
+
"learning_rate": 4.968973646639589e-05,
|
| 88 |
+
"loss": 0.9827,
|
| 89 |
+
"num_input_tokens_seen": 59024,
|
| 90 |
+
"step": 50
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"epoch": 0.1655379984951091,
|
| 94 |
+
"grad_norm": 0.6005258560180664,
|
| 95 |
+
"learning_rate": 4.9624744572370865e-05,
|
| 96 |
+
"loss": 1.2313,
|
| 97 |
+
"num_input_tokens_seen": 64816,
|
| 98 |
+
"step": 55
|
| 99 |
+
},
|
| 100 |
+
{
|
| 101 |
+
"epoch": 0.18058690744920994,
|
| 102 |
+
"grad_norm": 0.6153081059455872,
|
| 103 |
+
"learning_rate": 4.9553627992605066e-05,
|
| 104 |
+
"loss": 1.0347,
|
| 105 |
+
"num_input_tokens_seen": 70848,
|
| 106 |
+
"step": 60
|
| 107 |
+
},
|
| 108 |
+
{
|
| 109 |
+
"epoch": 0.19563581640331076,
|
| 110 |
+
"grad_norm": 0.7796200513839722,
|
| 111 |
+
"learning_rate": 4.947640441526989e-05,
|
| 112 |
+
"loss": 1.0422,
|
| 113 |
+
"num_input_tokens_seen": 76888,
|
| 114 |
+
"step": 65
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"epoch": 0.2106847253574116,
|
| 118 |
+
"grad_norm": 0.7273033857345581,
|
| 119 |
+
"learning_rate": 4.939309304747391e-05,
|
| 120 |
+
"loss": 0.9996,
|
| 121 |
+
"num_input_tokens_seen": 82840,
|
| 122 |
+
"step": 70
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.22573363431151242,
|
+      "grad_norm": 0.7943289875984192,
+      "learning_rate": 4.930371461048571e-05,
+      "loss": 1.0755,
+      "num_input_tokens_seen": 88824,
+      "step": 75
+    },
+    {"epoch": 0.24078254326561324, "grad_norm": 0.6128024458885193, "learning_rate": 4.9208291334580104e-05, "loss": 1.026, "num_input_tokens_seen": 94264, "step": 80},
+    {"epoch": 0.2558314522197141, "grad_norm": 0.7087495923042297, "learning_rate": 4.910684695350895e-05, "loss": 1.1307, "num_input_tokens_seen": 99896, "step": 85},
+    {"epoch": 0.2708803611738149, "grad_norm": 0.711476743221283, "learning_rate": 4.8999406698598074e-05, "loss": 1.0221, "num_input_tokens_seen": 105640, "step": 90},
+    {"epoch": 0.28592927012791575, "grad_norm": 0.5772566795349121, "learning_rate": 4.8885997292471774e-05, "loss": 1.012, "num_input_tokens_seen": 111280, "step": 95},
+    {"epoch": 0.3009781790820166, "grad_norm": 0.6769325137138367, "learning_rate": 4.87666469424063e-05, "loss": 1.0151, "num_input_tokens_seen": 116640, "step": 100},
+    {"epoch": 0.3160270880361174, "grad_norm": 0.679373025894165, "learning_rate": 4.86413853333141e-05, "loss": 1.0028, "num_input_tokens_seen": 121864, "step": 105},
+    {"epoch": 0.3310759969902182, "grad_norm": 0.9181504845619202, "learning_rate": 4.851024362036064e-05, "loss": 1.143, "num_input_tokens_seen": 127384, "step": 110},
+    {"epoch": 0.34612490594431905, "grad_norm": 0.7842696905136108, "learning_rate": 4.837325442121538e-05, "loss": 0.9695, "num_input_tokens_seen": 133008, "step": 115},
+    {"epoch": 0.3611738148984199, "grad_norm": 0.6459535360336304, "learning_rate": 4.8230451807939135e-05, "loss": 0.9017, "num_input_tokens_seen": 139144, "step": 120},
+    {"epoch": 0.3762227238525207, "grad_norm": 0.6695935726165771, "learning_rate": 4.808187129850963e-05, "loss": 1.035, "num_input_tokens_seen": 144848, "step": 125},
+    {"epoch": 0.3912716328066215, "grad_norm": 0.9289236664772034, "learning_rate": 4.792754984798745e-05, "loss": 1.0128, "num_input_tokens_seen": 150480, "step": 130},
+    {"epoch": 0.40632054176072235, "grad_norm": 0.6192979216575623, "learning_rate": 4.776752583932454e-05, "loss": 0.9432, "num_input_tokens_seen": 156336, "step": 135},
+    {"epoch": 0.4213694507148232, "grad_norm": 0.7946303486824036, "learning_rate": 4.760183907381757e-05, "loss": 1.0344, "num_input_tokens_seen": 162440, "step": 140},
+    {"epoch": 0.436418359668924, "grad_norm": 0.6548484563827515, "learning_rate": 4.7430530761208494e-05, "loss": 0.9452, "num_input_tokens_seen": 168304, "step": 145},
+    {"epoch": 0.45146726862302483, "grad_norm": 0.9075986742973328, "learning_rate": 4.725364350943492e-05, "loss": 0.9559, "num_input_tokens_seen": 173984, "step": 150},
+    {"epoch": 0.46651617757712566, "grad_norm": 0.8047800660133362, "learning_rate": 4.707122131403251e-05, "loss": 0.9726, "num_input_tokens_seen": 179896, "step": 155},
+    {"epoch": 0.4815650865312265, "grad_norm": 0.6954847574234009, "learning_rate": 4.6883309547192476e-05, "loss": 0.9344, "num_input_tokens_seen": 185296, "step": 160},
+    {"epoch": 0.4966139954853273, "grad_norm": 0.7912609577178955, "learning_rate": 4.668995494647653e-05, "loss": 0.9497, "num_input_tokens_seen": 190928, "step": 165},
+    {"epoch": 0.5116629044394282, "grad_norm": 0.7360678315162659, "learning_rate": 4.649120560319225e-05, "loss": 1.057, "num_input_tokens_seen": 197352, "step": 170},
+    {"epoch": 0.526711813393529, "grad_norm": 0.7325194478034973, "learning_rate": 4.6287110950431865e-05, "loss": 0.9847, "num_input_tokens_seen": 203216, "step": 175},
+    {"epoch": 0.5417607223476298, "grad_norm": 0.7140082120895386, "learning_rate": 4.607772175077711e-05, "loss": 1.001, "num_input_tokens_seen": 208624, "step": 180},
+    {"epoch": 0.5568096313017307, "grad_norm": 0.9454194903373718, "learning_rate": 4.586309008367359e-05, "loss": 0.9384, "num_input_tokens_seen": 214552, "step": 185},
+    {"epoch": 0.5718585402558315, "grad_norm": 0.9370235800743103, "learning_rate": 4.564326933247752e-05, "loss": 1.0312, "num_input_tokens_seen": 220704, "step": 190},
+    {"epoch": 0.5869074492099323, "grad_norm": 0.7274216413497925, "learning_rate": 4.541831417117815e-05, "loss": 0.9112, "num_input_tokens_seen": 226480, "step": 195},
+    {"epoch": 0.6019563581640331, "grad_norm": 0.9026529788970947, "learning_rate": 4.518828055079925e-05, "loss": 0.9967, "num_input_tokens_seen": 232136, "step": 200},
+    {"epoch": 0.617005267118134, "grad_norm": 0.9668667316436768, "learning_rate": 4.4953225685482904e-05, "loss": 1.0905, "num_input_tokens_seen": 238072, "step": 205},
+    {"epoch": 0.6320541760722348, "grad_norm": 0.7728851437568665, "learning_rate": 4.471320803825915e-05, "loss": 0.9487, "num_input_tokens_seen": 243680, "step": 210},
+    {"epoch": 0.6471030850263356, "grad_norm": 0.7141396999359131, "learning_rate": 4.4468287306505045e-05, "loss": 0.8675, "num_input_tokens_seen": 249376, "step": 215},
+    {"epoch": 0.6621519939804364, "grad_norm": 0.7524191737174988, "learning_rate": 4.421852440709666e-05, "loss": 0.8624, "num_input_tokens_seen": 255288, "step": 220},
+    {"epoch": 0.6772009029345373, "grad_norm": 1.1502355337142944, "learning_rate": 4.39639814612578e-05, "loss": 1.0489, "num_input_tokens_seen": 261592, "step": 225},
+    {"epoch": 0.6922498118886381, "grad_norm": 0.7467320561408997, "learning_rate": 4.370472177910914e-05, "loss": 0.9139, "num_input_tokens_seen": 267192, "step": 230},
+    {"epoch": 0.7072987208427389, "grad_norm": 0.6400129795074463, "learning_rate": 4.3440809843921725e-05, "loss": 0.9905, "num_input_tokens_seen": 272712, "step": 235},
+    {"epoch": 0.7223476297968398, "grad_norm": 0.6654481291770935, "learning_rate": 4.3172311296078595e-05, "loss": 0.8974, "num_input_tokens_seen": 278720, "step": 240},
+    {"epoch": 0.7373965387509406, "grad_norm": 0.7487585544586182, "learning_rate": 4.28992929167487e-05, "loss": 0.999, "num_input_tokens_seen": 284584, "step": 245},
+    {"epoch": 0.7524454477050414, "grad_norm": 0.6885581612586975, "learning_rate": 4.2621822611277e-05, "loss": 0.9916, "num_input_tokens_seen": 290408, "step": 250},
+    {"epoch": 0.7674943566591422, "grad_norm": 0.774027407169342, "learning_rate": 4.233996939229502e-05, "loss": 0.9242, "num_input_tokens_seen": 295776, "step": 255},
+    {"epoch": 0.782543265613243, "grad_norm": 0.8608073592185974, "learning_rate": 4.205380336255594e-05, "loss": 1.0426, "num_input_tokens_seen": 301736, "step": 260},
+    {"epoch": 0.7975921745673439, "grad_norm": 0.6539498567581177, "learning_rate": 4.176339569749865e-05, "loss": 0.8625, "num_input_tokens_seen": 307224, "step": 265},
+    {"epoch": 0.8126410835214447, "grad_norm": 0.8432996273040771, "learning_rate": 4.1468818627544845e-05, "loss": 0.9959, "num_input_tokens_seen": 313040, "step": 270},
+    {"epoch": 0.8276899924755455, "grad_norm": 0.877001166343689, "learning_rate": 4.11701454201339e-05, "loss": 0.939, "num_input_tokens_seen": 319112, "step": 275},
+    {"epoch": 0.8427389014296464, "grad_norm": 0.9003238081932068, "learning_rate": 4.08674503614997e-05, "loss": 0.9741, "num_input_tokens_seen": 325040, "step": 280},
+    {"epoch": 0.8577878103837472, "grad_norm": 0.8585950136184692, "learning_rate": 4.0560808738194114e-05, "loss": 0.98, "num_input_tokens_seen": 330904, "step": 285},
+    {"epoch": 0.872836719337848, "grad_norm": 0.8015385270118713, "learning_rate": 4.0250296818361647e-05, "loss": 0.8898, "num_input_tokens_seen": 336392, "step": 290},
+    {"epoch": 0.8878856282919488, "grad_norm": 0.8380082845687866, "learning_rate": 3.993599183277001e-05, "loss": 0.953, "num_input_tokens_seen": 342832, "step": 295},
+    {"epoch": 0.9029345372460497, "grad_norm": 0.8890098929405212, "learning_rate": 3.961797195560118e-05, "loss": 0.9311, "num_input_tokens_seen": 348944, "step": 300},
+    {"epoch": 0.9179834462001505, "grad_norm": 0.9356483221054077, "learning_rate": 3.9296316285007887e-05, "loss": 0.9114, "num_input_tokens_seen": 354680, "step": 305},
+    {"epoch": 0.9330323551542513, "grad_norm": 0.8241044878959656, "learning_rate": 3.897110482344024e-05, "loss": 0.9674, "num_input_tokens_seen": 361008, "step": 310},
+    {"epoch": 0.9480812641083521, "grad_norm": 0.7882922887802124, "learning_rate": 3.864241845774746e-05, "loss": 0.9582, "num_input_tokens_seen": 366760, "step": 315},
+    {"epoch": 0.963130173062453, "grad_norm": 0.7503064274787903, "learning_rate": 3.8310338939059644e-05, "loss": 0.9863, "num_input_tokens_seen": 372448, "step": 320},
+    {"epoch": 0.9781790820165538, "grad_norm": 0.6487952470779419, "learning_rate": 3.797494886245456e-05, "loss": 0.906, "num_input_tokens_seen": 378520, "step": 325},
+    {"epoch": 0.9932279909706546, "grad_norm": 0.8584316968917847, "learning_rate": 3.7636331646414524e-05, "loss": 0.8958, "num_input_tokens_seen": 384272, "step": 330},
+    {"epoch": 1.0060195635816402, "grad_norm": 0.8825767040252686, "learning_rate": 3.7294571512078506e-05, "loss": 0.8349, "num_input_tokens_seen": 389280, "step": 335},
+    {"epoch": 1.021068472535741, "grad_norm": 0.8422874808311462, "learning_rate": 3.694975346229458e-05, "loss": 0.8507, "num_input_tokens_seen": 394944, "step": 340},
+    {"epoch": 1.036117381489842, "grad_norm": 0.8337146639823914, "learning_rate": 3.6601963260477924e-05, "loss": 0.9287, "num_input_tokens_seen": 400800, "step": 345},
+    {"epoch": 1.0511662904439427, "grad_norm": 0.936469316482544, "learning_rate": 3.625128740927971e-05, "loss": 0.9107, "num_input_tokens_seen": 406728, "step": 350},
+    {"epoch": 1.0662151993980435, "grad_norm": 0.8475446105003357, "learning_rate": 3.589781312907207e-05, "loss": 0.952, "num_input_tokens_seen": 412656, "step": 355},
+    {"epoch": 1.0812641083521444, "grad_norm": 0.7245047092437744, "learning_rate": 3.55416283362546e-05, "loss": 0.9526, "num_input_tokens_seen": 418488, "step": 360},
+    {"epoch": 1.0963130173062452, "grad_norm": 1.0173735618591309, "learning_rate": 3.518282162138772e-05, "loss": 0.8775, "num_input_tokens_seen": 424192, "step": 365},
+    {"epoch": 1.111361926260346, "grad_norm": 0.9992531538009644, "learning_rate": 3.482148222715835e-05, "loss": 0.883, "num_input_tokens_seen": 430312, "step": 370},
+    {"epoch": 1.1264108352144468, "grad_norm": 1.0938397645950317, "learning_rate": 3.4457700026183374e-05, "loss": 1.0032, "num_input_tokens_seen": 436128, "step": 375},
+    {"epoch": 1.141459744168548, "grad_norm": 0.8988808989524841, "learning_rate": 3.409156549865654e-05, "loss": 0.943, "num_input_tokens_seen": 441928, "step": 380},
+    {"epoch": 1.1565086531226485, "grad_norm": 0.9952559471130371, "learning_rate": 3.3723169709844026e-05, "loss": 0.801, "num_input_tokens_seen": 447560, "step": 385},
+    {"epoch": 1.1715575620767495, "grad_norm": 0.7556662559509277, "learning_rate": 3.335260428743475e-05, "loss": 0.9294, "num_input_tokens_seen": 453296, "step": 390},
+    {"epoch": 1.1866064710308502, "grad_norm": 0.8362197279930115, "learning_rate": 3.297996139875055e-05, "loss": 0.9528, "num_input_tokens_seen": 459336, "step": 395},
+    {"epoch": 1.2016553799849512, "grad_norm": 0.9389665722846985, "learning_rate": 3.260533372782234e-05, "loss": 0.8981, "num_input_tokens_seen": 464944, "step": 400},
+    {"epoch": 1.2167042889390518, "grad_norm": 1.1821860074996948, "learning_rate": 3.222881445233759e-05, "loss": 0.9823, "num_input_tokens_seen": 470992, "step": 405},
+    {"epoch": 1.2317531978931529, "grad_norm": 1.0015898942947388, "learning_rate": 3.185049722046516e-05, "loss": 0.9047, "num_input_tokens_seen": 476216, "step": 410},
+    {"epoch": 1.2468021068472535, "grad_norm": 0.8765709400177002, "learning_rate": 3.147047612756302e-05, "loss": 0.8582, "num_input_tokens_seen": 481824, "step": 415},
+    {"epoch": 1.2618510158013545, "grad_norm": 0.9712916612625122, "learning_rate": 3.10888456927748e-05, "loss": 0.8787, "num_input_tokens_seen": 487576, "step": 420},
+    {"epoch": 1.276899924755455, "grad_norm": 1.1555066108703613, "learning_rate": 3.0705700835520895e-05, "loss": 0.8729, "num_input_tokens_seen": 493336, "step": 425},
+    {"epoch": 1.2919488337095562, "grad_norm": 1.1198400259017944, "learning_rate": 3.0321136851890036e-05, "loss": 0.8772, "num_input_tokens_seen": 499760, "step": 430},
+    {"epoch": 1.3069977426636568, "grad_norm": 1.1468943357467651, "learning_rate": 2.9935249390937183e-05, "loss": 0.9451, "num_input_tokens_seen": 505400, "step": 435},
+    {"epoch": 1.3220466516177578, "grad_norm": 0.8468641042709351, "learning_rate": 2.9548134430893604e-05, "loss": 0.8202, "num_input_tokens_seen": 511760, "step": 440},
+    {"epoch": 1.3370955605718584, "grad_norm": 1.3206151723861694, "learning_rate": 2.9159888255295116e-05, "loss": 0.9773, "num_input_tokens_seen": 517616, "step": 445},
+    {"epoch": 1.3521444695259595, "grad_norm": 1.1996040344238281, "learning_rate": 2.8770607429034352e-05, "loss": 0.9101, "num_input_tokens_seen": 522744, "step": 450},
+    {"epoch": 1.36719337848006, "grad_norm": 1.1539313793182373, "learning_rate": 2.8380388774343047e-05, "loss": 0.9633, "num_input_tokens_seen": 528648, "step": 455},
+    {"epoch": 1.382242287434161, "grad_norm": 1.021848440170288, "learning_rate": 2.7989329346710375e-05, "loss": 0.8886, "num_input_tokens_seen": 534000, "step": 460},
+    {"epoch": 1.3972911963882617, "grad_norm": 0.8612179160118103, "learning_rate": 2.759752641074322e-05, "loss": 0.9258, "num_input_tokens_seen": 539688, "step": 465},
+    {"epoch": 1.4123401053423628, "grad_norm": 1.0109293460845947, "learning_rate": 2.7205077415974416e-05, "loss": 0.9039, "num_input_tokens_seen": 545112, "step": 470},
+    {"epoch": 1.4273890142964636, "grad_norm": 1.1920832395553589, "learning_rate": 2.6812079972625077e-05, "loss": 1.0116, "num_input_tokens_seen": 551328, "step": 475},
+    {"epoch": 1.4424379232505644, "grad_norm": 1.0512142181396484, "learning_rate": 2.6418631827326857e-05, "loss": 0.8218, "num_input_tokens_seen": 556816, "step": 480},
+    {"epoch": 1.4574868322046652, "grad_norm": 1.146946907043457, "learning_rate": 2.602483083881035e-05, "loss": 0.8604, "num_input_tokens_seen": 562552, "step": 485},
+    {"epoch": 1.472535741158766, "grad_norm": 1.1064790487289429, "learning_rate": 2.563077495356561e-05, "loss": 0.8044, "num_input_tokens_seen": 568480, "step": 490},
+    {"epoch": 1.487584650112867, "grad_norm": 0.9678347110748291, "learning_rate": 2.5236562181480794e-05, "loss": 0.9198, "num_input_tokens_seen": 574072, "step": 495},
+    {"epoch": 1.5026335590669677, "grad_norm": 0.9460956454277039, "learning_rate": 2.484229057146507e-05, "loss": 0.9181, "num_input_tokens_seen": 580040, "step": 500},
+    {"epoch": 1.5176824680210683, "grad_norm": 1.175920844078064, "learning_rate": 2.4448058187061835e-05, "loss": 0.8644, "num_input_tokens_seen": 586128, "step": 505},
+    {"epoch": 1.5327313769751694, "grad_norm": 1.2150397300720215, "learning_rate": 2.4053963082058244e-05, "loss": 1.0127, "num_input_tokens_seen": 592256, "step": 510},
+    {"epoch": 1.54778028592927, "grad_norm": 0.9520708918571472, "learning_rate": 2.3660103276097232e-05, "loss": 0.7937, "num_input_tokens_seen": 597704, "step": 515},
+    {"epoch": 1.562829194883371, "grad_norm": 1.0742231607437134, "learning_rate": 2.3266576730297956e-05, "loss": 0.9806, "num_input_tokens_seen": 603240, "step": 520},
+    {"epoch": 1.5778781038374716, "grad_norm": 1.0484352111816406, "learning_rate": 2.2873481322890862e-05, "loss": 0.934, "num_input_tokens_seen": 609616, "step": 525},
+    {"epoch": 1.5929270127915727, "grad_norm": 0.8829598426818848, "learning_rate": 2.2480914824873297e-05, "loss": 0.9288, "num_input_tokens_seen": 615520, "step": 530},
+    {"epoch": 1.6079759217456733, "grad_norm": 0.9222884178161621, "learning_rate": 2.2088974875691863e-05, "loss": 0.8597, "num_input_tokens_seen": 621208, "step": 535},
+    {"epoch": 1.6230248306997743, "grad_norm": 0.894801914691925, "learning_rate": 2.1697758958957448e-05, "loss": 0.8817, "num_input_tokens_seen": 627176, "step": 540},
+    {"epoch": 1.6380737396538751, "grad_norm": 1.1703195571899414, "learning_rate": 2.1307364378199005e-05, "loss": 0.777, "num_input_tokens_seen": 633248, "step": 545},
+    {"epoch": 1.653122648607976, "grad_norm": 1.0596733093261719, "learning_rate": 2.0917888232662196e-05, "loss": 0.798, "num_input_tokens_seen": 639000, "step": 550},
+    {"epoch": 1.6681715575620768, "grad_norm": 1.0426228046417236, "learning_rate": 2.0529427393158705e-05, "loss": 0.9104, "num_input_tokens_seen": 645280, "step": 555},
+    {"epoch": 1.6832204665161776, "grad_norm": 1.3300392627716064, "learning_rate": 2.014207847797256e-05, "loss": 0.8293, "num_input_tokens_seen": 651760, "step": 560},
+    {"epoch": 1.6982693754702785, "grad_norm": 1.2664028406143188, "learning_rate": 1.9755937828829067e-05, "loss": 0.8821, "num_input_tokens_seen": 657272, "step": 565},
+    {"epoch": 1.7133182844243793, "grad_norm": 0.9889734983444214, "learning_rate": 1.937110148693265e-05, "loss": 0.8253, "num_input_tokens_seen": 663336, "step": 570},
+    {"epoch": 1.72836719337848, "grad_norm": 1.0789241790771484, "learning_rate": 1.8987665169079454e-05, "loss": 0.9391, "num_input_tokens_seen": 668936, "step": 575},
+    {"epoch": 1.743416102332581, "grad_norm": 1.2337504625320435, "learning_rate": 1.8605724243850502e-05, "loss": 0.8711, "num_input_tokens_seen": 675000, "step": 580},
+    {"epoch": 1.7584650112866818, "grad_norm": 0.905838668346405, "learning_rate": 1.822537370789163e-05, "loss": 0.8346, "num_input_tokens_seen": 680584, "step": 585},
+    {"epoch": 1.7735139202407826, "grad_norm": 1.1633321046829224, "learning_rate": 1.7846708162285785e-05, "loss": 0.8275, "num_input_tokens_seen": 686416, "step": 590},
+    {"epoch": 1.7885628291948834, "grad_norm": 0.9946597814559937, "learning_rate": 1.7469821789023815e-05, "loss": 0.9435, "num_input_tokens_seen": 692016, "step": 595},
+    {"epoch": 1.8036117381489842, "grad_norm": 1.0259568691253662, "learning_rate": 1.70948083275794e-05, "loss": 0.8584, "num_input_tokens_seen": 697984, "step": 600},
+    {"epoch": 1.818660647103085, "grad_norm": 1.0644334554672241, "learning_rate": 1.672176105159417e-05, "loss": 0.88, "num_input_tokens_seen": 704056, "step": 605},
+    {"epoch": 1.8337095560571859, "grad_norm": 1.0443474054336548, "learning_rate": 1.635077274567854e-05, "loss": 0.8825, "num_input_tokens_seen": 709760, "step": 610},
+    {"epoch": 1.8487584650112867, "grad_norm": 1.0267105102539062, "learning_rate": 1.5981935682334264e-05, "loss": 0.9978, "num_input_tokens_seen": 715872, "step": 615},
+    {"epoch": 1.8638073739653875, "grad_norm": 1.3127869367599487, "learning_rate": 1.561534159900441e-05, "loss": 0.9626, "num_input_tokens_seen": 722184, "step": 620},
+    {"epoch": 1.8788562829194884, "grad_norm": 1.2093840837478638, "learning_rate": 1.525108167525624e-05, "loss": 0.9308, "num_input_tokens_seen": 727776, "step": 625},
+    {"epoch": 1.8939051918735892, "grad_norm": 0.982764482498169, "learning_rate": 1.4889246510103077e-05, "loss": 0.9757, "num_input_tokens_seen": 733760, "step": 630},
+    {"epoch": 1.90895410082769, "grad_norm": 1.111680507659912, "learning_rate": 1.4529926099470348e-05, "loss": 0.767, "num_input_tokens_seen": 740024, "step": 635},
+    {"epoch": 1.9240030097817908, "grad_norm": 1.218017578125, "learning_rate": 1.4173209813811788e-05, "loss": 0.9272, "num_input_tokens_seen": 745480, "step": 640},
+    {"epoch": 1.9390519187358917, "grad_norm": 1.3443623781204224, "learning_rate": 1.381918637588112e-05, "loss": 0.7941, "num_input_tokens_seen": 751384, "step": 645},
+    {"epoch": 1.9541008276899925, "grad_norm": 0.9702039361000061, "learning_rate": 1.3467943838664863e-05, "loss": 0.8408, "num_input_tokens_seen": 756920, "step": 650},
+    {"epoch": 1.9691497366440933, "grad_norm": 1.1215064525604248, "learning_rate": 1.311956956348177e-05, "loss": 0.8459, "num_input_tokens_seen": 762424, "step": 655},
+    {"epoch": 1.9841986455981941, "grad_norm": 1.3830626010894775, "learning_rate": 1.277415019825417e-05, "loss": 1.0117, "num_input_tokens_seen": 768224, "step": 660},
+    {"epoch": 1.999247554552295, "grad_norm": 1.028895616531372, "learning_rate": 1.2431771655956925e-05, "loss": 0.9665, "num_input_tokens_seen": 773568, "step": 665},
+    {"epoch": 2.0120391271632805, "grad_norm": 1.1555911302566528, "learning_rate": 1.2092519093248988e-05, "loss": 0.7625, "num_input_tokens_seen": 778672, "step": 670},
+    {"epoch": 2.0270880361173815, "grad_norm": 1.037429690361023, "learning_rate": 1.1756476889293269e-05, "loss": 0.8667, "num_input_tokens_seen": 784488, "step": 675},
+    {"epoch": 2.042136945071482, "grad_norm": 1.053051471710205, "learning_rate": 1.1423728624769695e-05, "loss": 0.8297, "num_input_tokens_seen": 790304, "step": 680},
+    {"epoch": 2.057185854025583, "grad_norm": 1.0523649454116821, "learning_rate": 1.1094357061087033e-05, "loss": 0.8774, "num_input_tokens_seen": 796192, "step": 685},
+    {"epoch": 2.072234762979684, "grad_norm": 1.0367976427078247, "learning_rate": 1.0768444119798357e-05, "loss": 0.8476, "num_input_tokens_seen": 802144, "step": 690},
+    {"epoch": 2.087283671933785, "grad_norm": 1.4130756855010986, "learning_rate": 1.0446070862225463e-05, "loss": 0.8641, "num_input_tokens_seen": 807768, "step": 695},
+    {"epoch": 2.1023325808878854, "grad_norm": 1.1584120988845825, "learning_rate": 1.0127317469297277e-05, "loss": 0.8383, "num_input_tokens_seen": 813712, "step": 700},
+    {"epoch": 2.1173814898419865, "grad_norm": 1.2318339347839355, "learning_rate": 9.812263221607112e-06, "loss": 0.9123, "num_input_tokens_seen": 819360, "step": 705},
+    {"epoch": 2.132430398796087, "grad_norm": 1.6237512826919556, "learning_rate": 9.500986479694036e-06, "loss": 0.9635, "num_input_tokens_seen": 824584, "step": 710},
+    {"epoch": 2.147479307750188, "grad_norm": 1.106604814529419, "learning_rate": 9.19356466455287e-06, "loss": 0.9221, "num_input_tokens_seen": 830600, "step": 715},
+    {"epoch": 2.1625282167042887, "grad_norm": 0.8615310788154602, "learning_rate": 8.890074238378074e-06, "loss": 0.8757, "num_input_tokens_seen": 836856, "step": 720},
+    {"epoch": 2.17757712565839, "grad_norm": 0.8537486791610718, "learning_rate": 8.590590685545946e-06, "loss": 0.7958, "num_input_tokens_seen": 842872, "step": 725},
+    {"epoch": 2.1926260346124904, "grad_norm": 0.8556107878684998, "learning_rate": 8.295188493840104e-06, "loss": 0.7993, "num_input_tokens_seen": 848664, "step": 730},
+    {"epoch": 2.2076749435665914, "grad_norm": 1.093944787979126, "learning_rate": 8.003941135924858e-06, "loss": 0.8436, "num_input_tokens_seen": 854712, "step": 735},
+    {"epoch": 2.222723852520692, "grad_norm": 1.2639975547790527, "learning_rate": 7.71692105107098e-06, "loss": 0.896, "num_input_tokens_seen": 860648, "step": 740},
+    {"epoch": 2.237772761474793, "grad_norm": 1.177778720855713, "learning_rate": 7.434199627138602e-06, "loss": 0.8948, "num_input_tokens_seen": 866080, "step": 745},
+    {"epoch": 2.2528216704288937, "grad_norm": 0.9701932668685913, "learning_rate": 7.155847182821523e-06, "loss": 0.8546, "num_input_tokens_seen": 871560, "step": 750},
+    {"epoch": 2.2678705793829947, "grad_norm": 1.0232161283493042, "learning_rate": 6.881932950157538e-06, "loss": 0.8494, "num_input_tokens_seen": 877568, "step": 755},
+    {"epoch": 2.282919488337096, "grad_norm": 1.119441270828247, "learning_rate": 6.612525057308949e-06, "loss": 0.7723, "num_input_tokens_seen": 883808, "step": 760},
+    {"epoch": 2.2979683972911964, "grad_norm": 1.5488731861114502, "learning_rate": 6.347690511617693e-06, "loss": 0.9168, "num_input_tokens_seen": 889296, "step": 765},
+    {"epoch": 2.313017306245297, "grad_norm": 1.2143895626068115, "learning_rate": 6.0874951829392234e-06, "loss": 0.8831, "num_input_tokens_seen": 895120, "step": 770},
+    {"epoch": 2.328066215199398, "grad_norm": 1.157663106918335, "learning_rate": 5.832003787259327e-06, "loss": 0.854, "num_input_tokens_seen": 900320, "step": 775},
+    {"epoch": 2.343115124153499, "grad_norm": 1.4496403932571411, "learning_rate": 5.581279870597867e-06, "loss": 0.8843, "num_input_tokens_seen": 905928, "step": 780},
+    {"epoch": 2.3581640331075997, "grad_norm": 0.8820686936378479, "learning_rate": 5.335385793203604e-06, "loss": 0.862, "num_input_tokens_seen": 911976, "step": 785},
+    {"epoch": 2.3732129420617003, "grad_norm": 1.622916579246521, "learning_rate": 5.094382714043907e-06, "loss": 0.985, "num_input_tokens_seen": 917840, "step": 790},
+    {"epoch": 2.3882618510158014, "grad_norm": 1.0603710412979126, "learning_rate": 4.85833057559322e-06, "loss": 0.7679, "num_input_tokens_seen": 923168, "step": 795},
+    {"epoch": 2.4033107599699024, "grad_norm": 1.0989526510238647, "learning_rate": 4.627288088924156e-06, "loss": 0.8198, "num_input_tokens_seen": 928720, "step": 800},
+    {"epoch": 2.418359668924003, "grad_norm": 0.9745952486991882, "learning_rate": 4.401312719104802e-06, "loss": 0.7773, "num_input_tokens_seen": 934568, "step": 805},
+    {"epoch": 2.4334085778781036, "grad_norm": 1.529707670211792, "learning_rate": 4.180460670905978e-06, "loss": 0.9312, "num_input_tokens_seen": 940264, "step": 810},
+    {"epoch": 2.4484574868322047, "grad_norm": 1.2537649869918823, "learning_rate": 3.964786874821955e-06, "loss": 0.8497, "num_input_tokens_seen": 946128, "step": 815},
+    {"epoch": 2.4635063957863057, "grad_norm": 1.0871232748031616, "learning_rate": 3.754344973408064e-06, "loss": 0.782, "num_input_tokens_seen": 952032, "step": 820},
+    {"epoch": 2.4785553047404063, "grad_norm": 1.2940268516540527, "learning_rate": 3.5491873079387256e-06, "loss": 0.8937, "num_input_tokens_seen": 957960, "step": 825},
+    {"epoch": 2.493604213694507, "grad_norm": 1.2327598333358765, "learning_rate": 3.3493649053890326e-06, "loss": 0.7039, "num_input_tokens_seen": 964336, "step": 830},
+    {"epoch": 2.508653122648608, "grad_norm": 1.516093373298645, "learning_rate": 3.1549274657433375e-06, "loss": 0.9265, "num_input_tokens_seen": 970168, "step": 835},
+    {"epoch": 2.523702031602709, "grad_norm": 1.1418204307556152, "learning_rate": 2.9659233496337786e-06, "loss": 0.8669, "num_input_tokens_seen": 975752, "step": 840},
+    {"epoch": 2.5387509405568096, "grad_norm": 1.3584462404251099, "learning_rate": 2.7823995663120327e-06, "loss": 0.9174, "num_input_tokens_seen": 981672, "step": 845},
+    {"epoch": 2.55379984951091, "grad_norm": 1.1911269426345825, "learning_rate": 2.6044017619571065e-06, "loss": 0.8718, "num_input_tokens_seen": 987560, "step": 850},
+    {"epoch": 2.5688487584650113, "grad_norm": 1.3048710823059082, "learning_rate": 2.431974208322191e-06, "loss": 0.8634, "num_input_tokens_seen": 993200, "step": 855},
+    {"epoch": 2.5838976674191123, "grad_norm": 1.1356749534606934, "learning_rate": 2.265159791723373e-06, "loss": 0.845, "num_input_tokens_seen": 999192, "step": 860},
+    {"epoch": 2.598946576373213, "grad_norm": 1.2655149698257446, "learning_rate": 2.104000002372886e-06, "loss": 0.8008, "num_input_tokens_seen": 1004576, "step": 865},
+    {"epoch": 2.6139954853273135, "grad_norm": 1.354706048965454, "learning_rate": 1.9485349240596613e-06, "loss": 0.8797, "num_input_tokens_seen": 1010352, "step": 870},
+    {"epoch": 2.6290443942814146, "grad_norm": 1.0957777500152588, "learning_rate": 1.7988032241796376e-06, "loss": 0.946, "num_input_tokens_seen": 1016272, "step": 875},
+    {"epoch": 2.6440933032355156, "grad_norm": 1.3322904109954834, "learning_rate": 1.6548421441183875e-06, "loss": 0.8032, "num_input_tokens_seen": 1021896, "step": 880},
+    {"epoch": 2.659142212189616, "grad_norm": 1.1363080739974976, "learning_rate": 1.5166874899884053e-06, "loss": 0.8892, "num_input_tokens_seen": 1027704, "step": 885},
+    {"epoch": 2.674191121143717, "grad_norm": 1.2706754207611084, "learning_rate": 1.3843736237233784e-06, "loss": 0.856, "num_input_tokens_seen": 1033800, "step": 890},
+    {"epoch": 2.689240030097818, "grad_norm": 1.1934438943862915, "learning_rate": 1.2579334545316733e-06, "loss": 0.8617, "num_input_tokens_seen": 1040008, "step": 895},
+    {"epoch": 2.704288939051919, "grad_norm": 1.4581674337387085, "learning_rate": 1.137398430711123e-06, "loss": 0.9117, "num_input_tokens_seen": 1046272, "step": 900},
+    {"epoch": 2.7193378480060195, "grad_norm": 1.080992579460144, "learning_rate": 1.0227985318271682e-06, "loss": 0.7855, "num_input_tokens_seen": 1052032, "step": 905},
+    {"epoch": 2.73438675696012, "grad_norm": 1.0012861490249634, "learning_rate": 9.141622612563571e-07, "loss": 0.8212, "num_input_tokens_seen": 1057584, "step": 910},
+    {"epoch": 2.749435665914221, "grad_norm": 1.1472314596176147, "learning_rate": 8.115166390969125e-07, "loss": 0.8404, "num_input_tokens_seen": 1063760, "step": 915},
+    {"epoch": 2.764484574868322, "grad_norm": 1.2558523416519165, "learning_rate": 7.148871954483105e-07, "loss": 0.7782, "num_input_tokens_seen": 1069544, "step": 920},
+    {"epoch": 2.779533483822423, "grad_norm": 1.1380338668823242, "learning_rate": 6.242979640613933e-07, "loss": 0.7847, "num_input_tokens_seen": 1075472, "step": 925},
+    {"epoch": 2.7945823927765234, "grad_norm": 0.972878098487854, "learning_rate": 5.397714763606843e-07, "loss": 0.8857, "num_input_tokens_seen": 1081464, "step": 930},
+    {"epoch": 2.8096313017306245, "grad_norm": 1.2546579837799072, "learning_rate": 4.613287558403512e-07, "loss": 0.8029, "num_input_tokens_seen": 1087464, "step": 935},
+    {"epoch": 2.8246802106847255, "grad_norm": 1.1165034770965576, "learning_rate": 3.8898931283523344e-07, "loss": 0.8154, "num_input_tokens_seen": 1092888, "step": 940},
+    {"epoch": 2.839729119638826, "grad_norm": 1.3924362659454346, "learning_rate": 3.227711396682015e-07, "loss": 0.8791, "num_input_tokens_seen": 1098808, "step": 945},
+    {"epoch": 2.854778028592927, "grad_norm": 1.021448016166687, "learning_rate": 2.626907061751116e-07, "loss": 0.787, "num_input_tokens_seen": 1104688, "step": 950},
+    {"epoch": 2.869826937547028, "grad_norm": 1.3344382047653198, "learning_rate": 2.0876295560839364e-07, "loss": 0.8831, "num_input_tokens_seen": 1110960, "step": 955},
+    {"epoch": 2.884875846501129, "grad_norm": 1.3956490755081177, "learning_rate": 1.6100130092037703e-07, "loss": 0.7677, "num_input_tokens_seen": 1116800, "step": 960},
+    {"epoch": 2.8999247554552294, "grad_norm": 1.1644206047058105, "learning_rate": 1.194176214271897e-07, "loss": 0.7567, "num_input_tokens_seen": 1122248, "step": 965},
+    {"epoch": 2.9149736644093305, "grad_norm": 1.2540746927261353, "learning_rate": 8.402225985413848e-08, "loss": 0.8944, "num_input_tokens_seen": 1127928, "step": 970},
+    {"epoch": 2.930022573363431, "grad_norm": 1.1684881448745728, "learning_rate": 5.4824019763252685e-08, "loss": 0.9737, "num_input_tokens_seen": 1133336, "step": 975},
+    {"epoch": 2.945071482317532, "grad_norm": 1.072198510169983, "learning_rate": 3.1830163363655296e-08, "loss": 0.8965, "num_input_tokens_seen": 1139048, "step": 980},
+    {"epoch": 2.9601203912716327, "grad_norm": 1.7171086072921753, "learning_rate": 1.504640970531046e-08, "loss": 0.837, "num_input_tokens_seen": 1144456, "step": 985},
+    {"epoch": 2.975169300225734, "grad_norm": 1.4984806776046753, "learning_rate": 4.4769332565558485e-09, "loss": 0.7812, "num_input_tokens_seen": 1150160, "step": 990},
+    {"epoch": 2.9902182091798344, "grad_norm": 1.2322272062301636, "learning_rate": 1.2436286584982527e-10, "loss": 0.8613, "num_input_tokens_seen": 1156704, "step": 995},
+    {"epoch": 2.9932279909706545, "num_input_tokens_seen": 1157808, "step": 996, "total_flos": 1.3788411572404224e+16, "train_loss": 0.939127180590687, "train_runtime": 10484.6402, "train_samples_per_second": 0.761, "train_steps_per_second": 0.095}
+  ],
+  "logging_steps": 5,
+  "max_steps": 996,
+  "num_input_tokens_seen": 1157808,
+  "num_train_epochs": 3,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 1.3788411572404224e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}
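The log above is the raw data behind training_loss.png in this upload. A minimal sketch for re-plotting the curve from trainer_state.json, assuming the file has been downloaded locally (matplotlib is not among this repo's stated dependencies):

```python
import json

import matplotlib.pyplot as plt

# trainer_state.json holds the Trainer's log_history: one record per
# logged step (every 5 steps here), plus a final summary record.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only records carrying a per-step training loss; the summary
# record uses "train_loss" instead, so it is skipped automatically.
records = [r for r in state["log_history"] if "loss" in r]
steps = [r["step"] for r in records]
losses = [r["loss"] for r in records]

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("training_loss_replot.png")
```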
training_args.bin
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:19877188370cf3d74bf7f601a975694ade04d8236ac4f1d0937bf61a4ca990d0
+size 5688
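training_args.bin is the TrainingArguments object the Trainer pickles alongside the state. A minimal sketch for inspecting it, assuming a local download; PyTorch 2.6 (the version used for this run) defaults torch.load to weights_only=True, which must be overridden for pickled objects and should only be done for files you trust:

```python
import torch

# The file is a pickle of transformers.TrainingArguments, not a tensor
# checkpoint, so weights_only=False is required on PyTorch >= 2.6.
# transformers must be importable, since the pickle references its classes.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.lr_scheduler_type, args.num_train_epochs)
```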
training_args.yaml
ADDED
@@ -0,0 +1,33 @@
+bf16: true
+cutoff_len: 2048
+dataset: OCR_Finetuning_Dataset
+dataset_dir: /content/drive/MyDrive
+ddp_timeout: 180000000
+do_train: true
+finetuning_type: lora
+flash_attn: auto
+gradient_accumulation_steps: 8
+include_num_input_tokens_seen: true
+learning_rate: 5.0e-05
+logging_steps: 5
+lora_alpha: 16
+lora_dropout: 0
+lora_rank: 8
+lora_target: all
+lr_scheduler_type: cosine
+max_grad_norm: 1.0
+max_samples: 100000
+model_name_or_path: prithivMLmods/Qwen2-VL-OCR-2B-Instruct
+num_train_epochs: 3.0
+optim: adamw_torch
+output_dir: saves/Custom/lora/train_2025-04-01-09-06-36
+packing: false
+per_device_train_batch_size: 1
+plot_loss: true
+preprocessing_num_workers: 16
+report_to: none
+save_steps: 100
+stage: sft
+template: default
+trust_remote_code: true
+warmup_steps: 0
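These are the LLaMA-Factory arguments for the run: a rank-8, alpha-16 LoRA over all linear targets of the base model, trained with the cosine schedule recorded in the log above. A minimal sketch for loading the resulting adapter with PEFT, where `ADAPTER_PATH` is a placeholder for this repo's id or a local directory containing the adapter files:

```python
from peft import PeftModel
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration

BASE_ID = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
ADAPTER_PATH = "path/to/this-adapter"  # placeholder: repo id or local dir

# Load the frozen base model, then attach the LoRA weights on top of it.
model = Qwen2VLForConditionalGeneration.from_pretrained(BASE_ID, torch_dtype="auto")
model = PeftModel.from_pretrained(model, ADAPTER_PATH)
processor = AutoProcessor.from_pretrained(BASE_ID)
```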
training_loss.png
ADDED
vocab.json
ADDED
The diff for this file is too large to render.