paligemma2-3b-pt-224-sft-lora-iphone_gates_cotrain_0.1_magicsoup_no_insta_sub5
/
training_config.json
{ | |
"model_args": { | |
"base_model_revision": null, | |
"model_name_or_path": "/gscratch/weirdlab/mateogc/projects/vlm-navigation/data/paligemma2-3b-pt-224-sft-lora-magicsoup_no_cfiphone_no_insta_sub5", | |
"model_revision": "main", | |
"model_code_revision": null, | |
"torch_dtype": "bfloat16", | |
"tokenizer_name_or_path": null, | |
"processor_name_or_path": null, | |
"trust_remote_code": false, | |
"attn_implementation": "eager", | |
"use_peft": true, | |
"lora_r": 16, | |
"lora_alpha": 16, | |
"lora_dropout": 0.05, | |
"lora_target_modules": [ | |
"q_proj", | |
"k_proj", | |
"v_proj", | |
"o_proj", | |
"gate_proj", | |
"up_proj", | |
"down_proj" | |
], | |
"lora_modules_to_save": null, | |
"load_in_8bit": false, | |
"load_in_4bit": false, | |
"bnb_4bit_quant_type": "nf4", | |
"use_bnb_nested_quant": false, | |
"bnb_4bit_quant_storage": "uint8" | |
}, | |
"data_args": { | |
"chat_template": null, | |
"dataset_mixer": { | |
"mateoguaman/vlmn_iphone_gates_cotrain_0.1_magicsoup_no_insta_sub5": 1.0 | |
}, | |
"eval_dataset_mixer": { | |
"mateoguaman/vlmn_iphone_gates_cotrain_0.1_magicsoup_no_insta_sub5": 1.0 | |
}, | |
"text_column": "text", | |
"dataset_splits": [ | |
"train", | |
"test" | |
], | |
"train_splits": [ | |
"train" | |
], | |
"validation_splits": [ | |
"validation" | |
], | |
"processing_params": {}, | |
"dataset_configs": null, | |
"preprocessing_num_workers": 12, | |
"truncation_side": null, | |
"auto_insert_empty_system_msg": true, | |
"auto_set_chat_template": false, | |
"cache_dataset_only": false, | |
"just_combine_data": false, | |
"output_dataset_name": null, | |
"output_dataset_description": null, | |
"hf_entity": null | |
}, | |
"training_args": { | |
"output_dir": "data/paligemma2-3b-pt-224-sft-lora-iphone_gates_cotrain_0.1_magicsoup_no_insta_sub5", | |
"overwrite_output_dir": true, | |
"do_train": false, | |
"do_eval": true, | |
"do_predict": false, | |
"eval_strategy": "steps", | |
"prediction_loss_only": false, | |
"per_device_train_batch_size": 8, | |
"per_device_eval_batch_size": 8, | |
"per_gpu_train_batch_size": null, | |
"per_gpu_eval_batch_size": null, | |
"gradient_accumulation_steps": 1, | |
"eval_accumulation_steps": null, | |
"eval_delay": 0, | |
"torch_empty_cache_steps": null, | |
"learning_rate": 0.0001, | |
"weight_decay": 1e-05, | |
"adam_beta1": 0.9, | |
"adam_beta2": 0.999, | |
"adam_epsilon": 1e-08, | |
"max_grad_norm": 1.0, | |
"num_train_epochs": 1, | |
"max_steps": -1, | |
"lr_scheduler_type": "cosine", | |
"lr_scheduler_kwargs": {}, | |
"warmup_ratio": 0.1, | |
"warmup_steps": 0, | |
"log_level": "info", | |
"log_level_replica": "warning", | |
"log_on_each_node": true, | |
"logging_dir": "data/paligemma2-3b-pt-224-sft-lora-iphone_gates_cotrain_0.1_magicsoup_no_insta_sub5/runs/May13_00-07-04_g3118", | |
"logging_strategy": "steps", | |
"logging_first_step": true, | |
"logging_steps": 20, | |
"logging_nan_inf_filter": true, | |
"save_strategy": "steps", | |
"save_steps": 0.25, | |
"save_total_limit": null, | |
"save_safetensors": true, | |
"save_on_each_node": false, | |
"save_only_model": false, | |
"restore_callback_states_from_checkpoint": false, | |
"no_cuda": false, | |
"use_cpu": false, | |
"use_mps_device": false, | |
"seed": 42, | |
"data_seed": null, | |
"jit_mode_eval": false, | |
"use_ipex": false, | |
"bf16": true, | |
"fp16": false, | |
"fp16_opt_level": "O1", | |
"half_precision_backend": "auto", | |
"bf16_full_eval": false, | |
"fp16_full_eval": false, | |
"tf32": null, | |
"local_rank": 5, | |
"ddp_backend": null, | |
"tpu_num_cores": null, | |
"tpu_metrics_debug": false, | |
"debug": [], | |
"dataloader_drop_last": false, | |
"eval_steps": 0.25, | |
"dataloader_num_workers": 0, | |
"dataloader_prefetch_factor": null, | |
"past_index": -1, | |
"run_name": "data/paligemma2-3b-pt-224-sft-lora-iphone_gates_cotrain_0.1_magicsoup_no_insta_sub5", | |
"disable_tqdm": false, | |
"remove_unused_columns": true, | |
"label_names": null, | |
"load_best_model_at_end": true, | |
"metric_for_best_model": "loss", | |
"greater_is_better": false, | |
"ignore_data_skip": false, | |
"fsdp": [], | |
"fsdp_min_num_params": 0, | |
"fsdp_config": { | |
"min_num_params": 0, | |
"xla": false, | |
"xla_fsdp_v2": false, | |
"xla_fsdp_grad_ckpt": false | |
}, | |
"fsdp_transformer_layer_cls_to_wrap": null, | |
"accelerator_config": { | |
"split_batches": false, | |
"dispatch_batches": null, | |
"even_batches": true, | |
"use_seedable_sampler": true, | |
"non_blocking": false, | |
"gradient_accumulation_kwargs": null | |
}, | |
"deepspeed": null, | |
"label_smoothing_factor": 0.0, | |
"optim": "adamw_torch", | |
"optim_args": null, | |
"adafactor": false, | |
"group_by_length": false, | |
"length_column_name": "length", | |
"report_to": [ | |
"wandb" | |
], | |
"ddp_find_unused_parameters": null, | |
"ddp_bucket_cap_mb": null, | |
"ddp_broadcast_buffers": null, | |
"dataloader_pin_memory": true, | |
"dataloader_persistent_workers": false, | |
"skip_memory_metrics": true, | |
"use_legacy_prediction_loop": false, | |
"push_to_hub": false, | |
"resume_from_checkpoint": null, | |
"hub_model_id": "paligemma2-3b-pt-224-sft-lora-iphone_gates_cotrain_0.1_magicsoup_no_insta_sub5", | |
"hub_strategy": "every_save", | |
"hub_token": null, | |
"hub_private_repo": null, | |
"hub_always_push": false, | |
"gradient_checkpointing": false, | |
"gradient_checkpointing_kwargs": null, | |
"include_inputs_for_metrics": false, | |
"include_for_metrics": [], | |
"eval_do_concat_batches": true, | |
"fp16_backend": "auto", | |
"evaluation_strategy": null, | |
"push_to_hub_model_id": null, | |
"push_to_hub_organization": null, | |
"push_to_hub_token": null, | |
"mp_parameters": "", | |
"auto_find_batch_size": false, | |
"full_determinism": false, | |
"torchdynamo": null, | |
"ray_scope": "last", | |
"ddp_timeout": 1800, | |
"torch_compile": false, | |
"torch_compile_backend": null, | |
"torch_compile_mode": null, | |
"dispatch_batches": null, | |
"split_batches": null, | |
"include_tokens_per_second": false, | |
"include_num_input_tokens_seen": false, | |
"neftune_noise_alpha": null, | |
"optim_target_modules": null, | |
"batch_eval_metrics": false, | |
"eval_on_start": false, | |
"use_liger_kernel": false, | |
"eval_use_gather_object": false, | |
"average_tokens_across_devices": false, | |
"model_init_kwargs": null, | |
"use_liger": false, | |
"dataset_text_field": "text", | |
"dataset_kwargs": null, | |
"dataset_num_proc": null, | |
"max_seq_length": 2048, | |
"packing": true, | |
"eval_packing": null, | |
"dataset_batch_size": null, | |
"num_of_sequences": null, | |
"chars_per_token": null, | |
"hub_model_revision": "main", | |
"distributed_state": { | |
"_cpu": false, | |
"backend": "nccl", | |
"device": "cuda:5", | |
"debug": false, | |
"distributed_type": "MULTI_GPU", | |
"num_processes": 8, | |
"process_index": 5, | |
"local_process_index": 5, | |
"fork_launched": false | |
}, | |
"_n_gpu": 1, | |
"__cached__setup_devices": "cuda:5", | |
"deepspeed_plugin": null | |
} | |
} |