These configs are not optimized for cost or performance efficiency; YMMV.
Pretrain 4*H100
# ====================
# MODEL CONFIGURATION
# ====================
base_model: ../mergekit/pf_v3_upscale
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
chat_template: mistral_v7_tekken
# ====================
# DATASET CONFIGURATION
# ====================
datasets:
  - path: ./data/pretrain_dataset_v5_stripped.jsonl
    type: completion
dataset_prepared_path:
train_on_inputs: false # Only train on assistant responses (no effect for completion-type data)
# ====================
# QLORA CONFIGURATION
# ====================
adapter: qlora
load_in_4bit: true
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true
# lora_modules_to_save: # Uncomment only if you added NEW tokens
# ====================
# TRAINING PARAMETERS
# ====================
num_epochs: 1
micro_batch_size: 4
gradient_accumulation_steps: 1
learning_rate: 4e-5
optimizer: paged_adamw_8bit
lr_scheduler: rex
warmup_ratio: 0.05
weight_decay: 0.01
max_grad_norm: 1.0
# ====================
# SEQUENCE & PACKING
# ====================
sequence_len: 12288
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
# ====================
# HARDWARE OPTIMIZATIONS
# ====================
bf16: auto
flash_attention: true
gradient_checkpointing: offload
deepspeed: deepspeed_configs/zero1.json
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false # Cut Cross Entropy overrides this
liger_fused_linear_cross_entropy: false # Cut Cross Entropy overrides this
# ====================
# EVALUATION & CHECKPOINTING
# ====================
save_strategy: steps
save_steps: 40
save_total_limit: 5 # Keep best + last few checkpoints
load_best_model_at_end: true
greater_is_better: false
# ====================
# LOGGING & OUTPUT
# ====================
output_dir: ./Visage-V3-PT-1
logging_steps: 2
save_safetensors: true
# ====================
# WANDB TRACKING
# ====================
wandb_project: Visage-V3-PT
# wandb_entity: your_entity
wandb_name: Visage-V3-PT-1
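The `type: completion` dataset here is raw continued-pretraining text. As a rough sketch (the `text` field is axolotl's default for completion-style data; the content is purely illustrative), each JSONL row looks like:

{"text": "Chapter 1\n\nThe rain had not let up for three days, and the harbor town smelled of salt and wet rope..."}

The SFT stage below starts from `./Visage-V3-PT-1/merged`, i.e. the QLoRA adapter produced by this run merged back into the upscaled base (for example with axolotl's `merge_lora` utility) before training continues.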
SFT 4*H100
# ====================
# MODEL CONFIGURATION
# ====================
base_model: ./Visage-V3-PT-1/merged
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
chat_template: mistral_v7_tekken
# ====================
# DATASET CONFIGURATION
# ====================
datasets:
  - path: ./data/dataset.jsonl
    type: chat_template
    split: train
    chat_template_strategy: tokenizer
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
dataset_prepared_path:
train_on_inputs: false # Only train on assistant responses
# ====================
# QLORA CONFIGURATION
# ====================
adapter: qlora
load_in_4bit: true
lora_r: 128
lora_alpha: 128
lora_dropout: 0.1
lora_target_linear: true
# lora_modules_to_save: # Uncomment only if you added NEW tokens
# ====================
# TRAINING PARAMETERS
# ====================
num_epochs: 3
micro_batch_size: 4
gradient_accumulation_steps: 1
learning_rate: 1e-5
optimizer: paged_adamw_8bit
lr_scheduler: rex
warmup_ratio: 0.05
weight_decay: 0.01
max_grad_norm: 1.0
# ====================
# SEQUENCE & PACKING
# ====================
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true
# ====================
# HARDWARE OPTIMIZATIONS
# ====================
bf16: auto
flash_attention: true
gradient_checkpointing: offload
deepspeed: deepspeed_configs/zero1.json
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false # Cut Cross Entropy overrides this
liger_fused_linear_cross_entropy: false # Cut Cross Entropy overrides this
# ====================
# EVALUATION & CHECKPOINTING
# ====================
save_strategy: steps
save_steps: 20
save_total_limit: 5 # Keep best + last few checkpoints
load_best_model_at_end: true
metric_for_best_model: eval_loss
greater_is_better: false
# ====================
# LOGGING & OUTPUT
# ====================
output_dir: ./Visage-V3-PT-1-SFT-2
logging_steps: 1
save_safetensors: true
# ====================
# WANDB TRACKING
# ====================
wandb_project: Visage-V3-SFT
# wandb_entity: your_entity
wandb_name: Visage-V3-PT-1-SFT-2
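With `type: chat_template` and `field_messages: messages`, each SFT row is a multi-turn conversation rendered through the mistral_v7_tekken template. A minimal sketch of one row (contents illustrative; field names taken from the config above):

{"messages": [{"role": "system", "content": "You are ..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

With `train_on_inputs: false`, only the assistant turns contribute to the loss.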
DPO 2*H200
# ====================
# MODEL CONFIGURATION
# ====================
base_model: ./Visage-V3-PT-1-SFT-2/merged
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
chat_template: mistral_v7_tekken
# ====================
# RL/DPO CONFIGURATION
# ====================
rl: dpo
rl_beta: 0.085
# ====================
# DATASET CONFIGURATION
# ====================
datasets:
  - path: ./data/handcrafted_dataset_mistral_rep.jsonl
    type: chat_template.default
    field_messages: messages
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system: ["system"]
      user: ["user"]
      assistant: ["assistant"]
  - path: ./data/approved_automated_l3_dataset.jsonl
    type: chat_template.default
    field_messages: messages
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system: ["system"]
      user: ["user"]
      assistant: ["assistant"]
dataset_prepared_path:
train_on_inputs: false # Only train on assistant responses
# ====================
# LORA CONFIGURATION
# ====================
adapter: lora
load_in_8bit: true
lora_r: 16
lora_alpha: 32
lora_dropout: 0.1
lora_target_linear: true
# lora_modules_to_save: # Uncomment only if you added NEW tokens
# ====================
# TRAINING PARAMETERS
# ====================
num_epochs: 1
micro_batch_size: 2
gradient_accumulation_steps: 4
learning_rate: 2e-6
optimizer: adamw_torch_fused
lr_scheduler: cosine
warmup_steps: 5
weight_decay: 0.01
max_grad_norm: 1.0
# ====================
# SEQUENCE CONFIGURATION
# ====================
sequence_len: 8192
pad_to_sequence_len: true
# ====================
# HARDWARE OPTIMIZATIONS
# ====================
bf16: auto
tf32: false
flash_attention: true
gradient_checkpointing: offload
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
liger_rope: true
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_cross_entropy: false # Cut Cross Entropy overrides this
liger_fused_linear_cross_entropy: false # Cut Cross Entropy overrides this
deepspeed: deepspeed_configs/zero1.json
# ====================
# CHECKPOINTING
# ====================
save_steps: 10
save_total_limit: 10
load_best_model_at_end: true
metric_for_best_model: eval_loss
greater_is_better: false
# ====================
# LOGGING & OUTPUT
# ====================
output_dir: ./Visage-V3-PT-1-SFT-2-DPO-2
logging_steps: 1
save_safetensors: true
# ====================
# WANDB TRACKING
# ====================
wandb_project: Visage-V3-DPO
# wandb_entity: your_entity
wandb_name: Visage-V3-PT-1-SFT-2-DPO-2
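For the `chat_template.default` DPO type, each row pairs a shared conversation prefix with a preferred and a rejected assistant reply, matching the `field_messages` / `field_chosen` / `field_rejected` keys above. A rough sketch of one row (contents illustrative; depending on the axolotl version, `chosen` / `rejected` are single assistant message objects as shown here):

{"messages": [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}], "chosen": {"role": "assistant", "content": "..."}, "rejected": {"role": "assistant", "content": "..."}}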