Creation Process: SFT
SFT on approximately 10 million tokens of SFW/NSFW roleplay, stories, and creative instruct & chat data.
MoEs are brutal to train even with a small dataset like mine, so I took a different approach than usual: I used a very low LR in an effort to avoid having to apply DPO / KTO training afterwards.
I think there's likely a better config to be found, but experimenting with the model to find it is quite draining.
Axolotl configs
Not optimized for cost / performance efficiency, YMMV.
SFT (4*H200)
base_model: zai-org/GLM-4.5-Air

# Turn-boundary tokens: generation/training treats these as end-of-turn.
eot_tokens:
  - "<|user|>"
  - "<|endoftext|>"
special_tokens:
  eos_token: "<|user|>"

# ====================
# DATASET CONFIGURATION
# ====================
datasets:
  - path: ./data/dataset.jsonl
    type: chat_template
    split: train
    field_messages: messages
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]

dataset_prepared_path: ./last_run_prepared
train_on_inputs: false  # Only train on assistant responses
eval_sample_packing: false

# ====================
# QLORA CONFIGURATION
# ====================
adapter: qlora
load_in_4bit: true
lora_r: 32
lora_alpha: 32
lora_dropout: 0.1
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
# lora_modules_to_save:  # Uncomment only if you added NEW tokens

# ====================
# TRAINING PARAMETERS
# ====================
num_epochs: 3
micro_batch_size: 2
gradient_accumulation_steps: 4
learning_rate: 4.5e-6
optimizer: paged_adamw_8bit
lr_scheduler: rex
warmup_ratio: 0.05
weight_decay: 0.01
max_grad_norm: 1.0
val_set_size: 0.02

# ====================
# SEQUENCE & PACKING
# ====================
sequence_len: 8192
sample_packing: true
pad_to_sequence_len: true

# ====================
# HARDWARE OPTIMIZATIONS
# ====================
bf16: auto
flash_attention: true
gradient_checkpointing: true
plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
liger_rope: false
liger_rms_norm: true
liger_layer_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true
cut_cross_entropy: false
deepspeed: deepspeed_configs/zero1.json

# ====================
# EVALUATION & CHECKPOINTING
# ====================
save_strategy: steps
save_steps: 20
eval_steps: 35
save_total_limit: 18  # Keep best + last few checkpoints
load_best_model_at_end: true
metric_for_best_model: eval_loss
greater_is_better: false

# ====================
# LOGGING & OUTPUT
# ====================
output_dir: ./GLM-AIR-SFT_v2-5
logging_steps: 1
save_safetensors: true

# ====================
# WANDB TRACKING
# ====================
wandb_project: GLM-AIR-SFT
# wandb_entity: your_entity
wandb_name: GLM-AIR-SFT_v2-5