---
# ViT Outfit Encoder Training Configuration
#
# NOTE(review): this file was recovered from a whitespace-mangled source in
# which all indentation was lost. The nesting below is reconstructed from the
# section comments — confirm the exact hierarchy (especially whether
# `transformer` sits under `model`) against the config loader.

# Model configuration
model:
  embedding_dim: 512      # Input embedding dimension (must match ResNet output)
  num_layers: 6           # Number of transformer layers
  num_heads: 8             # Number of attention heads
  ff_multiplier: 4         # Feed-forward multiplier
  dropout: 0.1             # Dropout rate
  max_outfit_length: 8     # Maximum outfit length (items)

  # Transformer architecture
  transformer:
    activation: "gelu"     # gelu, relu, swish
    norm_first: true       # Pre-norm vs post-norm
    # Written as 1.0e-5 (not 1e-5): YAML 1.1 parsers such as PyYAML require a
    # decimal point in the mantissa and would otherwise load the string "1e-5".
    layer_norm_eps: 1.0e-5  # Layer norm epsilon

# Training configuration
training:
  batch_size: 32           # Batch size for training
  epochs: 30               # Number of training epochs
  lr: 0.0005               # Learning rate
  weight_decay: 0.05       # Weight decay
  triplet_margin: 0.3      # Triplet loss margin — keep in sync with loss.triplet.margin

  # Optimization
  optimizer: "adamw"       # adamw, sgd, adam
  scheduler: "cosine"      # cosine, step, plateau
  warmup_epochs: 3         # Warmup epochs for learning rate

  # Mixed precision
  use_amp: true            # Use automatic mixed precision

  # Validation
  eval_every: 1            # Evaluate every N epochs
  save_every: 5            # Save checkpoint every N epochs
  early_stopping_patience: 8  # Early stopping patience

# Data configuration
data:
  num_workers: 4           # DataLoader workers
  pin_memory: true         # Pin memory for faster GPU transfer

  # Outfit constraints
  outfit_constraints:
    min_items: 3           # Minimum items per outfit
    max_items: 8           # Maximum items per outfit
    require_slots: false   # Require specific clothing slots

# Paths
paths:
  data_root: "data/Polyvore"    # Dataset root directory
  export_dir: "models/exports"  # Output directory for checkpoints
  checkpoint_name: "vit_outfit_model.pth"
  best_checkpoint_name: "vit_outfit_model_best.pth"
  metrics_name: "vit_metrics.json"
  # ResNet checkpoint for embedding
  resnet_checkpoint: "models/exports/resnet_item_embedder_best.pth"

# Loss configuration
loss:
  type: "triplet_cosine"   # triplet_cosine, triplet_euclidean, contrastive

  # Triplet loss
  triplet:
    margin: 0.3            # Triplet margin — keep in sync with training.triplet_margin
    distance: "cosine"     # cosine, euclidean

  # Additional losses
  auxiliary:
    diversity_loss: 0.1     # Diversity regularization weight
    consistency_loss: 0.05  # Consistency regularization weight

# Logging and monitoring
logging:
  use_wandb: false         # Use Weights & Biases
  log_every: 50            # Log every N steps
  save_outfits: false      # Save sample outfit visualizations

# Hardware
hardware:
  device: "auto"           # auto, cuda, cpu, mps
  num_gpus: 1              # Number of GPUs to use
  precision: "mixed"       # mixed, full

# Advanced
advanced:
  gradient_clip: 1.0         # Gradient clipping value
  embedding_freeze: false    # Freeze ResNet embeddings during training
  outfit_augmentation: true  # Use outfit-level augmentation

  # Curriculum learning
  curriculum:
    enabled: false         # Enable curriculum learning
    start_length: 3        # Start with outfits of this length
    max_length: 8          # Gradually increase to this length
    increase_every: 5      # Increase length every N epochs