---
# ViT Outfit Encoder Training Configuration
#
# NOTE(review): this file was recovered from a whitespace-mangled source in
# which all indentation was lost. The nesting below is reconstructed from the
# section comments — confirm the exact hierarchy (especially whether
# `transformer` sits under `model`) against the config loader.

# Model configuration
model:
  embedding_dim: 512      # Input embedding dimension (must match ResNet output)
  num_layers: 6           # Number of transformer layers
  num_heads: 8             # Number of attention heads
  ff_multiplier: 4         # Feed-forward multiplier
  dropout: 0.1             # Dropout rate
  max_outfit_length: 8     # Maximum outfit length (items)

  # Transformer architecture
  transformer:
    activation: "gelu"     # gelu, relu, swish
    norm_first: true       # Pre-norm vs post-norm
    # Written as 1.0e-5 (not 1e-5): YAML 1.1 parsers such as PyYAML require a
    # decimal point in the mantissa and would otherwise load the string "1e-5".
    layer_norm_eps: 1.0e-5  # Layer norm epsilon

# Training configuration
training:
  batch_size: 32           # Batch size for training
  epochs: 30               # Number of training epochs
  lr: 0.0005               # Learning rate
  weight_decay: 0.05       # Weight decay
  triplet_margin: 0.3      # Triplet loss margin — keep in sync with loss.triplet.margin

  # Optimization
  optimizer: "adamw"       # adamw, sgd, adam
  scheduler: "cosine"      # cosine, step, plateau
  warmup_epochs: 3         # Warmup epochs for learning rate

  # Mixed precision
  use_amp: true            # Use automatic mixed precision

  # Validation
  eval_every: 1            # Evaluate every N epochs
  save_every: 5            # Save checkpoint every N epochs
  early_stopping_patience: 8  # Early stopping patience

# Data configuration
data:
  num_workers: 4           # DataLoader workers
  pin_memory: true         # Pin memory for faster GPU transfer

  # Outfit constraints
  outfit_constraints:
    min_items: 3           # Minimum items per outfit
    max_items: 8           # Maximum items per outfit
    require_slots: false   # Require specific clothing slots

# Paths
paths:
  data_root: "data/Polyvore"    # Dataset root directory
  export_dir: "models/exports"  # Output directory for checkpoints
  checkpoint_name: "vit_outfit_model.pth"
  best_checkpoint_name: "vit_outfit_model_best.pth"
  metrics_name: "vit_metrics.json"
  # ResNet checkpoint for embedding
  resnet_checkpoint: "models/exports/resnet_item_embedder_best.pth"

# Loss configuration
loss:
  type: "triplet_cosine"   # triplet_cosine, triplet_euclidean, contrastive

  # Triplet loss
  triplet:
    margin: 0.3            # Triplet margin — keep in sync with training.triplet_margin
    distance: "cosine"     # cosine, euclidean

  # Additional losses
  auxiliary:
    diversity_loss: 0.1     # Diversity regularization weight
    consistency_loss: 0.05  # Consistency regularization weight

# Logging and monitoring
logging:
  use_wandb: false         # Use Weights & Biases
  log_every: 50            # Log every N steps
  save_outfits: false      # Save sample outfit visualizations

# Hardware
hardware:
  device: "auto"           # auto, cuda, cpu, mps
  num_gpus: 1              # Number of GPUs to use
  precision: "mixed"       # mixed, full

# Advanced
advanced:
  gradient_clip: 1.0         # Gradient clipping value
  embedding_freeze: false    # Freeze ResNet embeddings during training
  outfit_augmentation: true  # Use outfit-level augmentation

  # Curriculum learning
  curriculum:
    enabled: false         # Enable curriculum learning
    start_length: 3        # Start with outfits of this length
    max_length: 8          # Gradually increase to this length
    increase_every: 5      # Increase length every N epochs