# Dataset location, output destination, and tokenizer checkpoint selection.
# NOTE(review): key meanings are inferred from their names; the code that
# consumes this file is not visible here — confirm before relying on them.
data_path: /mnt/localssd/ImageNet2012/train
# Explicit null rather than a bare `key:` so the "unset" intent is unambiguous.
data_face_path: null
cloud_save_path: output/exp-vq
no_local_save: false
vq_model: VQ-16
# Explicit null: no checkpoint path is configured in this file.
vq_ckpt: null
finetune: false
ema: true
# Codebook / quantization settings, followed by loss terms and their weights.
# NOTE(review): semantics are inferred from key names only; the training code
# that reads these values is not visible here — confirm before relying on them.
codebook_size: 4096
codebook_embed_dim: 32
codebook_l2_norm: true
codebook_weight: 1.0
entropy_loss_ratio: 0.1
commit_loss_beta: 0.25
reconstruction_weight: 1.0
reconstruction_loss: l2
perceptual_weight: 1.0
disc_weight: 0.5
# Two discriminator start settings are present (disc_epoch_start, disc_start).
# NOTE(review): which one the trainer honours cannot be told from this file.
disc_epoch_start: 56
disc_start: 0
disc_type: dinodisc
# Discriminator and generator losses are set to the same value.
disc_loss: hinge
gen_loss: hinge
# Run, optimizer, logging, and evaluation-output settings.
# NOTE(review): key meanings are inferred from their names; confirm against the
# training script that consumes this file.
compile: false
dropout_p: 0.0
results_dir: results_tokenizer_image
dataset: imagenet
image_size: 256
epochs: 200
# Written with a decimal point on purpose: YAML 1.1 loaders (e.g. PyYAML)
# resolve the dot-less form `3e-5` as a string instead of a float.
lr: 3.0e-5
disc_lr: 0.0001
# NOTE(review): 0.0 presumably disables gradient clipping — confirm.
max_grad_norm: 0.0
lr_scheduler: cosine
weight_decay: 0.0
disc_weight_decay: 0.0005
beta1: 0.9
beta2: 0.95
# NOTE(review): with world_size 32 elsewhere in this file this would be 32
# samples per rank if split evenly — confirm how the trainer divides it.
global_batch_size: 1024
global_seed: 0
num_workers: 16
# NOTE(review): the *_every values are presumably step counts — confirm units.
log_every: 100
vis_every: 5000
ckpt_every: 10000
gradient_accumulation_steps: 1
mixed_precision: bf16
save_best: true
# Shares the /mnt/localssd/ImageNet2012 root with data_path (train vs. val).
val_data_path: /mnt/localssd/ImageNet2012/val
sample_folder_dir: samples
reconstruction_folder_dir: reconstruction
# Model architecture, quantizer variants, and auxiliary-loss settings.
# NOTE(review): semantics are inferred from key names only; confirm against the
# model and trainer code, which are not visible here.
# Single-element sequence.
v_patch_nums:
- 16
enc_type: dinov2
dec_type: dinov2
semantic_guide: dinov2
num_latent_tokens: 256
# Encoder and decoder are configured with the same model identifier.
encoder_model: vit_base_patch14_dinov2.lvd142m
decoder_model: vit_base_patch14_dinov2.lvd142m
disc_adaptive_weight: true
abs_pos_embed: true
product_quant: 2
share_quant_resi: 4
codebook_drop: 0.1
half_sem: true
start_drop: 3
lecam_loss_weight: 0.001
sem_loss_weight: 0.1
# Encoder and decoder tuning methods are set to the same value.
enc_tuning_method: full
dec_tuning_method: full
clip_norm: false
sem_loss_scale: 1.0
# Path to a config file.
# NOTE(review): presumably the file this run was launched with — confirm.
config: configs/tokenizer.yaml
norm_type: bn
aug_prob: 1.0
aug_fade_steps: 0
disc_reinit: 0
debug_disc: false
# Distributed-execution settings.
# NOTE(review): rank, gpu, and world_size look like per-process launch values
# rather than hand-tuned options — confirm whether they are overwritten at
# start-up before editing them here.
rank: 0
world_size: 32
gpu: 0
dist_url: env://
distributed: true
dist_backend: nccl