[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[10-29 12:06:17] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[10-29 12:06:17] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 17
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 4096
  codebook_embed_dim  : 32
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_large_patch14_dinov2.lvd142m
  decoder_model       : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_msg          : fix bug
  commit_id           : d9be612da9c1a0f8350fd7614e16337787b4640e
  branch              : main
}

[10-29 12:06:17] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[10-29 12:06:20] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f134cae31c0>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:06:20] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f134cae31c0>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:06:20] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:06:20] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[10-29 12:06:20] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[10-29 12:06:17] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[10-29 12:06:17] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 17
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 4096
  codebook_embed_dim  : 32
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_large_patch14_dinov2.lvd142m
  decoder_model       : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_msg          : fix bug
  branch              : main
  commit_id           : d9be612da9c1a0f8350fd7614e16337787b4640e
}

[10-29 12:06:17] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[10-29 12:06:20] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fe404aa3370>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:06:20] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fe404aa3370>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:06:20] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:06:20] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[10-29 12:06:20] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[10-29 12:06:17] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[10-29 12:06:17] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 17
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 4096
  codebook_embed_dim  : 32
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_large_patch14_dinov2.lvd142m
  decoder_model       : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_msg          : fix bug
  branch              : main
  commit_id           : d9be612da9c1a0f8350fd7614e16337787b4640e
}

[10-29 12:06:17] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[10-29 12:06:20] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f3ef86a7370>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:06:20] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f3ef86a7370>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:06:20] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:06:20] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[10-29 12:06:20] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:06:15] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[10-29 12:06:17] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[10-29 12:06:17] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 17
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 4096
  codebook_embed_dim  : 32
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_large_patch14_dinov2.lvd142m
  decoder_model       : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  branch              : main
  commit_id           : d9be612da9c1a0f8350fd7614e16337787b4640e
  commit_msg          : fix bug
}

[10-29 12:06:17] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[10-29 12:06:20] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1279867, len(val_set)=50000, num_classes=1000
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7ff54c41f370>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:06:20] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7ff54c41f370>
[10-29 12:06:20] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:06:20] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:06:20] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[10-29 12:06:20] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...     [dataloader multi processing](*) finished! (47.39s)
     [dataloader multi processing](*) finished! (47.94s)
     [dataloader multi processing](*) finished! (51.25s)
[10-29 12:07:07] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
     [dataloader multi processing](*) finished! (51.98s)
[10-29 12:07:08] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:07:25] (e/user/VAR/models/var.py, line 103)=> 
[constructor]  ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== 
    [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
        0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))

[10-29 12:07:25] (e/user/VAR/models/var.py, line 103)=> 
[constructor]  ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== 
    [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
        0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))

[10-29 12:07:11] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1667, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:07:29] (e/user/VAR/models/var.py, line 103)=> 
[constructor]  ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== 
    [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
        0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))

[10-29 12:07:12] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:07:30] (e/user/VAR/models/var.py, line 103)=> 
[constructor]  ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== 
    [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
        0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))

[10-29 12:07:25] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 12:08:29] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)


[10-29 12:08:29] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 12:08:29] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26


[10-29 12:08:29] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[10-29 12:08:29] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank0] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 12:07:25] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 12:08:29] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)


[10-29 12:08:29] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 12:08:29] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26


[10-29 12:08:29] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[10-29 12:08:29] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank8] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 12:07:29] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 12:08:29] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)


[10-29 12:08:29] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 12:08:29] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26


[10-29 12:08:29] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[10-29 12:08:29] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank16] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 12:07:30] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 12:08:29] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)


[10-29 12:08:29] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 12:08:29] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26


[10-29 12:08:29] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[10-29 12:08:29] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank24] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 12:08:30] (/VAR/utils/lr_control.py, line 105)=> 
[10-29 12:08:30] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[10-29 12:19:36] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 12 days, 20:52:31  tlr: 1.2e-06  tnm: 0.06  Lm: 8.318 (8.318)  Lt: 8.318 (8.318)  Accm: 0.04 (0.04)  Acct: 0.02 (0.02)  time: 666.2379  data: 0.0006
[10-29 12:08:30] (/VAR/utils/lr_control.py, line 105)=> 
[10-29 12:08:30] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[10-29 12:19:36] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 12 days, 20:53:48  tlr: 1.2e-06  tnm: 0.06  Lm: 8.318 (8.318)  Lt: 8.318 (8.318)  Accm: 0.01 (0.01)  Acct: 0.02 (0.02)  time: 666.2845  data: 0.0005
[10-29 12:08:30] (/VAR/utils/lr_control.py, line 105)=> 
[10-29 12:08:30] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[10-29 12:19:36] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1667]  eta: 12 days, 20:31:44  tlr: 1.2e-06  tnm: 0.06  Lm: 8.318 (8.318)  Lt: 8.318 (8.318)  Accm: 0.04 (0.04)  Acct: 0.05 (0.05)  time: 666.2895  data: 0.0004
[10-29 12:08:30] (/VAR/utils/lr_control.py, line 105)=> 
[10-29 12:08:30] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[10-29 12:19:36] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 12 days, 20:31:03  tlr: 1.2e-06  tnm: 0.06  Lm: 8.318 (8.318)  Lt: 8.318 (8.318)  Accm: 0.04 (0.04)  Acct: 0.03 (0.03)  time: 665.4663  data: 0.0006
[10-29 12:23:32] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 416/1667]  eta: 0:45:08  tlr: 9.7e-06  tnm: 0.06  Lm: 8.271 (8.271)  Lt: 8.260 (8.260)  Accm: 0.08 (0.08)  Acct: 0.09 (0.09)  time: 0.3480  data: 0.0002
[10-29 12:23:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 0:45:02  tlr: 9.7e-06  tnm: 0.06  Lm: 8.272 (8.272)  Lt: 8.267 (8.267)  Accm: 0.08 (0.08)  Acct: 0.08 (0.08)  time: 0.3481  data: 0.0002
[10-29 12:23:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 0:45:05  tlr: 9.7e-06  tnm: 0.06  Lm: 8.273 (8.273)  Lt: 8.267 (8.267)  Accm: 0.09 (0.09)  Acct: 0.13 (0.13)  time: 0.3481  data: 0.0002
[10-29 12:23:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 0:45:05  tlr: 9.7e-06  tnm: 0.06  Lm: 8.273 (8.273)  Lt: 8.262 (8.262)  Accm: 0.07 (0.07)  Acct: 0.05 (0.05)  time: 0.3481  data: 0.0002
[10-29 12:25:58] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 833/1667]  eta: 0:17:28  tlr: 1.8e-05  tnm: 0.08  Lm: 8.224 (8.207)  Lt: 8.203 (8.200)  Accm: 0.12 (0.14)  Acct: 0.12 (0.15)  time: 0.3484  data: 0.0002
[10-29 12:25:58] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:17:28  tlr: 1.8e-05  tnm: 0.08  Lm: 8.228 (8.214)  Lt: 8.206 (8.199)  Accm: 0.11 (0.12)  Acct: 0.09 (0.11)  time: 0.3485  data: 0.0002
[10-29 12:25:58] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:17:28  tlr: 1.8e-05  tnm: 0.08  Lm: 8.228 (8.214)  Lt: 8.216 (8.207)  Accm: 0.17 (0.14)  Acct: 0.24 (0.21)  time: 0.3485  data: 0.0002
[10-29 12:25:58] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:17:27  tlr: 1.8e-05  tnm: 0.08  Lm: 8.227 (8.213)  Lt: 8.216 (8.203)  Accm: 0.13 (0.14)  Acct: 0.12 (0.15)  time: 0.3485  data: 0.0002
[10-29 12:28:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1249/1667]  eta: 0:06:38  tlr: 2.7e-05  tnm: 0.14  Lm: 8.152 (8.117)  Lt: 8.142 (8.115)  Accm: 0.19 (0.29)  Acct: 0.20 (0.34)  time: 0.3490  data: 0.0002
[10-29 12:28:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:06:38  tlr: 2.7e-05  tnm: 0.12  Lm: 8.162 (8.131)  Lt: 8.140 (8.127)  Accm: 0.16 (0.25)  Acct: 0.16 (0.23)  time: 0.3491  data: 0.0002
[10-29 12:28:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:06:38  tlr: 2.7e-05  tnm: 0.12  Lm: 8.162 (8.130)  Lt: 8.151 (8.130)  Accm: 0.21 (0.25)  Acct: 0.30 (0.29)  time: 0.3491  data: 0.0002
[10-29 12:28:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:06:38  tlr: 2.7e-05  tnm: 0.12  Lm: 8.161 (8.129)  Lt: 8.145 (8.128)  Accm: 0.20 (0.27)  Acct: 0.22 (0.27)  time: 0.3491  data: 0.0005
[10-29 12:30:47] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1666/1667]  eta: 0:00:00  tlr: 3.5e-05  tnm: 0.35  Lm: 8.081 (8.025)  Lt: 8.081 (8.007)  Accm: 0.26 (0.39)  Acct: 0.28 (0.45)  time: 0.3499  data: 0.0014
[10-29 12:30:47] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [   0/350]   Total time:      0:22:17   (0.803 s / it)


=======================================================   RESTART [10-29 12:58:25]   =======================================================


=======================================================   RESTART [10-29 12:58:25]   =======================================================


=======================================================   RESTART [10-29 12:58:25]   =======================================================


=======================================================   RESTART [10-29 12:58:25]   =======================================================
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[10-29 12:58:27] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[10-29 12:58:27] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 17
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 4096
  codebook_embed_dim  : 32
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_large_patch14_dinov2.lvd142m
  decoder_model       : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  branch              : main
  commit_msg          : fix bug
  commit_id           : d9be612da9c1a0f8350fd7614e16337787b4640e
}

[10-29 12:58:27] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[10-29 12:58:30] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f3ce89a7370>
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:58:30] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f3ce89a7370>
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:58:30] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:58:30] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[10-29 12:58:30] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[10-29 12:58:27] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[10-29 12:58:27] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 17
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 4096
  codebook_embed_dim  : 32
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_large_patch14_dinov2.lvd142m
  decoder_model       : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_msg          : fix bug
  branch              : main
  commit_id           : d9be612da9c1a0f8350fd7614e16337787b4640e
}

[10-29 12:58:27] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[10-29 12:58:30] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fb2782f7370>
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:58:30] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fb2782f7370>
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:58:30] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:58:30] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[10-29 12:58:30] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[10-29 12:58:27] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[10-29 12:58:27] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 17
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 4096
  codebook_embed_dim  : 32
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_large_patch14_dinov2.lvd142m
  decoder_model       : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_msg          : fix bug
  branch              : main
  commit_id           : d9be612da9c1a0f8350fd7614e16337787b4640e
}

[10-29 12:58:27] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[10-29 12:58:30] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fd434417370>
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:58:30] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fd434417370>
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:58:30] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:58:30] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[10-29 12:58:30] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 215)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 216)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[10-29 12:58:25] (er/VAR/utils/arg_util.py, line 217)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[10-29 12:58:27] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[10-29 12:58:27] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 17
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=17 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_large_patch14_dinov2.lvd142m --decoder_model vit_large_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 32 --codebook_size 4096 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp113_d17/ --vae_ckpt /sensei-fs/users/xiangl/output/exp113/best_ckpt.pt --half_sem True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp113_d17/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp113_d17/tb-VARd17__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp113_d17/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp113_d17/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 4096
  codebook_embed_dim  : 32
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_large_patch14_dinov2.lvd142m
  decoder_model       : vit_large_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_id           : d9be612da9c1a0f8350fd7614e16337787b4640e
  branch              : main
  commit_msg          : fix bug
}

[10-29 12:58:27] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[10-29 12:58:30] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1279867, len(val_set)=50000, num_classes=1000
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fef347d3370>
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:58:30] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fef347d3370>
[10-29 12:58:30] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[10-29 12:58:30] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp113_d17/ar-ckpt*.pth
[10-29 12:58:30] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[10-29 12:58:30] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...     [dataloader multi processing](*) finished! (46.34s)
     [dataloader multi processing](*) finished! (47.47s)
     [dataloader multi processing](*) finished! (47.76s)
     [dataloader multi processing](*) finished! (51.68s)
[10-29 12:59:16] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:59:29] (e/user/VAR/models/var.py, line 103)=> 
[constructor]  ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== 
    [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
        0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))

[10-29 12:59:18] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:59:31] (e/user/VAR/models/var.py, line 103)=> 
[constructor]  ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== 
    [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
        0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))

[10-29 12:59:17] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1667, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:59:33] (e/user/VAR/models/var.py, line 103)=> 
[constructor]  ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== 
    [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
        0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))

[10-29 12:59:22] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[10-29 12:59:35] (e/user/VAR/models/var.py, line 103)=> 
[constructor]  ==== flash_if_available=True (0/17), fused_if_available=True (fusing_add_ln=0/17, fusing_mlp=0/17) ==== 
    [VAR config ] embed_dim=1088, num_heads=17, depth=17, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0708333 (tensor([0.0000, 0.0044, 0.0089, 0.0133, 0.0177, 0.0221, 0.0266, 0.0310, 0.0354,
        0.0398, 0.0443, 0.0487, 0.0531, 0.0576, 0.0620, 0.0664, 0.0708]))

[10-29 12:59:31] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 13:00:35] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)


[10-29 13:00:35] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 13:00:35] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26


[10-29 13:00:35] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[10-29 13:00:35] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank0] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 12:59:35] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 13:00:35] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)


[10-29 13:00:35] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 13:00:35] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26


[10-29 13:00:35] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[10-29 13:00:35] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank8] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 12:59:33] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 13:00:35] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)


[10-29 13:00:35] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 13:00:35] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26


[10-29 13:00:35] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[10-29 13:00:35] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank16] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 12:59:30] (e/user/VAR/models/var.py, line 301)=> [init_weights] VAR with init_std=0.0175035
[10-29 13:00:35] (/home/user/VAR/train.py , line 123)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0708333
    (word_embed): Linear(in_features=64, out_features=1088, bias=True)
    (class_emb): Embedding(1001, 1088)
    (lvl_embed): Embedding(10, 1088)
    (shared_ada_lin): Identity()
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
      (1-16): 16 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1088, out_features=3264, bias=False)
          (proj): Linear(in_features=1088, out_features=1088, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1088, out_features=4352, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=4352, out_features=1088, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1088, out_features=6528, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1088,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1088, out_features=2176, bias=True)
      )
    )
    (head): Linear(in_features=1088, out_features=8192, bias=True)
  )
)


[10-29 13:00:35] (/home/user/VAR/train.py , line 125)=> [INIT][#para] VAE=910.93, VAE.enc=303.66, VAE.dec=303.42, VAE.quant=0.34
[10-29 13:00:35] (/home/user/VAR/train.py , line 126)=> [INIT][#para] VAR=375.26


[10-29 13:00:35] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, _orig_mod.blocks.0.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, _orig_mod.blocks.1.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, _orig_mod.blocks.2.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, _orig_mod.blocks.3.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, _orig_mod.blocks.4.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, _orig_mod.blocks.5.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, _orig_mod.blocks.6.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, _orig_mod.blocks.7.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, _orig_mod.blocks.8.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, _orig_mod.blocks.9.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, _orig_mod.blocks.10.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, _orig_mod.blocks.11.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, _orig_mod.blocks.12.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, _orig_mod.blocks.13.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, _orig_mod.blocks.14.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, _orig_mod.blocks.15.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, _orig_mod.blocks.16.ffn.fc1.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.word_embed.bias, _orig_mod.lvl_embed.weight, _orig_mod.blocks.0.attn.scale_mul_1H11, _orig_mod.blocks.0.attn.q_bias, '\n"
                    " '_orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, _orig_mod.blocks.0.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, _orig_mod.blocks.1.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, _orig_mod.blocks.2.attn.v_bias, '\n"
                    " '_orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, _orig_mod.blocks.3.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, _orig_mod.blocks.3.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, _orig_mod.blocks.4.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, _orig_mod.blocks.5.attn.q_bias, '\n"
                    " '_orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, _orig_mod.blocks.5.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, _orig_mod.blocks.6.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, _orig_mod.blocks.7.attn.v_bias, '\n"
                    " '_orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, _orig_mod.blocks.8.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, _orig_mod.blocks.8.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, _orig_mod.blocks.9.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, _orig_mod.blocks.10.attn.q_bias, '\n"
                    " '_orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, _orig_mod.blocks.10.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, _orig_mod.blocks.11.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, _orig_mod.blocks.12.attn.v_bias, '\n"
                    " '_orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, _orig_mod.blocks.13.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, _orig_mod.blocks.13.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, _orig_mod.blocks.14.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, _orig_mod.blocks.15.attn.q_bias, '\n"
                    " '_orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, _orig_mod.blocks.15.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, _orig_mod.blocks.16.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[10-29 13:00:36] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank24] type(model).__name__='OptimizedModule' count=214, numel=375258593
[10-29 13:00:36] (/VAR/utils/lr_control.py, line 105)=> 
[10-29 13:00:36] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[10-29 13:04:15] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 4 days, 5:18:02  tlr: 1.2e-06  tnm: 0.06  Lm: 8.318 (8.318)  Lt: 8.318 (8.318)  Accm: 0.04 (0.04)  Acct: 0.03 (0.03)  time: 218.5037  data: 0.0006
[10-29 13:00:36] (/VAR/utils/lr_control.py, line 105)=> 
[10-29 13:00:36] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[10-29 13:04:15] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 4 days, 5:40:05  tlr: 1.2e-06  tnm: 0.06  Lm: 8.318 (8.318)  Lt: 8.318 (8.318)  Accm: 0.01 (0.01)  Acct: 0.02 (0.02)  time: 219.2963  data: 0.0005
[10-29 13:00:36] (/VAR/utils/lr_control.py, line 105)=> 
[10-29 13:00:36] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[10-29 13:04:15] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 4 days, 5:40:06  tlr: 1.2e-06  tnm: 0.06  Lm: 8.318 (8.318)  Lt: 8.318 (8.318)  Accm: 0.02 (0.02)  Acct: 0.00 (0.00)  time: 219.2969  data: 0.0005
[10-29 13:00:36] (/VAR/utils/lr_control.py, line 105)=> 
[10-29 13:00:36] (/home/user/VAR/train.py , line 141)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[10-29 13:04:15] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1667]  eta: 4 days, 5:34:41  tlr: 1.2e-06  tnm: 0.06  Lm: 8.318 (8.318)  Lt: 8.318 (8.318)  Accm: 0.03 (0.03)  Acct: 0.02 (0.02)  time: 219.3650  data: 0.0006
[10-29 13:08:13] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 416/1667]  eta: 0:22:50  tlr: 9.7e-06  tnm: 0.06  Lm: 8.271 (8.271)  Lt: 8.262 (8.262)  Accm: 0.09 (0.09)  Acct: 0.11 (0.11)  time: 0.3478  data: 0.0002
[10-29 13:08:13] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 0:22:49  tlr: 9.7e-06  tnm: 0.05  Lm: 8.274 (8.274)  Lt: 8.268 (8.268)  Accm: 0.08 (0.08)  Acct: 0.04 (0.04)  time: 0.3479  data: 0.0002
[10-29 13:08:13] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 0:22:49  tlr: 9.7e-06  tnm: 0.05  Lm: 8.268 (8.268)  Lt: 8.258 (8.258)  Accm: 0.11 (0.11)  Acct: 0.13 (0.13)  time: 0.3479  data: 0.0002
[10-29 13:08:13] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 0:22:47  tlr: 9.7e-06  tnm: 0.05  Lm: 8.268 (8.268)  Lt: 8.258 (8.258)  Accm: 0.08 (0.08)  Acct: 0.08 (0.08)  time: 0.3479  data: 0.0002
[10-29 13:10:38] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 833/1667]  eta: 0:10:02  tlr: 1.8e-05  tnm: 0.08  Lm: 8.224 (8.209)  Lt: 8.207 (8.199)  Accm: 0.15 (0.13)  Acct: 0.21 (0.18)  time: 0.3485  data: 0.0002
[10-29 13:10:38] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:10:01  tlr: 1.8e-05  tnm: 0.08  Lm: 8.219 (8.211)  Lt: 8.199 (8.196)  Accm: 0.13 (0.13)  Acct: 0.12 (0.15)  time: 0.3487  data: 0.0002
[10-29 13:10:38] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:10:02  tlr: 1.8e-05  tnm: 0.08  Lm: 8.230 (8.210)  Lt: 8.219 (8.200)  Accm: 0.15 (0.15)  Acct: 0.09 (0.16)  time: 0.3487  data: 0.0002
[10-29 13:10:38] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:10:02  tlr: 1.8e-05  tnm: 0.08  Lm: 8.218 (8.212)  Lt: 8.198 (8.199)  Accm: 0.20 (0.18)  Acct: 0.24 (0.21)  time: 0.3487  data: 0.0002
[10-29 13:13:02] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1249/1667]  eta: 0:04:09  tlr: 2.7e-05  tnm: 0.13  Lm: 8.154 (8.118)  Lt: 8.140 (8.114)  Accm: 0.18 (0.30)  Acct: 0.27 (0.32)  time: 0.3483  data: 0.0002
[10-29 13:13:03] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:04:09  tlr: 2.7e-05  tnm: 0.20  Lm: 8.159 (8.129)  Lt: 8.140 (8.125)  Accm: 0.26 (0.31)  Acct: 0.30 (0.35)  time: 0.3485  data: 0.0002
[10-29 13:13:03] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:04:09  tlr: 2.7e-05  tnm: 0.20  Lm: 8.157 (8.122)  Lt: 8.136 (8.119)  Accm: 0.18 (0.27)  Acct: 0.22 (0.25)  time: 0.3485  data: 0.0002
[10-29 13:13:03] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:04:09  tlr: 2.7e-05  tnm: 0.20  Lm: 8.156 (8.125)  Lt: 8.140 (8.123)  Accm: 0.22 (0.26)  Acct: 0.24 (0.25)  time: 0.3485  data: 0.0002
[10-29 13:15:28] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1666/1667]  eta: 0:00:00  tlr: 3.5e-05  tnm: 0.35  Lm: 8.085 (8.022)  Lt: 8.074 (8.004)  Accm: 0.20 (0.44)  Acct: 0.33 (0.50)  time: 0.3518  data: 0.0016
[10-29 13:15:28] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [   0/350]   Total time:      0:14:51   (0.535 s / it)