[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:00:33] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[11-27 00:00:33] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 20
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp141-var-d20-query/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 16384
  codebook_embed_dim  : 14
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  soft_entropy        : True
  scale               : 1.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  clip_norm           : True
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_base_patch14_dinov2.lvd142m
  decoder_model       : vit_base_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  st_ep               : 50
  ed_ep               : 150
  p_rand              : 0.0
  encoder_depth       : -1
  projector_dim       : 2048
  z_dims              : [768]
  proj_coef           : 0.2
  query               : True
  finetune_ckpt       : None
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_id           : 1bdaa86a4d2f2991825257e07f9bdb9a0a3560cb
  commit_msg          : add
  branch              : main
}

[11-27 00:00:33] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[11-27 00:00:35] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f3ab82132e0>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:00:35] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f3ab82132e0>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:00:35] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:00:35] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[11-27 00:00:35] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:00:33] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[11-27 00:00:33] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 20
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp141-var-d20-query/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 16384
  codebook_embed_dim  : 14
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  soft_entropy        : True
  scale               : 1.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  clip_norm           : True
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_base_patch14_dinov2.lvd142m
  decoder_model       : vit_base_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  st_ep               : 50
  ed_ep               : 150
  p_rand              : 0.0
  encoder_depth       : -1
  projector_dim       : 2048
  z_dims              : [768]
  proj_coef           : 0.2
  query               : True
  finetune_ckpt       : None
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_id           : 1bdaa86a4d2f2991825257e07f9bdb9a0a3560cb
  branch              : main
  commit_msg          : add
}

[11-27 00:00:33] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[11-27 00:00:35] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f6c996b7370>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:00:35] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f6c996b7370>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:00:35] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:00:35] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[11-27 00:00:35] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:00:33] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[11-27 00:00:33] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 20
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp141-var-d20-query/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 16384
  codebook_embed_dim  : 14
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  soft_entropy        : True
  scale               : 1.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  clip_norm           : True
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_base_patch14_dinov2.lvd142m
  decoder_model       : vit_base_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  st_ep               : 50
  ed_ep               : 150
  p_rand              : 0.0
  encoder_depth       : -1
  projector_dim       : 2048
  z_dims              : [768]
  proj_coef           : 0.2
  query               : True
  finetune_ckpt       : None
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  branch              : main
  commit_id           : 1bdaa86a4d2f2991825257e07f9bdb9a0a3560cb
  commit_msg          : add
}

[11-27 00:00:33] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[11-27 00:00:35] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f9710e6f370>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:00:35] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f9710e6f370>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:00:35] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:00:35] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[11-27 00:00:35] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:00:33] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[11-27 00:00:33] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 20
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp141-var-d20-query/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 16384
  codebook_embed_dim  : 14
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  soft_entropy        : True
  scale               : 1.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  clip_norm           : True
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_base_patch14_dinov2.lvd142m
  decoder_model       : vit_base_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  st_ep               : 50
  ed_ep               : 150
  p_rand              : 0.0
  encoder_depth       : -1
  projector_dim       : 2048
  z_dims              : [768]
  proj_coef           : 0.2
  query               : True
  finetune_ckpt       : None
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  branch              : main
  commit_msg          : add
  commit_id           : 1bdaa86a4d2f2991825257e07f9bdb9a0a3560cb
}

[11-27 00:00:33] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[11-27 00:00:38] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f3a9ee532e0>
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:00:38] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7f3a9ee532e0>
[11-27 00:00:38] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:00:38] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:00:38] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[11-27 00:00:38] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...     [dataloader multi processing](*) finished! (47.89s)
     [dataloader multi processing](*) finished! (48.21s)
     [dataloader multi processing](*) finished! (50.87s)
     [dataloader multi processing](*) finished! (48.74s)
[11-27 00:01:24] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:01:29] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:01:29] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:01:42] (e/user/VAR/models/var.py, line 117)=> 
[constructor]  ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ==== 
    [VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
        0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
        0.0789, 0.0833]))

[11-27 00:01:23] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:01:28] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:01:28] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:01:42] (e/user/VAR/models/var.py, line 117)=> 
[constructor]  ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ==== 
    [VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
        0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
        0.0789, 0.0833]))

[11-27 00:01:26] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:01:31] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:01:31] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:01:42] (e/user/VAR/models/var.py, line 117)=> 
[constructor]  ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ==== 
    [VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
        0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
        0.0789, 0.0833]))

[11-27 00:01:27] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:01:30] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:01:30] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:01:42] (e/user/VAR/models/var.py, line 117)=> 
[constructor]  ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ==== 
    [VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
        0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
        0.0789, 0.0833]))

[11-27 00:01:43] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:02:03] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0833333
    (word_embed): Linear(in_features=28, out_features=1280, bias=False)
    (class_emb): Embedding(1001, 1280)
    (lvl_embed): Embedding(10, 1280)
    (shared_ada_lin): Identity()
    (query_block): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): DropPath((drop_prob=...))
      (attn): CrossAttention(
        (mat_q): Linear(in_features=1280, out_features=1280, bias=False)
        (mat_kv): Linear(in_features=32, out_features=2560, bias=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=7680, bias=True)
      )
    )
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
      (1-19): 19 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=2560, bias=True)
      )
    )
    (head): Linear(in_features=1280, out_features=32768, bias=True)
  )
)


[11-27 00:02:03] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:02:03] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45


[11-27 00:02:03] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
                   " '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
                    " '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
                    " '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
                    " '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
                    " '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
                    " '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
                    " '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
                    " '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
                    " '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
                    " '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[11-27 00:02:03] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank0] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:01:43] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:02:03] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0833333
    (word_embed): Linear(in_features=28, out_features=1280, bias=False)
    (class_emb): Embedding(1001, 1280)
    (lvl_embed): Embedding(10, 1280)
    (shared_ada_lin): Identity()
    (query_block): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): DropPath((drop_prob=...))
      (attn): CrossAttention(
        (mat_q): Linear(in_features=1280, out_features=1280, bias=False)
        (mat_kv): Linear(in_features=32, out_features=2560, bias=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=7680, bias=True)
      )
    )
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
      (1-19): 19 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=2560, bias=True)
      )
    )
    (head): Linear(in_features=1280, out_features=32768, bias=True)
  )
)


[11-27 00:02:03] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:02:03] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45


[11-27 00:02:03] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
                   " '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
                    " '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
                    " '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
                    " '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
                    " '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
                    " '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
                    " '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
                    " '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
                    " '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
                    " '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[11-27 00:02:03] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank8] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:01:42] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:02:03] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0833333
    (word_embed): Linear(in_features=28, out_features=1280, bias=False)
    (class_emb): Embedding(1001, 1280)
    (lvl_embed): Embedding(10, 1280)
    (shared_ada_lin): Identity()
    (query_block): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): DropPath((drop_prob=...))
      (attn): CrossAttention(
        (mat_q): Linear(in_features=1280, out_features=1280, bias=False)
        (mat_kv): Linear(in_features=32, out_features=2560, bias=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=7680, bias=True)
      )
    )
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
      (1-19): 19 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=2560, bias=True)
      )
    )
    (head): Linear(in_features=1280, out_features=32768, bias=True)
  )
)


[11-27 00:02:03] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:02:03] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45


[11-27 00:02:03] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
                   " '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
                    " '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
                    " '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
                    " '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
                    " '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
                    " '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
                    " '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
                    " '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
                    " '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
                    " '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[11-27 00:02:03] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank16] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:01:42] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:02:03] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0833333
    (word_embed): Linear(in_features=28, out_features=1280, bias=False)
    (class_emb): Embedding(1001, 1280)
    (lvl_embed): Embedding(10, 1280)
    (shared_ada_lin): Identity()
    (query_block): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): DropPath((drop_prob=...))
      (attn): CrossAttention(
        (mat_q): Linear(in_features=1280, out_features=1280, bias=False)
        (mat_kv): Linear(in_features=32, out_features=2560, bias=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=7680, bias=True)
      )
    )
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
      (1-19): 19 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=2560, bias=True)
      )
    )
    (head): Linear(in_features=1280, out_features=32768, bias=True)
  )
)


[11-27 00:02:03] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:02:03] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45


[11-27 00:02:03] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
                   " '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
                    " '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
                    " '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
                    " '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
                    " '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
                    " '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
                    " '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
                    " '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
                    " '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
                    " '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[11-27 00:02:03] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank24] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 105)=> 
[11-27 00:02:03] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[11-27 00:02:03] (/VAR/utils/lr_control.py, line 105)=> 
[11-27 00:02:03] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[11-27 00:02:03] (/VAR/utils/lr_control.py, line 105)=> 
[11-27 00:02:03] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[11-27 00:02:03] (/VAR/utils/lr_control.py, line 105)=> 
[11-27 00:02:03] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}


=======================================================   RESTART [11-27 00:04:11]   =======================================================


=======================================================   RESTART [11-27 00:04:11]   =======================================================


=======================================================   RESTART [11-27 00:04:11]   =======================================================


=======================================================   RESTART [11-27 00:04:11]   =======================================================
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:04:12] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[11-27 00:04:12] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 20
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp141-var-d20-query/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 16384
  codebook_embed_dim  : 14
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  soft_entropy        : True
  scale               : 1.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  clip_norm           : True
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_base_patch14_dinov2.lvd142m
  decoder_model       : vit_base_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  st_ep               : 50
  ed_ep               : 150
  p_rand              : 0.0
  encoder_depth       : -1
  projector_dim       : 2048
  z_dims              : [768]
  proj_coef           : 0.2
  query               : True
  finetune_ckpt       : None
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  branch              : main
  commit_id           : 2b2768034d9141e66c71ad27587e9da1bea39dba
  commit_msg          : add
}

[11-27 00:04:12] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[11-27 00:04:15] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fe05c1fb370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:04:15] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fe05c1fb370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:04:15] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:04:15] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[11-27 00:04:15] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:04:12] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[11-27 00:04:12] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 20
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp141-var-d20-query/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 16384
  codebook_embed_dim  : 14
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  soft_entropy        : True
  scale               : 1.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  clip_norm           : True
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_base_patch14_dinov2.lvd142m
  decoder_model       : vit_base_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  st_ep               : 50
  ed_ep               : 150
  p_rand              : 0.0
  encoder_depth       : -1
  projector_dim       : 2048
  z_dims              : [768]
  proj_coef           : 0.2
  query               : True
  finetune_ckpt       : None
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_id           : 2b2768034d9141e66c71ad27587e9da1bea39dba
  branch              : main
  commit_msg          : add
}

[11-27 00:04:12] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[11-27 00:04:15] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fe40799f370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:04:15] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fe40799f370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:04:15] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:04:15] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[11-27 00:04:15] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:04:12] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[11-27 00:04:12] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 20
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp141-var-d20-query/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 16384
  codebook_embed_dim  : 14
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  soft_entropy        : True
  scale               : 1.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  clip_norm           : True
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_base_patch14_dinov2.lvd142m
  decoder_model       : vit_base_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  st_ep               : 50
  ed_ep               : 150
  p_rand              : 0.0
  encoder_depth       : -1
  projector_dim       : 2048
  z_dims              : [768]
  proj_coef           : 0.2
  query               : True
  finetune_ckpt       : None
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_msg          : add
  commit_id           : 2b2768034d9141e66c71ad27587e9da1bea39dba
  branch              : main
}

[11-27 00:04:12] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[11-27 00:04:15] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fcd75bef370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:04:15] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fcd75bef370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:04:15] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:04:15] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[11-27 00:04:15] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:04:12] (/home/user/VAR/train.py , line  37)=> global bs=768, local bs=24
[11-27 00:04:12] (/home/user/VAR/train.py , line  38)=> initial args:
{
  data_path           : /mnt/localssd/ImageNet2012/
  exp_name            : text
  vae_ckpt            : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
  vfast               : 2
  tfast               : 2
  depth               : 20
  ini                 : -1
  hd                  : 0.02
  aln                 : 0.5
  alng                : 0.0001
  fp16                : 1
  tblr                : 8e-05
  tlr                 : 0.00024000000000000003
  twd                 : 0.05
  twde                : 0.05
  tclip               : 2.0
  ls                  : 0.0
  bs                  : 768
  batch_size          : 24
  glb_batch_size      : 768
  ac                  : 1
  ep                  : 350
  wp                  : 7.0
  wp0                 : 0.005
  wpe                 : 0.01
  sche                : lin0
  opt                 : adamw
  afuse               : True
  saln                : False
  anorm               : True
  fuse                : True
  pn                  : 1_1_2_3_3_4_5_6_8_11
  patch_size          : 11
  patch_nums          : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
  resos               : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
  data_load_reso      : 256
  mid_reso            : 1.125
  hflip               : False
  workers             : 12
  pg                  : 0.0
  pg0                 : 4
  pgwp                : 1.1666666666666667
  cmd                 : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
  acc_mean            : None
  acc_tail            : None
  L_mean              : None
  L_tail              : None
  vacc_mean           : None
  vacc_tail           : None
  vL_mean             : None
  vL_tail             : None
  grad_norm           : None
  cur_lr              : None
  cur_wd              : None
  cur_it              : 
  cur_ep              : 
  remain_time         : 
  finish_time         : 
  local_out_dir_path  : /sensei-fs/users/xiangl/exp141-var-d20-query/
  tb_log_dir_path     : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
  log_txt_path        : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
  last_ckpt_path      : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
  tf32                : True
  seed                : None
  codebook_size       : 16384
  codebook_embed_dim  : 14
  codebook_l2_norm    : True
  codebook_show_usage : True
  commit_loss_beta    : 0.25
  entropy_loss_ratio  : 0.0
  soft_entropy        : True
  scale               : 1.0
  test_model          : True
  encoder_ch_mult     : [1, 1, 2, 2, 4]
  decoder_ch_mult     : [1, 1, 2, 2, 4]
  z_channels          : 256
  dropout_p           : 0.0
  clip_norm           : True
  v_patch_nums        : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
  enc_type            : dinov2
  dec_type            : dinov2
  semantic_guide      : dinov2
  num_latent_tokens   : 121
  encoder_model       : vit_base_patch14_dinov2.lvd142m
  decoder_model       : vit_base_patch14_dinov2.lvd142m
  abs_pos_embed       : True
  share_quant_resi    : 4
  product_quant       : 2
  half_sem            : True
  p_drop              : 0.0
  joint_sample        : False
  infer_ckpt          : 
  masking_method      : uniform
  st_ep               : 50
  ed_ep               : 150
  p_rand              : 0.0
  encoder_depth       : -1
  projector_dim       : 2048
  z_dims              : [768]
  proj_coef           : 0.2
  query               : True
  finetune_ckpt       : None
  same_seed_for_all_ranks: 0
  local_debug         : False
  dbg_nan             : False
  cfg                 : [3.5, 3.5]
  top_k               : 900
  top_p               : 0.95
  commit_id           : 2b2768034d9141e66c71ad27587e9da1bea39dba
  commit_msg          : add
  branch              : main
}

[11-27 00:04:12] (/home/user/VAR/train.py , line  42)=> [build PT data] ...

[11-27 00:04:15] (e/user/VAR/utils/data.py, line  34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  48)=> Transform [train] = 
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fd744023370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:04:15] (e/user/VAR/utils/data.py, line  48)=> Transform [val] = 
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> CenterCrop(size=(256, 256))
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  51)=> <function normalize_01_into_pm1 at 0x7fd744023370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line  54)=> ---------------------------

[11-27 00:04:15] (/home/user/VAR/train.py , line  65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:04:15] (/home/user/VAR/train.py , line  65)=> [auto_resume quit]
[11-27 00:04:15] (/home/user/VAR/train.py , line  66)=> [dataloader multi processing] ...     [dataloader multi processing](*) finished! (48.40s)
     [dataloader multi processing](*) finished! (50.34s)
     [dataloader multi processing](*) finished! (51.16s)
     [dataloader multi processing](*) finished! (52.57s)
[11-27 00:05:04] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:05:09] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:05:09] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:05:12] (e/user/VAR/models/var.py, line 117)=> 
[constructor]  ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ==== 
    [VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
        0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
        0.0789, 0.0833]))

[11-27 00:05:06] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:05:10] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:05:10] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:05:13] (e/user/VAR/models/var.py, line 117)=> 
[constructor]  ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ==== 
    [VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
        0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
        0.0789, 0.0833]))

[11-27 00:05:06] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:05:10] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:05:10] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:05:14] (e/user/VAR/models/var.py, line 117)=> 
[constructor]  ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ==== 
    [VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
        0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
        0.0789, 0.0833]))

[11-27 00:05:08] (/home/user/VAR/train.py , line  72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:05:13] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:05:13] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
        0.2673])
[11-27 00:05:17] (e/user/VAR/models/var.py, line 117)=> 
[constructor]  ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ==== 
    [VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
    [drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
        0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
        0.0789, 0.0833]))

[11-27 00:05:14] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:05:22] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0833333
    (word_embed): Linear(in_features=28, out_features=1280, bias=False)
    (class_emb): Embedding(1001, 1280)
    (lvl_embed): Embedding(10, 1280)
    (shared_ada_lin): Identity()
    (query_block): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): DropPath((drop_prob=...))
      (attn): CrossAttention(
        (mat_q): Linear(in_features=1280, out_features=1280, bias=False)
        (mat_kv): Linear(in_features=32, out_features=2560, bias=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=7680, bias=True)
      )
    )
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
      (1-19): 19 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=2560, bias=True)
      )
    )
    (head): Linear(in_features=1280, out_features=32768, bias=True)
  )
)


[11-27 00:05:22] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:05:22] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45


[11-27 00:05:22] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
                   " '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
                    " '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
                    " '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
                    " '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
                    " '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
                    " '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
                    " '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
                    " '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
                    " '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
                    " '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[11-27 00:05:22] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank0] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:05:13] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:05:22] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0833333
    (word_embed): Linear(in_features=28, out_features=1280, bias=False)
    (class_emb): Embedding(1001, 1280)
    (lvl_embed): Embedding(10, 1280)
    (shared_ada_lin): Identity()
    (query_block): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): DropPath((drop_prob=...))
      (attn): CrossAttention(
        (mat_q): Linear(in_features=1280, out_features=1280, bias=False)
        (mat_kv): Linear(in_features=32, out_features=2560, bias=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=7680, bias=True)
      )
    )
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
      (1-19): 19 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=2560, bias=True)
      )
    )
    (head): Linear(in_features=1280, out_features=32768, bias=True)
  )
)


[11-27 00:05:22] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:05:22] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45


[11-27 00:05:22] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
                   " '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
                    " '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
                    " '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
                    " '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
                    " '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
                    " '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
                    " '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
                    " '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
                    " '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
                    " '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[11-27 00:05:22] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank8] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:05:15] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:05:22] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0833333
    (word_embed): Linear(in_features=28, out_features=1280, bias=False)
    (class_emb): Embedding(1001, 1280)
    (lvl_embed): Embedding(10, 1280)
    (shared_ada_lin): Identity()
    (query_block): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): DropPath((drop_prob=...))
      (attn): CrossAttention(
        (mat_q): Linear(in_features=1280, out_features=1280, bias=False)
        (mat_kv): Linear(in_features=32, out_features=2560, bias=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=7680, bias=True)
      )
    )
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
      (1-19): 19 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=2560, bias=True)
      )
    )
    (head): Linear(in_features=1280, out_features=32768, bias=True)
  )
)


[11-27 00:05:22] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:05:22] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45


[11-27 00:05:22] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
                   " '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
                    " '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
                    " '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
                    " '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
                    " '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
                    " '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
                    " '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
                    " '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
                    " '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
                    " '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[11-27 00:05:22] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank16] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:05:17] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:05:22] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
  (_orig_mod): VAR(
    drop_path_rate=0.0833333
    (word_embed): Linear(in_features=28, out_features=1280, bias=False)
    (class_emb): Embedding(1001, 1280)
    (lvl_embed): Embedding(10, 1280)
    (shared_ada_lin): Identity()
    (query_block): AdaLNSelfAttn(
      shared_aln=False
      (drop_path): DropPath((drop_prob=...))
      (attn): CrossAttention(
        (mat_q): Linear(in_features=1280, out_features=1280, bias=False)
        (mat_kv): Linear(in_features=32, out_features=2560, bias=False)
        (proj): Linear(in_features=1280, out_features=1280, bias=True)
        (proj_drop): Identity()
      )
      (ffn): FFN(
        fused_mlp_func=False
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (act): GELU(approximate='tanh')
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (drop): Identity()
      )
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=7680, bias=True)
      )
    )
    (blocks): ModuleList(
      (0): AdaLNSelfAttn(
        shared_aln=False
        (drop_path): Identity()
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
      (1-19): 19 x AdaLNSelfAttn(
        shared_aln=False
        (drop_path): DropPath((drop_prob=...))
        (attn): SelfAttention(
          (mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
          (proj_drop): Identity()
        )
        (ffn): FFN(
          fused_mlp_func=False
          (fc1): Linear(in_features=1280, out_features=5120, bias=True)
          (act): GELU(approximate='tanh')
          (fc2): Linear(in_features=5120, out_features=1280, bias=True)
          (drop): Identity()
        )
        (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
        (ada_lin): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1280, out_features=7680, bias=True)
        )
      )
    )
    (head_nm): AdaLNBeforeHead(
      (ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
      (ada_lin): Sequential(
        (0): SiLU()
        (1): Linear(in_features=1280, out_features=2560, bias=True)
      )
    )
    (head): Linear(in_features=1280, out_features=32768, bias=True)
  )
)


[11-27 00:05:22] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:05:22] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45


[11-27 00:05:22] (/VAR/utils/lr_control.py, line  99)=> [get_param_groups] param_groups = 
{ 'D': { 'lr_sc': 1.0,
         'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
                   " '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
                   " '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
         'wd_sc': 1.0},
  'ND': { 'lr_sc': 1.0,
          'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
                    " '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
                    " '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
                    " '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
                    " '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
                    " '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
                    " '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
                    " '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
                    " '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
                    " '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
                    " '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
                    " '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
                    " '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
                    " '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
                    " '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
          'wd_sc': 0.0}}

[11-27 00:05:23] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank24] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 105)=> 
[11-27 00:05:23] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[11-27 00:20:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 17 days, 9:23:49  tlr: 1.2e-06  tnm: 0.05  Lm: 9.704 (9.704)  Lt: 9.704 (9.704)  Accm: 0.01 (0.01)  Acct: 0.00 (0.00)  proj_loss: 0.0000 (0.0000)  time: 900.3170  data: 0.0005
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 105)=> 
[11-27 00:05:23] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[11-27 00:20:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 17 days, 9:23:54  tlr: 1.2e-06  tnm: 0.05  Lm: 9.704 (9.704)  Lt: 9.704 (9.704)  Accm: 0.01 (0.01)  Acct: 0.00 (0.00)  proj_loss: 0.0000 (0.0000)  time: 900.3200  data: 0.0005
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 105)=> 
[11-27 00:05:23] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[11-27 00:20:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 17 days, 9:25:52  tlr: 1.2e-06  tnm: 0.05  Lm: 9.704 (9.704)  Lt: 9.704 (9.704)  Accm: 0.01 (0.01)  Acct: 0.00 (0.00)  proj_loss: 0.0000 (0.0000)  time: 900.3908  data: 0.0005
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 105)=> 
[11-27 00:05:23] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}

[11-27 00:20:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [   0/1669]  eta: 17 days, 8:49:23  tlr: 1.2e-06  tnm: 0.05  Lm: 9.704 (9.704)  Lt: 9.704 (9.704)  Accm: 0.02 (0.02)  Acct: 0.00 (0.00)  proj_loss: 0.0000 (0.0000)  time: 899.0792  data: 0.0006
[11-27 00:27:56] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 1:07:29  tlr: 9.7e-06  tnm: 0.05  Lm: 9.700 (9.700)  Lt: 9.700 (9.700)  Accm: 0.01 (0.01)  Acct: 0.00 (0.00)  proj_loss: 0.0000 (0.0000)  time: 0.4525  data: 0.0002
[11-27 00:27:56] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 1:07:33  tlr: 9.7e-06  tnm: 0.05  Lm: 9.700 (9.700)  Lt: 9.700 (9.700)  Accm: 0.01 (0.01)  Acct: 0.01 (0.01)  proj_loss: 0.0000 (0.0000)  time: 0.4525  data: 0.0002
[11-27 00:27:56] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 1:07:33  tlr: 9.7e-06  tnm: 0.05  Lm: 9.700 (9.700)  Lt: 9.700 (9.700)  Accm: 0.01 (0.01)  Acct: 0.02 (0.02)  proj_loss: 0.0000 (0.0000)  time: 0.4525  data: 0.0002
[11-27 00:27:56] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 417/1669]  eta: 1:07:33  tlr: 9.7e-06  tnm: 0.05  Lm: 9.701 (9.701)  Lt: 9.700 (9.700)  Accm: 0.01 (0.01)  Acct: 0.00 (0.00)  proj_loss: 0.0000 (0.0000)  time: 0.4525  data: 0.0002
[11-27 00:31:05] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:25:42  tlr: 1.8e-05  tnm: 0.08  Lm: 9.698 (9.686)  Lt: 9.696 (9.686)  Accm: 0.01 (0.01)  Acct: 0.00 (0.01)  proj_loss: 0.0000 (0.0000)  time: 0.4528  data: 0.0002
[11-27 00:31:05] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:25:40  tlr: 1.8e-05  tnm: 0.08  Lm: 9.696 (9.686)  Lt: 9.696 (9.687)  Accm: 0.02 (0.02)  Acct: 0.00 (0.01)  proj_loss: 0.0000 (0.0000)  time: 0.4528  data: 0.0002
[11-27 00:31:05] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:25:42  tlr: 1.8e-05  tnm: 0.08  Lm: 9.696 (9.684)  Lt: 9.696 (9.686)  Accm: 0.01 (0.02)  Acct: 0.03 (0.03)  proj_loss: 0.0000 (0.0000)  time: 0.4528  data: 0.0002
[11-27 00:31:05] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [ 834/1669]  eta: 0:25:42  tlr: 1.8e-05  tnm: 0.08  Lm: 9.696 (9.686)  Lt: 9.695 (9.687)  Accm: 0.02 (0.02)  Acct: 0.02 (0.02)  proj_loss: 0.0000 (0.0000)  time: 0.4528  data: 0.0002
[11-27 00:34:14] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:09:38  tlr: 2.7e-05  tnm: 0.22  Lm: 9.677 (9.661)  Lt: 9.678 (9.668)  Accm: 0.02 (0.02)  Acct: 0.03 (0.02)  proj_loss: 0.0000 (0.0000)  time: 0.4537  data: 0.0002
[11-27 00:34:14] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:09:38  tlr: 2.7e-05  tnm: 0.22  Lm: 9.676 (9.661)  Lt: 9.678 (9.668)  Accm: 0.02 (0.02)  Acct: 0.01 (0.01)  proj_loss: 0.0000 (0.0000)  time: 0.4537  data: 0.0002
[11-27 00:34:14] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:09:37  tlr: 2.7e-05  tnm: 0.22  Lm: 9.677 (9.661)  Lt: 9.678 (9.668)  Accm: 0.01 (0.02)  Acct: 0.00 (0.01)  proj_loss: 0.0000 (0.0000)  time: 0.4537  data: 0.0002
[11-27 00:34:14] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1251/1669]  eta: 0:09:38  tlr: 2.7e-05  tnm: 0.22  Lm: 9.674 (9.658)  Lt: 9.677 (9.666)  Accm: 0.01 (0.02)  Acct: 0.02 (0.02)  proj_loss: 0.0000 (0.0000)  time: 0.4537  data: 0.0002
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1668/1669]  eta: 0:00:01  tlr: 3.5e-05  tnm: 0.51  Lm: 9.651 (9.618)  Lt: 9.658 (9.633)  Accm: 0.01 (0.02)  Acct: 0.03 (0.02)  proj_loss: 0.0000 (0.0000)  time: 0.4558  data: 0.0014
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [   0/350]   Total time:      0:32:00   (1.151 s / it)
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1668/1669]  eta: 0:00:01  tlr: 3.5e-05  tnm: 0.51  Lm: 9.658 (9.620)  Lt: 9.661 (9.635)  Accm: 0.02 (0.03)  Acct: 0.03 (0.04)  proj_loss: 0.0000 (0.0000)  time: 0.4558  data: 0.0015
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1668/1669]  eta: 0:00:01  tlr: 3.5e-05  tnm: 0.51  Lm: 9.657 (9.618)  Lt: 9.659 (9.633)  Accm: 0.02 (0.02)  Acct: 0.00 (0.01)  proj_loss: 0.0000 (0.0000)  time: 0.4558  data: 0.0016
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   0/350]  [1668/1669]  eta: 0:00:01  tlr: 3.5e-05  tnm: 0.51  Lm: 9.655 (9.621)  Lt: 9.659 (9.634)  Accm: 0.03 (0.03)  Acct: 0.00 (0.01)  proj_loss: 0.0000 (0.0000)  time: 0.4558  data: 0.0018
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [   0/350]   Total time:      0:32:00   (1.151 s / it)
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [   0/350]   Total time:      0:31:59   (1.150 s / it)
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [   0/350]   Total time:      0:32:00   (1.151 s / it)
[11-27 00:37:23] (/home/user/VAR/train.py , line 279)=>      [ep0]  (training )  Lm: 9.617 (9.617), Lt: 9.633 (9.633),  Acc m&t: 0.02 0.02,  Remain: 3 days, 2:04:01,  Finish: 2024-11-29 10:41
[11-27 00:37:23] (/home/user/VAR/train.py , line 279)=>      [ep0]  (training )  Lm: 9.617 (9.617), Lt: 9.633 (9.633),  Acc m&t: 0.02 0.02,  Remain: 3 days, 2:03:10,  Finish: 2024-11-29 10:40
[11-27 00:37:23] (/home/user/VAR/train.py , line 279)=>      [ep0]  (training )  Lm: 9.617 (9.617), Lt: 9.633 (9.633),  Acc m&t: 0.02 0.02,  Remain: 3 days, 2:02:53,  Finish: 2024-11-29 10:40
[11-27 00:37:23] (/home/user/VAR/train.py , line 279)=>      [ep0]  (training )  Lm: 9.617 (9.617), Lt: 9.633 (9.633),  Acc m&t: 0.02 0.02,  Remain: 3 days, 2:03:37,  Finish: 2024-11-29 10:41
[11-27 00:37:24] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [   0/1669]  eta: 0:12:39  tlr: 3.5e-05  tnm: 0.49  Lm: 9.445 (9.445)  Lt: 9.486 (9.486)  Accm: 0.04 (0.04)  Acct: 0.03 (0.03)  proj_loss: 0.0000 (0.0000)  time: 0.4551  data: 0.0003
[11-27 00:37:24] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [   0/1669]  eta: 0:12:39  tlr: 3.5e-05  tnm: 0.49  Lm: 9.446 (9.446)  Lt: 9.497 (9.497)  Accm: 0.04 (0.04)  Acct: 0.03 (0.03)  proj_loss: 0.0000 (0.0000)  time: 0.4553  data: 0.0004
[11-27 00:37:24] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [   0/1669]  eta: 0:12:14  tlr: 3.5e-05  tnm: 0.49  Lm: 9.451 (9.451)  Lt: 9.502 (9.502)  Accm: 0.04 (0.04)  Acct: 0.00 (0.00)  proj_loss: 0.0000 (0.0000)  time: 0.4402  data: 0.0003
[11-27 00:37:24] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [   0/1669]  eta: 0:12:39  tlr: 3.5e-05  tnm: 0.49  Lm: 9.441 (9.441)  Lt: 9.487 (9.487)  Accm: 0.04 (0.04)  Acct: 0.07 (0.07)  proj_loss: 0.0000 (0.0000)  time: 0.4552  data: 0.0003
[11-27 00:40:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [ 417/1669]  eta: 0:09:28  tlr: 4.4e-05  tnm: 0.92  Lm: 9.230 (9.230)  Lt: 9.236 (9.236)  Accm: 0.13 (0.13)  Acct: 0.14 (0.14)  proj_loss: 0.0000 (0.0000)  time: 0.4531  data: 0.0002
[11-27 00:40:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [ 417/1669]  eta: 0:09:28  tlr: 4.4e-05  tnm: 0.92  Lm: 9.215 (9.215)  Lt: 9.233 (9.233)  Accm: 0.12 (0.12)  Acct: 0.10 (0.10)  proj_loss: 0.0000 (0.0000)  time: 0.4531  data: 0.0003
[11-27 00:40:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [ 417/1669]  eta: 0:09:28  tlr: 4.4e-05  tnm: 0.92  Lm: 9.218 (9.218)  Lt: 9.216 (9.216)  Accm: 0.13 (0.13)  Acct: 0.13 (0.13)  proj_loss: 0.0000 (0.0000)  time: 0.4531  data: 0.0002
[11-27 00:40:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [ 417/1669]  eta: 0:09:28  tlr: 4.4e-05  tnm: 0.92  Lm: 9.228 (9.228)  Lt: 9.246 (9.246)  Accm: 0.13 (0.13)  Acct: 0.09 (0.09)  proj_loss: 0.0000 (0.0000)  time: 0.4531  data: 0.0002
[11-27 00:43:42] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [ 834/1669]  eta: 0:06:18  tlr: 5.2e-05  tnm: 1.56  Lm: 9.005 (8.972)  Lt: 8.991 (8.917)  Accm: 0.21 (0.24)  Acct: 0.19 (0.22)  proj_loss: 0.0000 (0.0000)  time: 0.4545  data: 0.0002
[11-27 00:43:42] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [ 834/1669]  eta: 0:06:18  tlr: 5.2e-05  tnm: 1.56  Lm: 8.991 (8.972)  Lt: 8.947 (8.909)  Accm: 0.23 (0.26)  Acct: 0.22 (0.36)  proj_loss: 0.0000 (0.0000)  time: 0.4545  data: 0.0002
[11-27 00:43:42] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [ 834/1669]  eta: 0:06:18  tlr: 5.2e-05  tnm: 1.56  Lm: 9.020 (8.981)  Lt: 8.985 (8.926)  Accm: 0.23 (0.22)  Acct: 0.21 (0.22)  proj_loss: 0.0000 (0.0000)  time: 0.4545  data: 0.0003
[11-27 00:43:42] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [ 834/1669]  eta: 0:06:18  tlr: 5.2e-05  tnm: 1.56  Lm: 8.985 (8.970)  Lt: 8.968 (8.918)  Accm: 0.21 (0.23)  Acct: 0.17 (0.26)  proj_loss: 0.0000 (0.0000)  time: 0.4545  data: 0.0002
[11-27 00:46:52] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [1251/1669]  eta: 0:03:09  tlr: 6.1e-05  tnm: 1.51  Lm: 8.735 (8.805)  Lt: 8.621 (8.690)  Accm: 0.38 (0.34)  Acct: 0.49 (0.46)  proj_loss: 0.0000 (0.0000)  time: 0.4532  data: 0.0002
[11-27 00:46:52] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [1251/1669]  eta: 0:03:09  tlr: 6.1e-05  tnm: 1.51  Lm: 8.732 (8.798)  Lt: 8.625 (8.690)  Accm: 0.34 (0.32)  Acct: 0.34 (0.34)  proj_loss: 0.0000 (0.0000)  time: 0.4532  data: 0.0002
[11-27 00:46:52] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [1251/1669]  eta: 0:03:09  tlr: 6.1e-05  tnm: 1.51  Lm: 8.751 (8.817)  Lt: 8.645 (8.717)  Accm: 0.32 (0.30)  Acct: 0.30 (0.36)  proj_loss: 0.0000 (0.0000)  time: 0.4532  data: 0.0002
[11-27 00:46:52] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [   1/350]  [1251/1669]  eta: 0:03:09  tlr: 6.1e-05  tnm: 1.51  Lm: 8.732 (8.799)  Lt: 8.628 (8.695)  Accm: 0.33 (0.28)  Acct: 0.34 (0.32)  proj_loss: 0.0000 (0.0000)  time: 0.4532  data: 0.0002