[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
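A minimal sketch, assuming standard PyTorch APIs, of the settings behind the three [tf32] lines above:

    import torch

    # Reproduces the three [tf32] settings reported in the log.
    torch.set_float32_matmul_precision('high')     # get_float32_matmul_precision() -> 'high'
    torch.backends.cudnn.allow_tf32 = True         # cuDNN convolutions may use TF32
    torch.backends.cuda.matmul.allow_tf32 = True   # CUDA matmuls may use TF32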
[11-27 00:00:33] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[11-27 00:00:33] (/home/user/VAR/train.py , line 38)=> initial args:
{
data_path : /mnt/localssd/ImageNet2012/
exp_name : text
vae_ckpt : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
vfast : 2
tfast : 2
depth : 20
ini : -1
hd : 0.02
aln : 0.5
alng : 0.0001
fp16 : 1
tblr : 8e-05
tlr : 0.00024000000000000003
twd : 0.05
twde : 0.05
tclip : 2.0
ls : 0.0
bs : 768
batch_size : 24
glb_batch_size : 768
ac : 1
ep : 350
wp : 7.0
wp0 : 0.005
wpe : 0.01
sche : lin0
opt : adamw
afuse : True
saln : False
anorm : True
fuse : True
pn : 1_1_2_3_3_4_5_6_8_11
patch_size : 11
patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
data_load_reso : 256
mid_reso : 1.125
hflip : False
workers : 12
pg : 0.0
pg0 : 4
pgwp : 1.1666666666666667
cmd : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
acc_mean : None
acc_tail : None
L_mean : None
L_tail : None
vacc_mean : None
vacc_tail : None
vL_mean : None
vL_tail : None
grad_norm : None
cur_lr : None
cur_wd : None
cur_it :
cur_ep :
remain_time :
finish_time :
local_out_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/
tb_log_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
log_txt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
last_ckpt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
tf32 : True
seed : None
codebook_size : 16384
codebook_embed_dim : 14
codebook_l2_norm : True
codebook_show_usage : True
commit_loss_beta : 0.25
entropy_loss_ratio : 0.0
soft_entropy : True
scale : 1.0
test_model : True
encoder_ch_mult : [1, 1, 2, 2, 4]
decoder_ch_mult : [1, 1, 2, 2, 4]
z_channels : 256
dropout_p : 0.0
clip_norm : True
v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
enc_type : dinov2
dec_type : dinov2
semantic_guide : dinov2
num_latent_tokens : 121
encoder_model : vit_base_patch14_dinov2.lvd142m
decoder_model : vit_base_patch14_dinov2.lvd142m
abs_pos_embed : True
share_quant_resi : 4
product_quant : 2
half_sem : True
p_drop : 0.0
joint_sample : False
infer_ckpt :
masking_method : uniform
st_ep : 50
ed_ep : 150
p_rand : 0.0
encoder_depth : -1
projector_dim : 2048
z_dims : [768]
proj_coef : 0.2
query : True
finetune_ckpt : None
same_seed_for_all_ranks: 0
local_debug : False
dbg_nan : False
cfg : [3.5, 3.5]
top_k : 900
top_p : 0.95
commit_id : 1bdaa86a4d2f2991825257e07f9bdb9a0a3560cb
commit_msg : add
branch : main
}
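The derived tlr above is consistent with scaling the base tblr by glb_batch_size/256; a quick sanity check (the scaling rule itself is an assumption, not read from arg_util.py):

    # Hypothetical reconstruction of the tlr value in the args dump.
    tblr, glb_batch_size = 8e-05, 768
    tlr = tblr * glb_batch_size / 256   # = 2.4e-4; the trailing ...0003 in the log is float rounding
    print(tlr)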
[11-27 00:00:33] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f3ab82132e0>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256))
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f3ab82132e0>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
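A minimal torchvision sketch of the two pipelines printed above; normalize_01_into_pm1 is reconstructed from its name (an assumption), and the Resize size 288 matches mid_reso * data_load_reso = 1.125 * 256:

    import torchvision.transforms as T

    def normalize_01_into_pm1(x):
        # Assumed from the function name: rescale a [0, 1] tensor to [-1, 1].
        return x * 2 - 1

    train_tf = T.Compose([
        T.Resize(288, interpolation=T.InterpolationMode.LANCZOS),
        T.RandomCrop(256),
        T.ToTensor(),
        normalize_01_into_pm1,
    ])
    val_tf = T.Compose([
        T.Resize(288, interpolation=T.InterpolationMode.LANCZOS),
        T.CenterCrop(256),
        T.ToTensor(),
        normalize_01_into_pm1,
    ])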
[11-27 00:00:35] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:00:35] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[11-27 00:00:35] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ...
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:00:33] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[11-27 00:00:33] (/home/user/VAR/train.py , line 38)=> initial args:
{
data_path : /mnt/localssd/ImageNet2012/
exp_name : text
vae_ckpt : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
vfast : 2
tfast : 2
depth : 20
ini : -1
hd : 0.02
aln : 0.5
alng : 0.0001
fp16 : 1
tblr : 8e-05
tlr : 0.00024000000000000003
twd : 0.05
twde : 0.05
tclip : 2.0
ls : 0.0
bs : 768
batch_size : 24
glb_batch_size : 768
ac : 1
ep : 350
wp : 7.0
wp0 : 0.005
wpe : 0.01
sche : lin0
opt : adamw
afuse : True
saln : False
anorm : True
fuse : True
pn : 1_1_2_3_3_4_5_6_8_11
patch_size : 11
patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
data_load_reso : 256
mid_reso : 1.125
hflip : False
workers : 12
pg : 0.0
pg0 : 4
pgwp : 1.1666666666666667
cmd : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
acc_mean : None
acc_tail : None
L_mean : None
L_tail : None
vacc_mean : None
vacc_tail : None
vL_mean : None
vL_tail : None
grad_norm : None
cur_lr : None
cur_wd : None
cur_it :
cur_ep :
remain_time :
finish_time :
local_out_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/
tb_log_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
log_txt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
last_ckpt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
tf32 : True
seed : None
codebook_size : 16384
codebook_embed_dim : 14
codebook_l2_norm : True
codebook_show_usage : True
commit_loss_beta : 0.25
entropy_loss_ratio : 0.0
soft_entropy : True
scale : 1.0
test_model : True
encoder_ch_mult : [1, 1, 2, 2, 4]
decoder_ch_mult : [1, 1, 2, 2, 4]
z_channels : 256
dropout_p : 0.0
clip_norm : True
v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
enc_type : dinov2
dec_type : dinov2
semantic_guide : dinov2
num_latent_tokens : 121
encoder_model : vit_base_patch14_dinov2.lvd142m
decoder_model : vit_base_patch14_dinov2.lvd142m
abs_pos_embed : True
share_quant_resi : 4
product_quant : 2
half_sem : True
p_drop : 0.0
joint_sample : False
infer_ckpt :
masking_method : uniform
st_ep : 50
ed_ep : 150
p_rand : 0.0
encoder_depth : -1
projector_dim : 2048
z_dims : [768]
proj_coef : 0.2
query : True
finetune_ckpt : None
same_seed_for_all_ranks: 0
local_debug : False
dbg_nan : False
cfg : [3.5, 3.5]
top_k : 900
top_p : 0.95
commit_id : 1bdaa86a4d2f2991825257e07f9bdb9a0a3560cb
branch : main
commit_msg : add
}
[11-27 00:00:33] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f6c996b7370>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256))
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f6c996b7370>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:00:35] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:00:35] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[11-27 00:00:35] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ...
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:00:33] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[11-27 00:00:33] (/home/user/VAR/train.py , line 38)=> initial args:
{
data_path : /mnt/localssd/ImageNet2012/
exp_name : text
vae_ckpt : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
vfast : 2
tfast : 2
depth : 20
ini : -1
hd : 0.02
aln : 0.5
alng : 0.0001
fp16 : 1
tblr : 8e-05
tlr : 0.00024000000000000003
twd : 0.05
twde : 0.05
tclip : 2.0
ls : 0.0
bs : 768
batch_size : 24
glb_batch_size : 768
ac : 1
ep : 350
wp : 7.0
wp0 : 0.005
wpe : 0.01
sche : lin0
opt : adamw
afuse : True
saln : False
anorm : True
fuse : True
pn : 1_1_2_3_3_4_5_6_8_11
patch_size : 11
patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
data_load_reso : 256
mid_reso : 1.125
hflip : False
workers : 12
pg : 0.0
pg0 : 4
pgwp : 1.1666666666666667
cmd : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
acc_mean : None
acc_tail : None
L_mean : None
L_tail : None
vacc_mean : None
vacc_tail : None
vL_mean : None
vL_tail : None
grad_norm : None
cur_lr : None
cur_wd : None
cur_it :
cur_ep :
remain_time :
finish_time :
local_out_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/
tb_log_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
log_txt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
last_ckpt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
tf32 : True
seed : None
codebook_size : 16384
codebook_embed_dim : 14
codebook_l2_norm : True
codebook_show_usage : True
commit_loss_beta : 0.25
entropy_loss_ratio : 0.0
soft_entropy : True
scale : 1.0
test_model : True
encoder_ch_mult : [1, 1, 2, 2, 4]
decoder_ch_mult : [1, 1, 2, 2, 4]
z_channels : 256
dropout_p : 0.0
clip_norm : True
v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
enc_type : dinov2
dec_type : dinov2
semantic_guide : dinov2
num_latent_tokens : 121
encoder_model : vit_base_patch14_dinov2.lvd142m
decoder_model : vit_base_patch14_dinov2.lvd142m
abs_pos_embed : True
share_quant_resi : 4
product_quant : 2
half_sem : True
p_drop : 0.0
joint_sample : False
infer_ckpt :
masking_method : uniform
st_ep : 50
ed_ep : 150
p_rand : 0.0
encoder_depth : -1
projector_dim : 2048
z_dims : [768]
proj_coef : 0.2
query : True
finetune_ckpt : None
same_seed_for_all_ranks: 0
local_debug : False
dbg_nan : False
cfg : [3.5, 3.5]
top_k : 900
top_p : 0.95
branch : main
commit_id : 1bdaa86a4d2f2991825257e07f9bdb9a0a3560cb
commit_msg : add
}
[11-27 00:00:33] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f9710e6f370>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256))
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f9710e6f370>
[11-27 00:00:35] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:00:35] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:00:35] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[11-27 00:00:35] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ...
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:00:31] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:00:33] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[11-27 00:00:33] (/home/user/VAR/train.py , line 38)=> initial args:
{
data_path : /mnt/localssd/ImageNet2012/
exp_name : text
vae_ckpt : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
vfast : 2
tfast : 2
depth : 20
ini : -1
hd : 0.02
aln : 0.5
alng : 0.0001
fp16 : 1
tblr : 8e-05
tlr : 0.00024000000000000003
twd : 0.05
twde : 0.05
tclip : 2.0
ls : 0.0
bs : 768
batch_size : 24
glb_batch_size : 768
ac : 1
ep : 350
wp : 7.0
wp0 : 0.005
wpe : 0.01
sche : lin0
opt : adamw
afuse : True
saln : False
anorm : True
fuse : True
pn : 1_1_2_3_3_4_5_6_8_11
patch_size : 11
patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
data_load_reso : 256
mid_reso : 1.125
hflip : False
workers : 12
pg : 0.0
pg0 : 4
pgwp : 1.1666666666666667
cmd : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
acc_mean : None
acc_tail : None
L_mean : None
L_tail : None
vacc_mean : None
vacc_tail : None
vL_mean : None
vL_tail : None
grad_norm : None
cur_lr : None
cur_wd : None
cur_it :
cur_ep :
remain_time :
finish_time :
local_out_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/
tb_log_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
log_txt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
last_ckpt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
tf32 : True
seed : None
codebook_size : 16384
codebook_embed_dim : 14
codebook_l2_norm : True
codebook_show_usage : True
commit_loss_beta : 0.25
entropy_loss_ratio : 0.0
soft_entropy : True
scale : 1.0
test_model : True
encoder_ch_mult : [1, 1, 2, 2, 4]
decoder_ch_mult : [1, 1, 2, 2, 4]
z_channels : 256
dropout_p : 0.0
clip_norm : True
v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
enc_type : dinov2
dec_type : dinov2
semantic_guide : dinov2
num_latent_tokens : 121
encoder_model : vit_base_patch14_dinov2.lvd142m
decoder_model : vit_base_patch14_dinov2.lvd142m
abs_pos_embed : True
share_quant_resi : 4
product_quant : 2
half_sem : True
p_drop : 0.0
joint_sample : False
infer_ckpt :
masking_method : uniform
st_ep : 50
ed_ep : 150
p_rand : 0.0
encoder_depth : -1
projector_dim : 2048
z_dims : [768]
proj_coef : 0.2
query : True
finetune_ckpt : None
same_seed_for_all_ranks: 0
local_debug : False
dbg_nan : False
cfg : [3.5, 3.5]
top_k : 900
top_p : 0.95
branch : main
commit_msg : add
commit_id : 1bdaa86a4d2f2991825257e07f9bdb9a0a3560cb
}
[11-27 00:00:33] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f3a9ee532e0>
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256))
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7f3a9ee532e0>
[11-27 00:00:38] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:00:38] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:00:38] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[11-27 00:00:38] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ...
[dataloader multi processing](*) finished! (47.89s)
[dataloader multi processing](*) finished! (48.21s)
[dataloader multi processing](*) finished! (50.87s)
[dataloader multi processing](*) finished! (48.74s)
[11-27 00:01:24] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
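The logged iters_train is consistent with ceiling division of len(train_set) by the global batch size (the rounding rule is an assumption):

    # Consistency check for the logged iters_train.
    len_train, gbs = 1281167, 768
    iters_train = (len_train + gbs - 1) // gbs   # ceil(1281167 / 768) = 1669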
[11-27 00:01:29] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:01:29] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:01:42] (e/user/VAR/models/var.py, line 117)=>
[constructor] ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ====
[VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
0.0789, 0.0833]))
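The per-block drop-path ramp printed above can be reproduced with a linear schedule from 0 to the reported maximum rate (a sketch, not the model code):

    import torch

    depth, max_dpr = 20, 0.0833333
    dpr = torch.linspace(0, max_dpr, depth)   # 0.0000, 0.0044, 0.0088, ..., 0.0789, 0.0833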
[11-27 00:01:23] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:01:28] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:01:28] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:01:42] (e/user/VAR/models/var.py, line 117)=>
[constructor] ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ====
[VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
0.0789, 0.0833]))
[11-27 00:01:26] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:01:31] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:01:31] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:01:42] (e/user/VAR/models/var.py, line 117)=>
[constructor] ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ====
[VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
0.0789, 0.0833]))
[11-27 00:01:27] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:01:30] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:01:30] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:01:42] (e/user/VAR/models/var.py, line 117)=>
[constructor] ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ====
[VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
0.0789, 0.0833]))
[11-27 00:01:43] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:02:03] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
(_orig_mod): VAR(
drop_path_rate=0.0833333
(word_embed): Linear(in_features=28, out_features=1280, bias=False)
(class_emb): Embedding(1001, 1280)
(lvl_embed): Embedding(10, 1280)
(shared_ada_lin): Identity()
(query_block): AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): CrossAttention(
(mat_q): Linear(in_features=1280, out_features=1280, bias=False)
(mat_kv): Linear(in_features=32, out_features=2560, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(blocks): ModuleList(
(0): AdaLNSelfAttn(
shared_aln=False
(drop_path): Identity()
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(1-19): 19 x AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
)
(head_nm): AdaLNBeforeHead(
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=2560, bias=True)
)
)
(head): Linear(in_features=1280, out_features=32768, bias=True)
)
)
[11-27 00:02:03] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:02:03] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 99)=> [get_param_groups] param_groups =
{ 'D': { 'lr_sc': 1.0,
'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
" '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
" '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
" '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
" '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
" '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
" '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
" '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
" '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
" '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
" '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
" '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
" '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
" '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
" '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
" '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
" '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
" '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
" '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
" '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
" '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
" '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
'wd_sc': 1.0},
'ND': { 'lr_sc': 1.0,
'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
" '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
" '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
" '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
" '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
" '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
" '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
" '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
" '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
" '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
" '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
" '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
" '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
" '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
'wd_sc': 0.0}}
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank0] type(model).__name__='OptimizedModule' count=262, numel=663449508
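A toy sketch of how two parameter groups with different weight decay, like the 'D' and 'ND' groups above, are handed to AdamW; the actual grouping rule in lr_control.py is not shown in this log:

    import torch
    import torch.nn as nn

    # Toy module for illustration only; numel=663449508 above corresponds to the
    # 663.45 (M params) reported in the [INIT][#para] VAR line.
    toy = nn.Linear(4, 4)
    opt = torch.optim.AdamW(
        [{'params': [toy.weight], 'weight_decay': 0.05},  # like 'D'  (wd_sc=1.0, twd=0.05)
         {'params': [toy.bias],   'weight_decay': 0.0}],  # like 'ND' (wd_sc=0.0)
        lr=0.00024,                                       # tlr from the args dump
    )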
[11-27 00:01:43] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:02:03] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
(_orig_mod): VAR(
drop_path_rate=0.0833333
(word_embed): Linear(in_features=28, out_features=1280, bias=False)
(class_emb): Embedding(1001, 1280)
(lvl_embed): Embedding(10, 1280)
(shared_ada_lin): Identity()
(query_block): AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): CrossAttention(
(mat_q): Linear(in_features=1280, out_features=1280, bias=False)
(mat_kv): Linear(in_features=32, out_features=2560, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(blocks): ModuleList(
(0): AdaLNSelfAttn(
shared_aln=False
(drop_path): Identity()
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(1-19): 19 x AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
)
(head_nm): AdaLNBeforeHead(
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=2560, bias=True)
)
)
(head): Linear(in_features=1280, out_features=32768, bias=True)
)
)
[11-27 00:02:03] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:02:03] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 99)=> [get_param_groups] param_groups =
{ 'D': { 'lr_sc': 1.0,
'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
" '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
" '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
" '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
" '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
" '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
" '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
" '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
" '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
" '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
" '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
" '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
" '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
" '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
" '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
" '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
" '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
" '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
" '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
" '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
" '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
" '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
'wd_sc': 1.0},
'ND': { 'lr_sc': 1.0,
'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
" '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
" '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
" '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
" '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
" '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
" '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
" '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
" '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
" '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
" '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
" '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
" '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
" '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
'wd_sc': 0.0}}
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank8] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:01:42] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:02:03] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
(_orig_mod): VAR(
drop_path_rate=0.0833333
(word_embed): Linear(in_features=28, out_features=1280, bias=False)
(class_emb): Embedding(1001, 1280)
(lvl_embed): Embedding(10, 1280)
(shared_ada_lin): Identity()
(query_block): AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): CrossAttention(
(mat_q): Linear(in_features=1280, out_features=1280, bias=False)
(mat_kv): Linear(in_features=32, out_features=2560, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(blocks): ModuleList(
(0): AdaLNSelfAttn(
shared_aln=False
(drop_path): Identity()
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(1-19): 19 x AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
)
(head_nm): AdaLNBeforeHead(
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=2560, bias=True)
)
)
(head): Linear(in_features=1280, out_features=32768, bias=True)
)
)
[11-27 00:02:03] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:02:03] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 99)=> [get_param_groups] param_groups =
{ 'D': { 'lr_sc': 1.0,
'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
" '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
" '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
" '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
" '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
" '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
" '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
" '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
" '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
" '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
" '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
" '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
" '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
" '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
" '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
" '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
" '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
" '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
" '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
" '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
" '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
" '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
'wd_sc': 1.0},
'ND': { 'lr_sc': 1.0,
'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
" '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
" '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
" '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
" '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
" '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
" '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
" '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
" '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
" '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
" '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
" '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
" '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
" '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
'wd_sc': 0.0}}
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank16] type(model).__name__='OptimizedModule' count=262, numel=663449508
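The two groups above encode the usual decay/no-decay split: matrix weights ('D', wd_sc=1.0) take the full weight decay twd=0.05, while biases, the attention scale_mul_1H11/q_bias/v_bias tensors, and positional/level embeddings ('ND', wd_sc=0.0) are excluded. A minimal sketch of that kind of grouping, assuming a generic nn.Module; the helper name and the exact exclusion rule are illustrative, not the repo's lr_control.py:

    import torch.nn as nn

    def split_decay_groups(model: nn.Module, wd: float = 0.05):
        # 2-D (matrix) weights get weight decay; 1-D params and biases do not.
        decay, no_decay = [], []
        for name, p in model.named_parameters():
            if not p.requires_grad:
                continue
            (decay if p.ndim >= 2 else no_decay).append(p)
        return [
            {'params': decay,    'weight_decay': wd},   # ~ the 'D' group, wd_sc=1.0
            {'params': no_decay, 'weight_decay': 0.0},  # ~ the 'ND' group, wd_sc=0.0
        ]

    groups = split_decay_groups(nn.Linear(4, 4))  # toy usage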
[11-27 00:01:42] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
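The logged init_std=0.0161374 is consistent with sqrt(1/(3*embed_dim)) at embed_dim=1280; this is an inference from the number itself, not a quote of var.py:

    import math

    embed_dim = 1280  # from the [VAR config] line later in the log
    init_std = math.sqrt(1.0 / (3.0 * embed_dim))
    print(f"{init_std:.7f}")  # 0.0161374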
[11-27 00:02:03] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
(_orig_mod): VAR(
drop_path_rate=0.0833333
(word_embed): Linear(in_features=28, out_features=1280, bias=False)
(class_emb): Embedding(1001, 1280)
(lvl_embed): Embedding(10, 1280)
(shared_ada_lin): Identity()
(query_block): AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): CrossAttention(
(mat_q): Linear(in_features=1280, out_features=1280, bias=False)
(mat_kv): Linear(in_features=32, out_features=2560, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(blocks): ModuleList(
(0): AdaLNSelfAttn(
shared_aln=False
(drop_path): Identity()
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(1-19): 19 x AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
)
(head_nm): AdaLNBeforeHead(
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=2560, bias=True)
)
)
(head): Linear(in_features=1280, out_features=32768, bias=True)
)
)
[11-27 00:02:03] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:02:03] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45
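The [#para] figures are trainable-parameter counts in millions: the numel=663449508 reported by get_param_groups reduces to the same 663.45. A trivial check (count_params_millions is a hypothetical helper, not the repo's):

    def count_params_millions(model) -> float:
        return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6

    print(round(663449508 / 1e6, 2))  # 663.45, the VAR figure above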
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 99)=> [get_param_groups] param_groups =
{ 'D': { 'lr_sc': 1.0,
'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
" '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
" '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
" '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
" '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
" '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
" '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
" '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
" '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
" '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
" '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
" '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
" '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
" '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
" '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
" '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
" '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
" '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
" '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
" '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
" '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
" '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
'wd_sc': 1.0},
'ND': { 'lr_sc': 1.0,
'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
" '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
" '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
" '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
" '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
" '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
" '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
" '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
" '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
" '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
" '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
" '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
" '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
" '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
'wd_sc': 0.0}}
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank24] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 105)=>
[11-27 00:02:03] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 105)=>
[11-27 00:02:03] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 105)=>
[11-27 00:02:03] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}
[11-27 00:02:03] (/VAR/utils/lr_control.py, line 105)=>
[11-27 00:02:03] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}
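The optimizer lines show a functools.partial over fused AdamW with betas=(0.9, 0.95); weight decay is carried per param group, so opt_kw only passes the scaled lr (tlr) and weight_decay=0. A hedged sketch of constructing it the same way, with a stand-in module (fused kernels need CUDA params, hence the guard):

    import functools
    import torch
    import torch.nn as nn

    model = nn.Linear(8, 8)  # stand-in; the real model is the compiled VAR
    param_groups = [
        {'params': [model.weight], 'weight_decay': 0.05},  # ~ 'D'
        {'params': [model.bias],   'weight_decay': 0.0},   # ~ 'ND'
    ]
    opt_clz = functools.partial(torch.optim.AdamW, betas=(0.9, 0.95),
                                fused=torch.cuda.is_available())
    optimizer = opt_clz(param_groups, lr=2.4e-4, weight_decay=0)  # lr matches the logged tlr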
======================================================= RESTART [11-27 00:04:11] =======================================================
======================================================= RESTART [11-27 00:04:11] =======================================================
======================================================= RESTART [11-27 00:04:11] =======================================================
======================================================= RESTART [11-27 00:04:11] =======================================================
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:04:12] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[11-27 00:04:12] (/home/user/VAR/train.py , line 38)=> initial args:
{
data_path : /mnt/localssd/ImageNet2012/
exp_name : text
vae_ckpt : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
vfast : 2
tfast : 2
depth : 20
ini : -1
hd : 0.02
aln : 0.5
alng : 0.0001
fp16 : 1
tblr : 8e-05
tlr : 0.00024000000000000003
twd : 0.05
twde : 0.05
tclip : 2.0
ls : 0.0
bs : 768
batch_size : 24
glb_batch_size : 768
ac : 1
ep : 350
wp : 7.0
wp0 : 0.005
wpe : 0.01
sche : lin0
opt : adamw
afuse : True
saln : False
anorm : True
fuse : True
pn : 1_1_2_3_3_4_5_6_8_11
patch_size : 11
patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
data_load_reso : 256
mid_reso : 1.125
hflip : False
workers : 12
pg : 0.0
pg0 : 4
pgwp : 1.1666666666666667
cmd : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
acc_mean : None
acc_tail : None
L_mean : None
L_tail : None
vacc_mean : None
vacc_tail : None
vL_mean : None
vL_tail : None
grad_norm : None
cur_lr : None
cur_wd : None
cur_it :
cur_ep :
remain_time :
finish_time :
local_out_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/
tb_log_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
log_txt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
last_ckpt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
tf32 : True
seed : None
codebook_size : 16384
codebook_embed_dim : 14
codebook_l2_norm : True
codebook_show_usage : True
commit_loss_beta : 0.25
entropy_loss_ratio : 0.0
soft_entropy : True
scale : 1.0
test_model : True
encoder_ch_mult : [1, 1, 2, 2, 4]
decoder_ch_mult : [1, 1, 2, 2, 4]
z_channels : 256
dropout_p : 0.0
clip_norm : True
v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
enc_type : dinov2
dec_type : dinov2
semantic_guide : dinov2
num_latent_tokens : 121
encoder_model : vit_base_patch14_dinov2.lvd142m
decoder_model : vit_base_patch14_dinov2.lvd142m
abs_pos_embed : True
share_quant_resi : 4
product_quant : 2
half_sem : True
p_drop : 0.0
joint_sample : False
infer_ckpt :
masking_method : uniform
st_ep : 50
ed_ep : 150
p_rand : 0.0
encoder_depth : -1
projector_dim : 2048
z_dims : [768]
proj_coef : 0.2
query : True
finetune_ckpt : None
same_seed_for_all_ranks: 0
local_debug : False
dbg_nan : False
cfg : [3.5, 3.5]
top_k : 900
top_p : 0.95
branch : main
commit_id : 2b2768034d9141e66c71ad27587e9da1bea39dba
commit_msg : add
}
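Several derived entries in this dump follow from the base settings: tlr looks like the base lr tblr scaled by glb_batch_size/256, and wp / pgwp look like fixed fractions of ep. These relations are inferred from the numbers (8e-5 * 768/256 = 2.4e-4, 350/50 = 7.0, 350/300 = 1.1667), not read out of arg_util.py:

    tblr, bs, ep = 8e-05, 768, 350
    tlr  = tblr * bs / 256   # 0.00024000000000000003, matches 'tlr'
    wp   = ep / 50           # 7.0, matches 'wp'
    pgwp = ep / 300          # 1.1666666666666667, matches 'pgwp'
    print(tlr, wp, pgwp)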
[11-27 00:04:12] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7fe05c1fb370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256))
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7fe05c1fb370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
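Both pipelines resize the short side to round(mid_reso * data_load_reso) = round(1.125 * 256) = 288 with Lanczos, then crop to 256 (random for train, center for val) and map [0,1] tensors into [-1,1]. A torchvision sketch of the val branch, with normalize_01_into_pm1 re-implemented here as an assumption about its behaviour:

    from torchvision import transforms
    from torchvision.transforms import InterpolationMode

    def normalize_01_into_pm1(x):  # assumed: [0, 1] -> [-1, 1]
        return x.mul(2).sub_(1)

    val_tf = transforms.Compose([
        transforms.Resize(288, interpolation=InterpolationMode.LANCZOS),
        transforms.CenterCrop(256),
        transforms.ToTensor(),
        normalize_01_into_pm1,
    ])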
[11-27 00:04:15] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:04:15] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[11-27 00:04:15] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ...
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:04:12] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[11-27 00:04:12] (/home/user/VAR/train.py , line 38)=> initial args:
{
data_path : /mnt/localssd/ImageNet2012/
exp_name : text
vae_ckpt : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
vfast : 2
tfast : 2
depth : 20
ini : -1
hd : 0.02
aln : 0.5
alng : 0.0001
fp16 : 1
tblr : 8e-05
tlr : 0.00024000000000000003
twd : 0.05
twde : 0.05
tclip : 2.0
ls : 0.0
bs : 768
batch_size : 24
glb_batch_size : 768
ac : 1
ep : 350
wp : 7.0
wp0 : 0.005
wpe : 0.01
sche : lin0
opt : adamw
afuse : True
saln : False
anorm : True
fuse : True
pn : 1_1_2_3_3_4_5_6_8_11
patch_size : 11
patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
data_load_reso : 256
mid_reso : 1.125
hflip : False
workers : 12
pg : 0.0
pg0 : 4
pgwp : 1.1666666666666667
cmd : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
acc_mean : None
acc_tail : None
L_mean : None
L_tail : None
vacc_mean : None
vacc_tail : None
vL_mean : None
vL_tail : None
grad_norm : None
cur_lr : None
cur_wd : None
cur_it :
cur_ep :
remain_time :
finish_time :
local_out_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/
tb_log_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
log_txt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
last_ckpt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
tf32 : True
seed : None
codebook_size : 16384
codebook_embed_dim : 14
codebook_l2_norm : True
codebook_show_usage : True
commit_loss_beta : 0.25
entropy_loss_ratio : 0.0
soft_entropy : True
scale : 1.0
test_model : True
encoder_ch_mult : [1, 1, 2, 2, 4]
decoder_ch_mult : [1, 1, 2, 2, 4]
z_channels : 256
dropout_p : 0.0
clip_norm : True
v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
enc_type : dinov2
dec_type : dinov2
semantic_guide : dinov2
num_latent_tokens : 121
encoder_model : vit_base_patch14_dinov2.lvd142m
decoder_model : vit_base_patch14_dinov2.lvd142m
abs_pos_embed : True
share_quant_resi : 4
product_quant : 2
half_sem : True
p_drop : 0.0
joint_sample : False
infer_ckpt :
masking_method : uniform
st_ep : 50
ed_ep : 150
p_rand : 0.0
encoder_depth : -1
projector_dim : 2048
z_dims : [768]
proj_coef : 0.2
query : True
finetune_ckpt : None
same_seed_for_all_ranks: 0
local_debug : False
dbg_nan : False
cfg : [3.5, 3.5]
top_k : 900
top_p : 0.95
commit_id : 2b2768034d9141e66c71ad27587e9da1bea39dba
branch : main
commit_msg : add
}
[11-27 00:04:12] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7fe40799f370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256))
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7fe40799f370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:04:15] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:04:15] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[11-27 00:04:15] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ...
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:04:12] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[11-27 00:04:12] (/home/user/VAR/train.py , line 38)=> initial args:
{
data_path : /mnt/localssd/ImageNet2012/
exp_name : text
vae_ckpt : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
vfast : 2
tfast : 2
depth : 20
ini : -1
hd : 0.02
aln : 0.5
alng : 0.0001
fp16 : 1
tblr : 8e-05
tlr : 0.00024000000000000003
twd : 0.05
twde : 0.05
tclip : 2.0
ls : 0.0
bs : 768
batch_size : 24
glb_batch_size : 768
ac : 1
ep : 350
wp : 7.0
wp0 : 0.005
wpe : 0.01
sche : lin0
opt : adamw
afuse : True
saln : False
anorm : True
fuse : True
pn : 1_1_2_3_3_4_5_6_8_11
patch_size : 11
patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
data_load_reso : 256
mid_reso : 1.125
hflip : False
workers : 12
pg : 0.0
pg0 : 4
pgwp : 1.1666666666666667
cmd : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
acc_mean : None
acc_tail : None
L_mean : None
L_tail : None
vacc_mean : None
vacc_tail : None
vL_mean : None
vL_tail : None
grad_norm : None
cur_lr : None
cur_wd : None
cur_it :
cur_ep :
remain_time :
finish_time :
local_out_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/
tb_log_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
log_txt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
last_ckpt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
tf32 : True
seed : None
codebook_size : 16384
codebook_embed_dim : 14
codebook_l2_norm : True
codebook_show_usage : True
commit_loss_beta : 0.25
entropy_loss_ratio : 0.0
soft_entropy : True
scale : 1.0
test_model : True
encoder_ch_mult : [1, 1, 2, 2, 4]
decoder_ch_mult : [1, 1, 2, 2, 4]
z_channels : 256
dropout_p : 0.0
clip_norm : True
v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
enc_type : dinov2
dec_type : dinov2
semantic_guide : dinov2
num_latent_tokens : 121
encoder_model : vit_base_patch14_dinov2.lvd142m
decoder_model : vit_base_patch14_dinov2.lvd142m
abs_pos_embed : True
share_quant_resi : 4
product_quant : 2
half_sem : True
p_drop : 0.0
joint_sample : False
infer_ckpt :
masking_method : uniform
st_ep : 50
ed_ep : 150
p_rand : 0.0
encoder_depth : -1
projector_dim : 2048
z_dims : [768]
proj_coef : 0.2
query : True
finetune_ckpt : None
same_seed_for_all_ranks: 0
local_debug : False
dbg_nan : False
cfg : [3.5, 3.5]
top_k : 900
top_p : 0.95
commit_msg : add
commit_id : 2b2768034d9141e66c71ad27587e9da1bea39dba
branch : main
}
[11-27 00:04:12] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7fcd75bef370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256))
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7fcd75bef370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:04:15] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:04:15] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[11-27 00:04:15] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ...
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 228)=> [tf32] [precis] torch.get_float32_matmul_precision(): high
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 229)=> [tf32] [ conv ] torch.backends.cudnn.allow_tf32: True
[11-27 00:04:11] (er/VAR/utils/arg_util.py, line 230)=> [tf32] [matmul] torch.backends.cuda.matmul.allow_tf32: True
[11-27 00:04:12] (/home/user/VAR/train.py , line 37)=> global bs=768, local bs=24
[11-27 00:04:12] (/home/user/VAR/train.py , line 38)=> initial args:
{
data_path : /mnt/localssd/ImageNet2012/
exp_name : text
vae_ckpt : /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt
vfast : 2
tfast : 2
depth : 20
ini : -1
hd : 0.02
aln : 0.5
alng : 0.0001
fp16 : 1
tblr : 8e-05
tlr : 0.00024000000000000003
twd : 0.05
twde : 0.05
tclip : 2.0
ls : 0.0
bs : 768
batch_size : 24
glb_batch_size : 768
ac : 1
ep : 350
wp : 7.0
wp0 : 0.005
wpe : 0.01
sche : lin0
opt : adamw
afuse : True
saln : False
anorm : True
fuse : True
pn : 1_1_2_3_3_4_5_6_8_11
patch_size : 11
patch_nums : (1, 1, 2, 3, 3, 4, 5, 6, 8, 11)
resos : (11, 11, 22, 33, 33, 44, 55, 66, 88, 121)
data_load_reso : 256
mid_reso : 1.125
hflip : False
workers : 12
pg : 0.0
pg0 : 4
pgwp : 1.1666666666666667
cmd : --depth=20 --bs=768 --ep=350 --fp16=1 --alng=1e-4 --wpe=0.01 --tblr=8e-5 --data_path /mnt/localssd/ImageNet2012/ --workers 12 --vfast 2 --tfast 2 --encoder_model vit_base_patch14_dinov2.lvd142m --decoder_model vit_base_patch14_dinov2.lvd142m --product_quant 2 --semantic_guide dinov2 --num_latent_tokens 121 --codebook_embed_dim 14 --codebook_size 16384 --v_patch_nums 1 1 2 3 3 4 5 6 8 11 --pn 1_1_2_3_3_4_5_6_8_11 --patch_size 11 --local_out_dir_path /sensei-fs/users/xiangl/exp141-var-d20-query/ --vae_ckpt /sensei-fs/users/xiangl/output/exp141/best_ckpt.pt --p_drop 0.0 --st_ep 50 --ed_ep 150 --sem_half True --clip_norm True --scale 1.0 --encoder_depth -1 --proj_coef 0.2 --query True
acc_mean : None
acc_tail : None
L_mean : None
L_tail : None
vacc_mean : None
vacc_tail : None
vL_mean : None
vL_tail : None
grad_norm : None
cur_lr : None
cur_wd : None
cur_it :
cur_ep :
remain_time :
finish_time :
local_out_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/
tb_log_dir_path : /sensei-fs/users/xiangl/exp141-var-d20-query/tb-VARd20__pn1_1_2_3_3_4_5_6_8_11__b768ep350adamlr8e-05wd0.05
log_txt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/log.txt
last_ckpt_path : /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt-last.pth
tf32 : True
seed : None
codebook_size : 16384
codebook_embed_dim : 14
codebook_l2_norm : True
codebook_show_usage : True
commit_loss_beta : 0.25
entropy_loss_ratio : 0.0
soft_entropy : True
scale : 1.0
test_model : True
encoder_ch_mult : [1, 1, 2, 2, 4]
decoder_ch_mult : [1, 1, 2, 2, 4]
z_channels : 256
dropout_p : 0.0
clip_norm : True
v_patch_nums : [1, 1, 2, 3, 3, 4, 5, 6, 8, 11]
enc_type : dinov2
dec_type : dinov2
semantic_guide : dinov2
num_latent_tokens : 121
encoder_model : vit_base_patch14_dinov2.lvd142m
decoder_model : vit_base_patch14_dinov2.lvd142m
abs_pos_embed : True
share_quant_resi : 4
product_quant : 2
half_sem : True
p_drop : 0.0
joint_sample : False
infer_ckpt :
masking_method : uniform
st_ep : 50
ed_ep : 150
p_rand : 0.0
encoder_depth : -1
projector_dim : 2048
z_dims : [768]
proj_coef : 0.2
query : True
finetune_ckpt : None
same_seed_for_all_ranks: 0
local_debug : False
dbg_nan : False
cfg : [3.5, 3.5]
top_k : 900
top_p : 0.95
commit_id : 2b2768034d9141e66c71ad27587e9da1bea39dba
commit_msg : add
branch : main
}
[11-27 00:04:12] (/home/user/VAR/train.py , line 42)=> [build PT data] ...
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 34)=> [Dataset] len(train_set)=1281167, len(val_set)=50000, num_classes=1000
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 48)=> Transform [train] =
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> RandomCrop(size=(256, 256), padding=None)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7fd744023370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 48)=> Transform [val] =
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> Resize(size=288, interpolation=lanczos, max_size=None, antialias=True)
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> CenterCrop(size=(256, 256))
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> ToTensor()
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 51)=> <function normalize_01_into_pm1 at 0x7fd744023370>
[11-27 00:04:15] (e/user/VAR/utils/data.py, line 54)=> ---------------------------
[11-27 00:04:15] (/home/user/VAR/train.py , line 65)=> [auto_resume] no ckpt found @ /sensei-fs/users/xiangl/exp141-var-d20-query/ar-ckpt*.pth
[11-27 00:04:15] (/home/user/VAR/train.py , line 65)=> [auto_resume quit]
[11-27 00:04:15] (/home/user/VAR/train.py , line 66)=> [dataloader multi processing] ...
[dataloader multi processing](*) finished! (48.40s)
[dataloader multi processing](*) finished! (50.34s)
[dataloader multi processing](*) finished! (51.16s)
[dataloader multi processing](*) finished! (52.57s)
[11-27 00:05:04] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
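iters_train=1669 is consistent with the 1,281,167 training images divided by the global batch of 768 and rounded up; the exact rounding used by the repo is an assumption here:

    import math
    print(math.ceil(1281167 / 768))  # 1669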
[11-27 00:05:09] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:05:09] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
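The repeated 0.2673 matches 1/sqrt(codebook_embed_dim) for codebook_embed_dim=14, printed once per product-quantizer branch (product_quant=2); this is inferred from the value, not from lookup_free_quantize.py:

    import math
    print(round(1 / math.sqrt(14), 4))  # 0.2673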
[11-27 00:05:12] (e/user/VAR/models/var.py, line 117)=>
[constructor] ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ====
[VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
0.0789, 0.0833]))
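The per-block drop-path probabilities above are a linear ramp from 0 to drop_path_rate across the 20 blocks, and 0.0833333 itself equals 0.1 * depth/24 (the latter scaling is an assumption about the convention used):

    import torch
    depth = 20
    drop_path_rate = 0.1 * depth / 24                # 0.0833333...
    print(torch.linspace(0, drop_path_rate, depth))  # matches the tensor above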
[11-27 00:05:06] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:05:10] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:05:10] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:05:13] (e/user/VAR/models/var.py, line 117)=>
[constructor] ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ====
[VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
0.0789, 0.0833]))
[11-27 00:05:06] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:05:10] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:05:10] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:05:14] (e/user/VAR/models/var.py, line 117)=>
[constructor] ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ====
[VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
0.0789, 0.0833]))
[11-27 00:05:08] (/home/user/VAR/train.py , line 72)=> [dataloader] gbs=768, lbs=24, iters_train=1669, types(tr, va)=('DatasetFolder', 'DatasetFolder')
[11-27 00:05:13] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:05:13] (/lookup_free_quantize.py, line 128)=> scale is tensor([0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673, 0.2673,
0.2673])
[11-27 00:05:17] (e/user/VAR/models/var.py, line 117)=>
[constructor] ==== flash_if_available=True (0/20), fused_if_available=True (fusing_add_ln=0/20, fusing_mlp=0/20) ====
[VAR config ] embed_dim=1280, num_heads=20, depth=20, mlp_ratio=4.0
[drop ratios ] drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.0833333 (tensor([0.0000, 0.0044, 0.0088, 0.0132, 0.0175, 0.0219, 0.0263, 0.0307, 0.0351,
0.0395, 0.0439, 0.0482, 0.0526, 0.0570, 0.0614, 0.0658, 0.0702, 0.0746,
0.0789, 0.0833]))
[11-27 00:05:14] (e/user/VAR/models/var.py, line 425)=> [init_weights] VAR with init_std=0.0161374
[11-27 00:05:22] (/home/user/VAR/train.py , line 128)=> [INIT] VAR model = OptimizedModule(
(_orig_mod): VAR(
drop_path_rate=0.0833333
(word_embed): Linear(in_features=28, out_features=1280, bias=False)
(class_emb): Embedding(1001, 1280)
(lvl_embed): Embedding(10, 1280)
(shared_ada_lin): Identity()
(query_block): AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): CrossAttention(
(mat_q): Linear(in_features=1280, out_features=1280, bias=False)
(mat_kv): Linear(in_features=32, out_features=2560, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(blocks): ModuleList(
(0): AdaLNSelfAttn(
shared_aln=False
(drop_path): Identity()
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
(1-19): 19 x AdaLNSelfAttn(
shared_aln=False
(drop_path): DropPath((drop_prob=...))
(attn): SelfAttention(
(mat_qkv): Linear(in_features=1280, out_features=3840, bias=False)
(proj): Linear(in_features=1280, out_features=1280, bias=True)
(proj_drop): Identity()
)
(ffn): FFN(
fused_mlp_func=False
(fc1): Linear(in_features=1280, out_features=5120, bias=True)
(act): GELU(approximate='tanh')
(fc2): Linear(in_features=5120, out_features=1280, bias=True)
(drop): Identity()
)
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=7680, bias=True)
)
)
)
(head_nm): AdaLNBeforeHead(
(ln_wo_grad): LayerNorm((1280,), eps=1e-06, elementwise_affine=False)
(ada_lin): Sequential(
(0): SiLU()
(1): Linear(in_features=1280, out_features=2560, bias=True)
)
)
(head): Linear(in_features=1280, out_features=32768, bias=True)
)
)
[11-27 00:05:22] (/home/user/VAR/train.py , line 130)=> [INIT][#para] VAE=257.83, VAE.enc=86.05, VAE.dec=85.87, VAE.quant=0.01
[11-27 00:05:22] (/home/user/VAR/train.py , line 131)=> [INIT][#para] VAR=663.45
[11-27 00:05:22] (/VAR/utils/lr_control.py, line 99)=> [get_param_groups] param_groups =
{ 'D': { 'lr_sc': 1.0,
'params': "('_orig_mod.word_embed.weight, _orig_mod.class_emb.weight, _orig_mod.query_block.attn.mat_q.weight, _orig_mod.query_block.attn.mat_kv.weight, _orig_mod.query_block.attn.proj.weight, '\n"
" '_orig_mod.query_block.ffn.fc1.weight, _orig_mod.query_block.ffn.fc2.weight, _orig_mod.query_block.ada_lin.1.weight, _orig_mod.blocks.0.attn.mat_qkv.weight, _orig_mod.blocks.0.attn.proj.weight, '\n"
" '_orig_mod.blocks.0.ffn.fc1.weight, _orig_mod.blocks.0.ffn.fc2.weight, _orig_mod.blocks.0.ada_lin.1.weight, _orig_mod.blocks.1.attn.mat_qkv.weight, _orig_mod.blocks.1.attn.proj.weight, '\n"
" '_orig_mod.blocks.1.ffn.fc1.weight, _orig_mod.blocks.1.ffn.fc2.weight, _orig_mod.blocks.1.ada_lin.1.weight, _orig_mod.blocks.2.attn.mat_qkv.weight, _orig_mod.blocks.2.attn.proj.weight, '\n"
" '_orig_mod.blocks.2.ffn.fc1.weight, _orig_mod.blocks.2.ffn.fc2.weight, _orig_mod.blocks.2.ada_lin.1.weight, _orig_mod.blocks.3.attn.mat_qkv.weight, _orig_mod.blocks.3.attn.proj.weight, '\n"
" '_orig_mod.blocks.3.ffn.fc1.weight, _orig_mod.blocks.3.ffn.fc2.weight, _orig_mod.blocks.3.ada_lin.1.weight, _orig_mod.blocks.4.attn.mat_qkv.weight, _orig_mod.blocks.4.attn.proj.weight, '\n"
" '_orig_mod.blocks.4.ffn.fc1.weight, _orig_mod.blocks.4.ffn.fc2.weight, _orig_mod.blocks.4.ada_lin.1.weight, _orig_mod.blocks.5.attn.mat_qkv.weight, _orig_mod.blocks.5.attn.proj.weight, '\n"
" '_orig_mod.blocks.5.ffn.fc1.weight, _orig_mod.blocks.5.ffn.fc2.weight, _orig_mod.blocks.5.ada_lin.1.weight, _orig_mod.blocks.6.attn.mat_qkv.weight, _orig_mod.blocks.6.attn.proj.weight, '\n"
" '_orig_mod.blocks.6.ffn.fc1.weight, _orig_mod.blocks.6.ffn.fc2.weight, _orig_mod.blocks.6.ada_lin.1.weight, _orig_mod.blocks.7.attn.mat_qkv.weight, _orig_mod.blocks.7.attn.proj.weight, '\n"
" '_orig_mod.blocks.7.ffn.fc1.weight, _orig_mod.blocks.7.ffn.fc2.weight, _orig_mod.blocks.7.ada_lin.1.weight, _orig_mod.blocks.8.attn.mat_qkv.weight, _orig_mod.blocks.8.attn.proj.weight, '\n"
" '_orig_mod.blocks.8.ffn.fc1.weight, _orig_mod.blocks.8.ffn.fc2.weight, _orig_mod.blocks.8.ada_lin.1.weight, _orig_mod.blocks.9.attn.mat_qkv.weight, _orig_mod.blocks.9.attn.proj.weight, '\n"
" '_orig_mod.blocks.9.ffn.fc1.weight, _orig_mod.blocks.9.ffn.fc2.weight, _orig_mod.blocks.9.ada_lin.1.weight, _orig_mod.blocks.10.attn.mat_qkv.weight, _orig_mod.blocks.10.attn.proj.weight, '\n"
" '_orig_mod.blocks.10.ffn.fc1.weight, _orig_mod.blocks.10.ffn.fc2.weight, _orig_mod.blocks.10.ada_lin.1.weight, _orig_mod.blocks.11.attn.mat_qkv.weight, _orig_mod.blocks.11.attn.proj.weight, '\n"
" '_orig_mod.blocks.11.ffn.fc1.weight, _orig_mod.blocks.11.ffn.fc2.weight, _orig_mod.blocks.11.ada_lin.1.weight, _orig_mod.blocks.12.attn.mat_qkv.weight, _orig_mod.blocks.12.attn.proj.weight, '\n"
" '_orig_mod.blocks.12.ffn.fc1.weight, _orig_mod.blocks.12.ffn.fc2.weight, _orig_mod.blocks.12.ada_lin.1.weight, _orig_mod.blocks.13.attn.mat_qkv.weight, _orig_mod.blocks.13.attn.proj.weight, '\n"
" '_orig_mod.blocks.13.ffn.fc1.weight, _orig_mod.blocks.13.ffn.fc2.weight, _orig_mod.blocks.13.ada_lin.1.weight, _orig_mod.blocks.14.attn.mat_qkv.weight, _orig_mod.blocks.14.attn.proj.weight, '\n"
" '_orig_mod.blocks.14.ffn.fc1.weight, _orig_mod.blocks.14.ffn.fc2.weight, _orig_mod.blocks.14.ada_lin.1.weight, _orig_mod.blocks.15.attn.mat_qkv.weight, _orig_mod.blocks.15.attn.proj.weight, '\n"
" '_orig_mod.blocks.15.ffn.fc1.weight, _orig_mod.blocks.15.ffn.fc2.weight, _orig_mod.blocks.15.ada_lin.1.weight, _orig_mod.blocks.16.attn.mat_qkv.weight, _orig_mod.blocks.16.attn.proj.weight, '\n"
" '_orig_mod.blocks.16.ffn.fc1.weight, _orig_mod.blocks.16.ffn.fc2.weight, _orig_mod.blocks.16.ada_lin.1.weight, _orig_mod.blocks.17.attn.mat_qkv.weight, _orig_mod.blocks.17.attn.proj.weight, '\n"
" '_orig_mod.blocks.17.ffn.fc1.weight, _orig_mod.blocks.17.ffn.fc2.weight, _orig_mod.blocks.17.ada_lin.1.weight, _orig_mod.blocks.18.attn.mat_qkv.weight, _orig_mod.blocks.18.attn.proj.weight, '\n"
" '_orig_mod.blocks.18.ffn.fc1.weight, _orig_mod.blocks.18.ffn.fc2.weight, _orig_mod.blocks.18.ada_lin.1.weight, _orig_mod.blocks.19.attn.mat_qkv.weight, _orig_mod.blocks.19.attn.proj.weight, '\n"
" '_orig_mod.blocks.19.ffn.fc1.weight, _orig_mod.blocks.19.ffn.fc2.weight, _orig_mod.blocks.19.ada_lin.1.weight, _orig_mod.head_nm.ada_lin.1.weight, _orig_mod.head.weight')",
'wd_sc': 1.0},
'ND': { 'lr_sc': 1.0,
'params': "('_orig_mod.pos_start, _orig_mod.pos_1LC, _orig_mod.lvl_embed.weight, _orig_mod.query_block.attn.scale_mul_1H11, _orig_mod.query_block.attn.q_bias, _orig_mod.query_block.attn.v_bias, '\n"
" '_orig_mod.query_block.attn.proj.bias, _orig_mod.query_block.ffn.fc1.bias, _orig_mod.query_block.ffn.fc2.bias, _orig_mod.query_block.ada_lin.1.bias, _orig_mod.blocks.0.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.0.attn.q_bias, _orig_mod.blocks.0.attn.v_bias, _orig_mod.blocks.0.attn.proj.bias, _orig_mod.blocks.0.ffn.fc1.bias, _orig_mod.blocks.0.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.0.ada_lin.1.bias, _orig_mod.blocks.1.attn.scale_mul_1H11, _orig_mod.blocks.1.attn.q_bias, _orig_mod.blocks.1.attn.v_bias, _orig_mod.blocks.1.attn.proj.bias, '\n"
" '_orig_mod.blocks.1.ffn.fc1.bias, _orig_mod.blocks.1.ffn.fc2.bias, _orig_mod.blocks.1.ada_lin.1.bias, _orig_mod.blocks.2.attn.scale_mul_1H11, _orig_mod.blocks.2.attn.q_bias, '\n"
" '_orig_mod.blocks.2.attn.v_bias, _orig_mod.blocks.2.attn.proj.bias, _orig_mod.blocks.2.ffn.fc1.bias, _orig_mod.blocks.2.ffn.fc2.bias, _orig_mod.blocks.2.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.3.attn.scale_mul_1H11, _orig_mod.blocks.3.attn.q_bias, _orig_mod.blocks.3.attn.v_bias, _orig_mod.blocks.3.attn.proj.bias, _orig_mod.blocks.3.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.3.ffn.fc2.bias, _orig_mod.blocks.3.ada_lin.1.bias, _orig_mod.blocks.4.attn.scale_mul_1H11, _orig_mod.blocks.4.attn.q_bias, _orig_mod.blocks.4.attn.v_bias, '\n"
" '_orig_mod.blocks.4.attn.proj.bias, _orig_mod.blocks.4.ffn.fc1.bias, _orig_mod.blocks.4.ffn.fc2.bias, _orig_mod.blocks.4.ada_lin.1.bias, _orig_mod.blocks.5.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.5.attn.q_bias, _orig_mod.blocks.5.attn.v_bias, _orig_mod.blocks.5.attn.proj.bias, _orig_mod.blocks.5.ffn.fc1.bias, _orig_mod.blocks.5.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.5.ada_lin.1.bias, _orig_mod.blocks.6.attn.scale_mul_1H11, _orig_mod.blocks.6.attn.q_bias, _orig_mod.blocks.6.attn.v_bias, _orig_mod.blocks.6.attn.proj.bias, '\n"
" '_orig_mod.blocks.6.ffn.fc1.bias, _orig_mod.blocks.6.ffn.fc2.bias, _orig_mod.blocks.6.ada_lin.1.bias, _orig_mod.blocks.7.attn.scale_mul_1H11, _orig_mod.blocks.7.attn.q_bias, '\n"
" '_orig_mod.blocks.7.attn.v_bias, _orig_mod.blocks.7.attn.proj.bias, _orig_mod.blocks.7.ffn.fc1.bias, _orig_mod.blocks.7.ffn.fc2.bias, _orig_mod.blocks.7.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.8.attn.scale_mul_1H11, _orig_mod.blocks.8.attn.q_bias, _orig_mod.blocks.8.attn.v_bias, _orig_mod.blocks.8.attn.proj.bias, _orig_mod.blocks.8.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.8.ffn.fc2.bias, _orig_mod.blocks.8.ada_lin.1.bias, _orig_mod.blocks.9.attn.scale_mul_1H11, _orig_mod.blocks.9.attn.q_bias, _orig_mod.blocks.9.attn.v_bias, '\n"
" '_orig_mod.blocks.9.attn.proj.bias, _orig_mod.blocks.9.ffn.fc1.bias, _orig_mod.blocks.9.ffn.fc2.bias, _orig_mod.blocks.9.ada_lin.1.bias, _orig_mod.blocks.10.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.10.attn.q_bias, _orig_mod.blocks.10.attn.v_bias, _orig_mod.blocks.10.attn.proj.bias, _orig_mod.blocks.10.ffn.fc1.bias, _orig_mod.blocks.10.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.10.ada_lin.1.bias, _orig_mod.blocks.11.attn.scale_mul_1H11, _orig_mod.blocks.11.attn.q_bias, _orig_mod.blocks.11.attn.v_bias, _orig_mod.blocks.11.attn.proj.bias, '\n"
" '_orig_mod.blocks.11.ffn.fc1.bias, _orig_mod.blocks.11.ffn.fc2.bias, _orig_mod.blocks.11.ada_lin.1.bias, _orig_mod.blocks.12.attn.scale_mul_1H11, _orig_mod.blocks.12.attn.q_bias, '\n"
" '_orig_mod.blocks.12.attn.v_bias, _orig_mod.blocks.12.attn.proj.bias, _orig_mod.blocks.12.ffn.fc1.bias, _orig_mod.blocks.12.ffn.fc2.bias, _orig_mod.blocks.12.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.13.attn.scale_mul_1H11, _orig_mod.blocks.13.attn.q_bias, _orig_mod.blocks.13.attn.v_bias, _orig_mod.blocks.13.attn.proj.bias, _orig_mod.blocks.13.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.13.ffn.fc2.bias, _orig_mod.blocks.13.ada_lin.1.bias, _orig_mod.blocks.14.attn.scale_mul_1H11, _orig_mod.blocks.14.attn.q_bias, _orig_mod.blocks.14.attn.v_bias, '\n"
" '_orig_mod.blocks.14.attn.proj.bias, _orig_mod.blocks.14.ffn.fc1.bias, _orig_mod.blocks.14.ffn.fc2.bias, _orig_mod.blocks.14.ada_lin.1.bias, _orig_mod.blocks.15.attn.scale_mul_1H11, '\n"
" '_orig_mod.blocks.15.attn.q_bias, _orig_mod.blocks.15.attn.v_bias, _orig_mod.blocks.15.attn.proj.bias, _orig_mod.blocks.15.ffn.fc1.bias, _orig_mod.blocks.15.ffn.fc2.bias, '\n"
" '_orig_mod.blocks.15.ada_lin.1.bias, _orig_mod.blocks.16.attn.scale_mul_1H11, _orig_mod.blocks.16.attn.q_bias, _orig_mod.blocks.16.attn.v_bias, _orig_mod.blocks.16.attn.proj.bias, '\n"
" '_orig_mod.blocks.16.ffn.fc1.bias, _orig_mod.blocks.16.ffn.fc2.bias, _orig_mod.blocks.16.ada_lin.1.bias, _orig_mod.blocks.17.attn.scale_mul_1H11, _orig_mod.blocks.17.attn.q_bias, '\n"
" '_orig_mod.blocks.17.attn.v_bias, _orig_mod.blocks.17.attn.proj.bias, _orig_mod.blocks.17.ffn.fc1.bias, _orig_mod.blocks.17.ffn.fc2.bias, _orig_mod.blocks.17.ada_lin.1.bias, '\n"
" '_orig_mod.blocks.18.attn.scale_mul_1H11, _orig_mod.blocks.18.attn.q_bias, _orig_mod.blocks.18.attn.v_bias, _orig_mod.blocks.18.attn.proj.bias, _orig_mod.blocks.18.ffn.fc1.bias, '\n"
" '_orig_mod.blocks.18.ffn.fc2.bias, _orig_mod.blocks.18.ada_lin.1.bias, _orig_mod.blocks.19.attn.scale_mul_1H11, _orig_mod.blocks.19.attn.q_bias, _orig_mod.blocks.19.attn.v_bias, '\n"
" '_orig_mod.blocks.19.attn.proj.bias, _orig_mod.blocks.19.ffn.fc1.bias, _orig_mod.blocks.19.ffn.fc2.bias, _orig_mod.blocks.19.ada_lin.1.bias, _orig_mod.head_nm.ada_lin.1.bias, _orig_mod.head.bias')",
'wd_sc': 0.0}}
[11-27 00:05:22] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank0] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:05:22] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank8] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:05:22] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank16] type(model).__name__='OptimizedModule' count=262, numel=663449508
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 104)=> [get_param_groups][rank24] type(model).__name__='OptimizedModule' count=262, numel=663449508
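Note for readers skimming the dump above: every rank reports the same two optimizer groups, 'D' (the weight matrices, wd_sc 1.0) and 'ND' (biases, the scale_mul_1H11 gains, pos_start, pos_1LC and the level embedding, wd_sc 0.0), both with lr_sc 1.0. Below is a minimal sketch of how such a decay/no-decay split is commonly built; it is not the project's get_param_groups in VAR/utils/lr_control.py, and the NO_DECAY_KEYWORDS list is an illustrative assumption rather than the rule actually used.

import torch

# Illustrative keyword list (assumption); the real grouping logic lives in
# VAR/utils/lr_control.py and may use different rules.
NO_DECAY_KEYWORDS = ('bias', 'pos_start', 'pos_1LC', 'lvl_embed', 'scale_mul')

def build_param_groups(model: torch.nn.Module):
    decay, no_decay = [], []
    for name, p in model.named_parameters():
        if not p.requires_grad:
            continue
        # 1-D tensors (biases, per-head gains) and the listed names skip weight decay.
        if p.ndim < 2 or any(k in name for k in NO_DECAY_KEYWORDS):
            no_decay.append(p)
        else:
            decay.append(p)
    return {
        'D':  {'params': decay,    'lr_sc': 1.0, 'wd_sc': 1.0},
        'ND': {'params': no_decay, 'lr_sc': 1.0, 'wd_sc': 0.0},
    }
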
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 105)=>
[11-27 00:05:23] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}
[11-27 00:20:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 17 days, 9:23:49 tlr: 1.2e-06 tnm: 0.05 Lm: 9.704 (9.704) Lt: 9.704 (9.704) Accm: 0.01 (0.01) Acct: 0.00 (0.00) proj_loss: 0.0000 (0.0000) time: 900.3170 data: 0.0005
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 105)=>
[11-27 00:05:23] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}
[11-27 00:20:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 17 days, 9:23:54 tlr: 1.2e-06 tnm: 0.05 Lm: 9.704 (9.704) Lt: 9.704 (9.704) Accm: 0.01 (0.01) Acct: 0.00 (0.00) proj_loss: 0.0000 (0.0000) time: 900.3200 data: 0.0005
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 105)=>
[11-27 00:05:23] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}
[11-27 00:20:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 17 days, 9:25:52 tlr: 1.2e-06 tnm: 0.05 Lm: 9.704 (9.704) Lt: 9.704 (9.704) Accm: 0.01 (0.01) Acct: 0.00 (0.00) proj_loss: 0.0000 (0.0000) time: 900.3908 data: 0.0005
[11-27 00:05:23] (/VAR/utils/lr_control.py, line 105)=>
[11-27 00:05:23] (/home/user/VAR/train.py , line 146)=> [INIT] optim=functools.partial(<class 'torch.optim.adamw.AdamW'>, betas=(0.9, 0.95), fused=True), opt_kw={'lr': 0.00024000000000000003, 'weight_decay': 0}
[11-27 00:20:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 0/1669] eta: 17 days, 8:49:23 tlr: 1.2e-06 tnm: 0.05 Lm: 9.704 (9.704) Lt: 9.704 (9.704) Accm: 0.02 (0.02) Acct: 0.00 (0.00) proj_loss: 0.0000 (0.0000) time: 899.0792 data: 0.0006
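The four ranks above log the same optimizer constructor: a functools.partial over fused AdamW with betas=(0.9, 0.95), applied with opt_kw lr=2.4e-4 and weight_decay=0 (decay is instead carried per group through wd_sc). The ~900 s first step is consistent with torch.compile warm-up, since the model is wrapped as OptimizedModule. A minimal sketch of wiring that partial to the 'D'/'ND' groups follows; the tlr/twd defaults come from the run arguments, while applying lr_sc/wd_sc at construction time (rather than inside the LR/WD scheduler) is an assumption, not something the log confirms.

import functools
import torch

# Constructor as logged: fused AdamW with betas=(0.9, 0.95).
opt_ctor = functools.partial(torch.optim.AdamW, betas=(0.9, 0.95), fused=True)

def make_optimizer(param_groups: dict, tlr: float = 2.4e-4, twd: float = 0.05):
    groups = []
    for g in param_groups.values():
        groups.append({
            'params': g['params'],
            'lr': tlr * g['lr_sc'],
            'weight_decay': twd * g['wd_sc'],  # 'ND' (biases etc.) ends up with 0.0
        })
    # Top-level defaults mirror the logged opt_kw; per-group values take precedence.
    return opt_ctor(groups, lr=tlr, weight_decay=0)
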
[11-27 00:27:56] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 1:07:29 tlr: 9.7e-06 tnm: 0.05 Lm: 9.700 (9.700) Lt: 9.700 (9.700) Accm: 0.01 (0.01) Acct: 0.00 (0.00) proj_loss: 0.0000 (0.0000) time: 0.4525 data: 0.0002
[11-27 00:27:56] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 1:07:33 tlr: 9.7e-06 tnm: 0.05 Lm: 9.700 (9.700) Lt: 9.700 (9.700) Accm: 0.01 (0.01) Acct: 0.01 (0.01) proj_loss: 0.0000 (0.0000) time: 0.4525 data: 0.0002
[11-27 00:27:56] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 1:07:33 tlr: 9.7e-06 tnm: 0.05 Lm: 9.700 (9.700) Lt: 9.700 (9.700) Accm: 0.01 (0.01) Acct: 0.02 (0.02) proj_loss: 0.0000 (0.0000) time: 0.4525 data: 0.0002
[11-27 00:27:56] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 417/1669] eta: 1:07:33 tlr: 9.7e-06 tnm: 0.05 Lm: 9.701 (9.701) Lt: 9.700 (9.700) Accm: 0.01 (0.01) Acct: 0.00 (0.00) proj_loss: 0.0000 (0.0000) time: 0.4525 data: 0.0002
[11-27 00:31:05] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:25:42 tlr: 1.8e-05 tnm: 0.08 Lm: 9.698 (9.686) Lt: 9.696 (9.686) Accm: 0.01 (0.01) Acct: 0.00 (0.01) proj_loss: 0.0000 (0.0000) time: 0.4528 data: 0.0002
[11-27 00:31:05] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:25:40 tlr: 1.8e-05 tnm: 0.08 Lm: 9.696 (9.686) Lt: 9.696 (9.687) Accm: 0.02 (0.02) Acct: 0.00 (0.01) proj_loss: 0.0000 (0.0000) time: 0.4528 data: 0.0002
[11-27 00:31:05] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:25:42 tlr: 1.8e-05 tnm: 0.08 Lm: 9.696 (9.684) Lt: 9.696 (9.686) Accm: 0.01 (0.02) Acct: 0.03 (0.03) proj_loss: 0.0000 (0.0000) time: 0.4528 data: 0.0002
[11-27 00:31:05] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [ 834/1669] eta: 0:25:42 tlr: 1.8e-05 tnm: 0.08 Lm: 9.696 (9.686) Lt: 9.695 (9.687) Accm: 0.02 (0.02) Acct: 0.02 (0.02) proj_loss: 0.0000 (0.0000) time: 0.4528 data: 0.0002
[11-27 00:34:14] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:09:38 tlr: 2.7e-05 tnm: 0.22 Lm: 9.677 (9.661) Lt: 9.678 (9.668) Accm: 0.02 (0.02) Acct: 0.03 (0.02) proj_loss: 0.0000 (0.0000) time: 0.4537 data: 0.0002
[11-27 00:34:14] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:09:38 tlr: 2.7e-05 tnm: 0.22 Lm: 9.676 (9.661) Lt: 9.678 (9.668) Accm: 0.02 (0.02) Acct: 0.01 (0.01) proj_loss: 0.0000 (0.0000) time: 0.4537 data: 0.0002
[11-27 00:34:14] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:09:37 tlr: 2.7e-05 tnm: 0.22 Lm: 9.677 (9.661) Lt: 9.678 (9.668) Accm: 0.01 (0.02) Acct: 0.00 (0.01) proj_loss: 0.0000 (0.0000) time: 0.4537 data: 0.0002
[11-27 00:34:14] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1251/1669] eta: 0:09:38 tlr: 2.7e-05 tnm: 0.22 Lm: 9.674 (9.658) Lt: 9.677 (9.666) Accm: 0.01 (0.02) Acct: 0.02 (0.02) proj_loss: 0.0000 (0.0000) time: 0.4537 data: 0.0002
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1668/1669] eta: 0:00:01 tlr: 3.5e-05 tnm: 0.51 Lm: 9.651 (9.618) Lt: 9.658 (9.633) Accm: 0.01 (0.02) Acct: 0.03 (0.02) proj_loss: 0.0000 (0.0000) time: 0.4558 data: 0.0014
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [ 0/350] Total time: 0:32:00 (1.151 s / it)
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1668/1669] eta: 0:00:01 tlr: 3.5e-05 tnm: 0.51 Lm: 9.658 (9.620) Lt: 9.661 (9.635) Accm: 0.02 (0.03) Acct: 0.03 (0.04) proj_loss: 0.0000 (0.0000) time: 0.4558 data: 0.0015
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1668/1669] eta: 0:00:01 tlr: 3.5e-05 tnm: 0.51 Lm: 9.657 (9.618) Lt: 9.659 (9.633) Accm: 0.02 (0.02) Acct: 0.00 (0.01) proj_loss: 0.0000 (0.0000) time: 0.4558 data: 0.0016
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 0/350] [1668/1669] eta: 0:00:01 tlr: 3.5e-05 tnm: 0.51 Lm: 9.655 (9.621) Lt: 9.659 (9.634) Accm: 0.03 (0.03) Acct: 0.00 (0.01) proj_loss: 0.0000 (0.0000) time: 0.4558 data: 0.0018
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [ 0/350] Total time: 0:32:00 (1.151 s / it)
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [ 0/350] Total time: 0:31:59 (1.150 s / it)
[11-27 00:37:23] (e/user/VAR/utils/misc.py, line 336)=> [Ep]: [ 0/350] Total time: 0:32:00 (1.151 s / it)
[11-27 00:37:23] (/home/user/VAR/train.py , line 279)=> [ep0] (training ) Lm: 9.617 (9.617), Lt: 9.633 (9.633), Acc m&t: 0.02 0.02, Remain: 3 days, 2:04:01, Finish: 2024-11-29 10:41
[11-27 00:37:23] (/home/user/VAR/train.py , line 279)=> [ep0] (training ) Lm: 9.617 (9.617), Lt: 9.633 (9.633), Acc m&t: 0.02 0.02, Remain: 3 days, 2:03:10, Finish: 2024-11-29 10:40
[11-27 00:37:23] (/home/user/VAR/train.py , line 279)=> [ep0] (training ) Lm: 9.617 (9.617), Lt: 9.633 (9.633), Acc m&t: 0.02 0.02, Remain: 3 days, 2:02:53, Finish: 2024-11-29 10:40
[11-27 00:37:23] (/home/user/VAR/train.py , line 279)=> [ep0] (training ) Lm: 9.617 (9.617), Lt: 9.633 (9.633), Acc m&t: 0.02 0.02, Remain: 3 days, 2:03:37, Finish: 2024-11-29 10:41
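A note on reading the progress lines: each metric is printed as 'current (running average)', e.g. Lm: 9.651 (9.618); tlr is the current learning rate, tnm is likely the gradient norm, and eta/Remain are essentially the remaining step count times the smoothed per-iteration time. The sketch below only reproduces that formatting; it is illustrative and not the meter in VAR/utils/misc.py, which may smooth over a window rather than the whole run.

import datetime

class AvgMeter:
    """Tracks a value and its running mean, printed as 'current (average)'."""
    def __init__(self):
        self.val, self.total, self.count = 0.0, 0.0, 0

    def update(self, v: float):
        self.val = v
        self.total += v
        self.count += 1

    def __str__(self):
        avg = self.total / max(self.count, 1)
        return f'{self.val:.3f} ({avg:.3f})'

def eta_str(iters_left: int, sec_per_iter: float) -> str:
    # e.g. eta_str(418, 0.4537) -> '0:03:09', cf. the epoch-1 [1251/1669] lines below.
    return str(datetime.timedelta(seconds=int(iters_left * sec_per_iter)))
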
[11-27 00:37:24] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 0/1669] eta: 0:12:39 tlr: 3.5e-05 tnm: 0.49 Lm: 9.445 (9.445) Lt: 9.486 (9.486) Accm: 0.04 (0.04) Acct: 0.03 (0.03) proj_loss: 0.0000 (0.0000) time: 0.4551 data: 0.0003
[11-27 00:37:24] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 0/1669] eta: 0:12:39 tlr: 3.5e-05 tnm: 0.49 Lm: 9.446 (9.446) Lt: 9.497 (9.497) Accm: 0.04 (0.04) Acct: 0.03 (0.03) proj_loss: 0.0000 (0.0000) time: 0.4553 data: 0.0004
[11-27 00:37:24] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 0/1669] eta: 0:12:14 tlr: 3.5e-05 tnm: 0.49 Lm: 9.451 (9.451) Lt: 9.502 (9.502) Accm: 0.04 (0.04) Acct: 0.00 (0.00) proj_loss: 0.0000 (0.0000) time: 0.4402 data: 0.0003
[11-27 00:37:24] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 0/1669] eta: 0:12:39 tlr: 3.5e-05 tnm: 0.49 Lm: 9.441 (9.441) Lt: 9.487 (9.487) Accm: 0.04 (0.04) Acct: 0.07 (0.07) proj_loss: 0.0000 (0.0000) time: 0.4552 data: 0.0003
[11-27 00:40:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 417/1669] eta: 0:09:28 tlr: 4.4e-05 tnm: 0.92 Lm: 9.230 (9.230) Lt: 9.236 (9.236) Accm: 0.13 (0.13) Acct: 0.14 (0.14) proj_loss: 0.0000 (0.0000) time: 0.4531 data: 0.0002
[11-27 00:40:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 417/1669] eta: 0:09:28 tlr: 4.4e-05 tnm: 0.92 Lm: 9.215 (9.215) Lt: 9.233 (9.233) Accm: 0.12 (0.12) Acct: 0.10 (0.10) proj_loss: 0.0000 (0.0000) time: 0.4531 data: 0.0003
[11-27 00:40:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 417/1669] eta: 0:09:28 tlr: 4.4e-05 tnm: 0.92 Lm: 9.218 (9.218) Lt: 9.216 (9.216) Accm: 0.13 (0.13) Acct: 0.13 (0.13) proj_loss: 0.0000 (0.0000) time: 0.4531 data: 0.0002
[11-27 00:40:33] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 417/1669] eta: 0:09:28 tlr: 4.4e-05 tnm: 0.92 Lm: 9.228 (9.228) Lt: 9.246 (9.246) Accm: 0.13 (0.13) Acct: 0.09 (0.09) proj_loss: 0.0000 (0.0000) time: 0.4531 data: 0.0002
[11-27 00:43:42] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 834/1669] eta: 0:06:18 tlr: 5.2e-05 tnm: 1.56 Lm: 9.005 (8.972) Lt: 8.991 (8.917) Accm: 0.21 (0.24) Acct: 0.19 (0.22) proj_loss: 0.0000 (0.0000) time: 0.4545 data: 0.0002
[11-27 00:43:42] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 834/1669] eta: 0:06:18 tlr: 5.2e-05 tnm: 1.56 Lm: 8.991 (8.972) Lt: 8.947 (8.909) Accm: 0.23 (0.26) Acct: 0.22 (0.36) proj_loss: 0.0000 (0.0000) time: 0.4545 data: 0.0002
[11-27 00:43:42] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 834/1669] eta: 0:06:18 tlr: 5.2e-05 tnm: 1.56 Lm: 9.020 (8.981) Lt: 8.985 (8.926) Accm: 0.23 (0.22) Acct: 0.21 (0.22) proj_loss: 0.0000 (0.0000) time: 0.4545 data: 0.0003
[11-27 00:43:42] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [ 834/1669] eta: 0:06:18 tlr: 5.2e-05 tnm: 1.56 Lm: 8.985 (8.970) Lt: 8.968 (8.918) Accm: 0.21 (0.23) Acct: 0.17 (0.26) proj_loss: 0.0000 (0.0000) time: 0.4545 data: 0.0002
[11-27 00:46:52] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [1251/1669] eta: 0:03:09 tlr: 6.1e-05 tnm: 1.51 Lm: 8.735 (8.805) Lt: 8.621 (8.690) Accm: 0.38 (0.34) Acct: 0.49 (0.46) proj_loss: 0.0000 (0.0000) time: 0.4532 data: 0.0002
[11-27 00:46:52] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [1251/1669] eta: 0:03:09 tlr: 6.1e-05 tnm: 1.51 Lm: 8.732 (8.798) Lt: 8.625 (8.690) Accm: 0.34 (0.32) Acct: 0.34 (0.34) proj_loss: 0.0000 (0.0000) time: 0.4532 data: 0.0002
[11-27 00:46:52] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [1251/1669] eta: 0:03:09 tlr: 6.1e-05 tnm: 1.51 Lm: 8.751 (8.817) Lt: 8.645 (8.717) Accm: 0.32 (0.30) Acct: 0.30 (0.36) proj_loss: 0.0000 (0.0000) time: 0.4532 data: 0.0002
[11-27 00:46:52] (e/user/VAR/utils/misc.py, line 314)=> [Ep]: [ 1/350] [1251/1669] eta: 0:03:09 tlr: 6.1e-05 tnm: 1.51 Lm: 8.732 (8.799) Lt: 8.628 (8.695) Accm: 0.33 (0.28) Acct: 0.34 (0.32) proj_loss: 0.0000 (0.0000) time: 0.4532 data: 0.0002