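# Non-YAML header text carried over with this file (kept here as comments):
#   avatar caption: "kiwhansong's picture"
#   commit: "fix readme & batch size" (0afe03d)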
# NOTE(review): this file arrived with all indentation stripped, which left
# section keys empty and produced duplicate top-level keys (name, enabled,
# dropout, shift, history_guidance, visualize). The nesting below is
# reconstructed from key order and grouping; confirm it against the consumer's
# schema before relying on it.

debug: false
# Written with a decimal point so YAML 1.1 loaders (e.g. PyYAML) resolve it
# as a float instead of a string.
lr: 5.0e-5

backbone:
  name: u_vit3d_pose
  channels:
    - 128
    - 256
    - 576
    - 1152
  emb_channels: 1024
  patch_size: 2
  block_types:
    - ResBlock
    - ResBlock
    - TransformerBlock
    - TransformerBlock
  block_dropouts:
    - 0.0
    - 0.0
    - 0.1
    - 0.1
  num_updown_blocks:
    - 3
    - 3
    - 6
  num_mid_blocks: 20
  num_heads: 9
  pos_emb_type: rope
  use_checkpointing:
    - false
    - false
    - false
    - true
  conditioning:
    dim: null
  # NOTE(review): the next two keys are assumed to belong to backbone (they
  # follow its keys and precede x_shape); confirm.
  external_cond_dropout: 0.1
  use_fourier_noise_embedding: true

x_shape: [3, 256, 256]
max_frames: 8
n_frames: 8
frame_skip: 1
context_frames: 1

latent:
  enable: false
  type: pre_sample
  suffix: null
  downsampling_factor: [1, 8]
  num_channels: 4

data_mean: [[[0.577]], [[0.517]], [[0.461]]]
data_std: [[[0.249]], [[0.249]], [[0.268]]]

external_cond_dim: 16
external_cond_stack: false
external_cond_processing: null

compile: false
weight_decay: 0.01
optimizer_beta:
  - 0.9
  - 0.99
lr_scheduler:
  name: constant_with_warmup
  num_warmup_steps: 10000
  num_training_steps: 550000

noise_level: random_independent
uniform_future:
  enabled: false
fixed_context:
  enabled: false
  indices: null
  dropout: 0
variable_context:
  enabled: false
  prob: 0
  dropout: 0
chunk_size: -1
scheduling_matrix: full_sequence
replacement: noisy_scale

diffusion:
  is_continuous: true
  timesteps: 1000
  beta_schedule: cosine_simple_diffusion
  schedule_fn_kwargs:
    shift: 1.0
    shifted: 0.125
    interpolated: false
  use_causal_mask: false
  clip_noise: 20.0
  objective: pred_v
  loss_weighting:
    strategy: sigmoid
    snr_clip: 5.0
    cum_snr_decay: 0.9
    sigmoid_bias: -1.0
  sampling_timesteps: 50
  ddim_sampling_eta: 0.0
  reconstruction_guidance: 0.0
  training_schedule:
    name: cosine
    shift: 0.125
  # NOTE(review): assumed to be a diffusion-level key rather than part of
  # training_schedule; confirm.
  precond_scale: 0.125

vae:
  pretrained_path: null
  pretrained_kwargs: {}
  use_fp16: true
  # NOTE(review): assumed to be the VAE's own batch size (no other section is
  # open at this point in the original key order); confirm.
  batch_size: 2

checkpoint:
  reset_optimizer: false
  strict: true

tasks:
  prediction:
    enabled: true
    history_guidance:
      name: stabilized_vanilla
      guidance_scale: 4.0
      stabilization_level: 0.02
      # NOTE(review): visualize is assumed to sit inside history_guidance in
      # both tasks; confirm.
      visualize: false
    keyframe_density: null
    sliding_context_len: null
  interpolation:
    enabled: false
    history_guidance:
      name: vanilla
      guidance_scale: 1
      visualize: false
    # NOTE(review): assumed to belong to the interpolation task; confirm.
    max_batch_size: 4

logging:
  deterministic: null
  loss_freq: 100
  grad_norm_freq: 100
  max_num_videos: 256
  n_metrics_frames: null
  metrics: []
  metrics_batch_size: 16
  sanity_generation: false
  raw_dir: null

camera_pose_conditioning:
  normalize_by: first
  bound: null
  type: ray_encoding