---
# NOTE(review): this document was recovered from a whitespace-collapsed copy.
# Nesting below is reconstructed from key grouping and key-uniqueness; confirm
# against the consuming code's schema before relying on it.

debug: false
# Written with an explicit mantissa dot so YAML 1.1 loaders (e.g. PyYAML)
# resolve it as a float rather than a string.
lr: 5.0e-5

# Backbone settings. The four-element lists below have matching lengths;
# presumably one entry per resolution level -- TODO confirm.
backbone:
  name: u_vit3d_pose
  channels:
    - 128
    - 256
    - 576
    - 1152
  emb_channels: 1024
  patch_size: 2
  block_types:
    - ResBlock
    - ResBlock
    - TransformerBlock
    - TransformerBlock
  block_dropouts:
    - 0.0
    - 0.0
    - 0.1
    - 0.1
  num_updown_blocks:
    - 3
    - 3
    - 6
  num_mid_blocks: 20
  num_heads: 9
  pos_emb_type: rope
  use_checkpointing:
    - false
    - false
    - false
    - true
  conditioning:
    dim: null
  external_cond_dropout: 0.1
  use_fourier_noise_embedding: true

# Data / sequence settings.
x_shape: [3, 256, 256]
max_frames: 8
n_frames: 8
frame_skip: 1
context_frames: 1
latent:
  enable: false
  type: pre_sample
  suffix: null
  downsampling_factor: [1, 8]
  num_channels: 4
data_mean: [[[0.577]], [[0.517]], [[0.461]]]
data_std: [[[0.249]], [[0.249]], [[0.268]]]
external_cond_dim: 16
external_cond_stack: false
external_cond_processing: null

# Optimisation settings.
compile: false
weight_decay: 0.01
optimizer_beta:
  - 0.9
  - 0.99
lr_scheduler:
  name: constant_with_warmup
  num_warmup_steps: 10000
  num_training_steps: 550000

# Noise-level and context settings.
noise_level: random_independent
uniform_future:
  enabled: false
fixed_context:
  enabled: false
  indices: null
  dropout: 0
variable_context:
  enabled: false
  prob: 0
  dropout: 0

# Sampling-scheme settings.
chunk_size: -1
scheduling_matrix: full_sequence
replacement: noisy_scale

diffusion:
  is_continuous: true
  timesteps: 1000
  beta_schedule: cosine_simple_diffusion
  schedule_fn_kwargs:
    shift: 1.0
    shifted: 0.125
    interpolated: false
  use_causal_mask: false
  clip_noise: 20.0
  objective: pred_v
  loss_weighting:
    strategy: sigmoid
    snr_clip: 5.0
    cum_snr_decay: 0.9
    sigmoid_bias: -1.0
  sampling_timesteps: 50
  ddim_sampling_eta: 0.0
  reconstruction_guidance: 0.0
  training_schedule:
    name: cosine
    shift: 0.125
  # NOTE(review): placed under `diffusion` rather than `training_schedule`;
  # the collapsed original does not disambiguate -- confirm.
  precond_scale: 0.125

vae:
  pretrained_path: null
  pretrained_kwargs: {}
  use_fp16: true
  batch_size: 2

checkpoint:
  reset_optimizer: false
  strict: true

tasks:
  prediction:
    enabled: true
    history_guidance:
      name: stabilized_vanilla
      guidance_scale: 4.0
      stabilization_level: 0.02
      visualize: false
    keyframe_density: null
    sliding_context_len: null
  interpolation:
    enabled: false
    history_guidance:
      name: vanilla
      guidance_scale: 1
      visualize: false
    max_batch_size: 4

logging:
  deterministic: null
  loss_freq: 100
  grad_norm_freq: 100
  max_num_videos: 256
  n_metrics_frames: null
  metrics: []
  metrics_batch_size: 16
  sanity_generation: false
  raw_dir: null

camera_pose_conditioning:
  normalize_by: first
  bound: null
  type: ray_encoding