{
  "adanorm_num_embeddings": 4,
  "architectures": [
    "VocosWithEncodecModel"
  ],
  "bandwidths": [
    1.5,
    3.0,
    6.0,
    12.0
  ],
  "encodec_config": {
    "audio_channels": 1,
    "chunk_length_s": null,
    "codebook_dim": 128,
    "codebook_size": 1024,
    "compress": 2,
    "dilation_growth_rate": 2,
    "hidden_size": 128,
    "kernel_size": 7,
    "last_kernel_size": 7,
    "model_type": "encodec",
    "norm_type": "weight_norm",
    "normalize": false,
    "num_filters": 32,
    "num_lstm_layers": 2,
    "num_residual_layers": 1,
    "overlap": null,
    "pad_mode": "reflect",
    "residual_kernel_size": 3,
    "sampling_rate": 24000,
    "target_bandwidths": [
      1.5,
      3.0,
      6.0,
      12.0,
      24.0
    ],
    "trim_right_ratio": 1.0,
    "upsampling_ratios": [
      8,
      5,
      4,
      2
    ],
    "use_causal_conv": true,
    "use_conv_shortcut": true
  },
  "hidden_dim": 384,
  "hop_length": 320,
  "input_channels": 128,
  "intermediate_dim": 1152,
  "kernel_size": 7,
  "layer_norm_eps": 1e-06,
  "layer_scale_init_value": 0.125,
  "model_type": "vocos_with_encodec",
  "n_fft": 1280,
  "num_layers": 8,
  "padding": 3,
  "spec_padding": "same",
  "torch_dtype": "float32",
  "train_codebooks": false,
  "transformers_version": "4.55.2",
  "use_adaptive_norm": true
}