{ "architectures": [ "DiaModel" ], "bos_token_id": 1026, "decoder_config": { "_attn_implementation_autoset": true, "cross_head_dim": 128, "cross_num_attention_heads": 16, "cross_num_key_value_heads": 8, "dropout": 0, "head_dim": 128, "hidden_act": "silu", "hidden_size": 2048, "intermediate_size": 8192, "model_type": "dia_decoder", "norm_eps": 1e-05, "num_attention_heads": 16, "num_channels": 9, "num_hidden_layers": 18, "num_key_value_heads": 4, "rope_max_timescale": 10000, "rope_min_timescale": 1, "torch_dtype": "float32", "vocab_size": 1028 }, "delay_pattern": [ 0, 8, 9, 10, 11, 12, 13, 14, 15 ], "encoder_config": { "_attn_implementation_autoset": true, "dropout": 0, "head_dim": 128, "hidden_act": "silu", "hidden_size": 1024, "intermediate_size": 4096, "model_type": "dia_encoder", "norm_eps": 1e-05, "num_attention_heads": 16, "num_hidden_layers": 12, "num_key_value_heads": 16, "rope_max_timescale": 10000, "rope_min_timescale": 1, "torch_dtype": "float32", "vocab_size": 256 }, "eos_token_id": 1024, "is_encoder_decoder": true, "model_type": "dia", "norm_eps": 1e-05, "pad_token_id": 1025, "torch_dtype": "float32", "transformers_version": "4.52.0.dev0" }