{ "architectures": [ "FlashAttentionForCausalLM" ], "model_type": "flash_attention_lm", "vocab_size": 50257, "hidden_size": 512, "num_hidden_layers": 8, "num_attention_heads": 8, "max_position_embeddings": 512, "hidden_dropout_prob": 0.1, "use_flash_attention": true, "gradient_checkpointing": false, "torch_dtype": "float32", "transformers_version": "4.40.0" }