{ "_name_or_path": "/mnt/bn/tiktok-mm-4/aiic/public/model/OV-Qwen2-7B-AM9", "add_time_token": true, "architectures": [ "LlavaAVQwenForCausalLM" ], "attention_dropout": 0.0, "bos_token_id": 151643, "eos_token_id": 151645, "hidden_act": "silu", "hidden_size": 3584, "image_aspect_ratio": "anyres_max_9", "image_crop_resolution": null, "image_grid_pinpoints": [ [ 384, 384 ], [ 384, 768 ], [ 384, 1152 ], [ 384, 1536 ], [ 768, 768 ], [ 384, 1920 ], [ 768, 768 ], [ 384, 2304 ], [ 768, 1152 ], [ 384, 2688 ], [ 768, 1152 ], [ 384, 3072 ], [ 768, 1536 ], [ 384, 3456 ], [ 768, 1536 ], [ 1152, 1152 ], [ 768, 1920 ], [ 1152, 1152 ], [ 768, 1920 ], [ 1152, 1152 ], [ 768, 2304 ], [ 1152, 1536 ], [ 768, 2304 ], [ 1152, 1536 ], [ 768, 2688 ], [ 1152, 1536 ], [ 768, 2688 ], [ 1152, 1920 ], [ 768, 3072 ], [ 1152, 1920 ], [ 1536, 1536 ], [ 768, 3072 ], [ 1152, 1920 ], [ 1536, 1536 ], [ 768, 3456 ], [ 1152, 2304 ], [ 1536, 1536 ], [ 768, 3456 ], [ 1152, 2304 ], [ 1536, 1536 ], [ 768, 3840 ], [ 1152, 2304 ], [ 1536, 1920 ], [ 768, 3840 ], [ 1152, 2688 ], [ 1536, 1920 ], [ 768, 4224 ], [ 1152, 2688 ], [ 1536, 1920 ], [ 768, 4224 ], [ 1152, 2688 ], [ 1536, 1920 ], [ 768, 4608 ], [ 1152, 3072 ], [ 1536, 2304 ], [ 768, 4608 ], [ 1152, 3072 ], [ 1536, 2304 ], [ 1920, 1920 ], [ 768, 4992 ], [ 1152, 3072 ], [ 1536, 2304 ], [ 1920, 1920 ], [ 768, 4992 ], [ 1152, 3456 ], [ 1536, 2304 ], [ 1920, 1920 ], [ 768, 5376 ], [ 1152, 3456 ], [ 1536, 2688 ], [ 1920, 1920 ], [ 768, 5376 ], [ 1152, 3456 ], [ 1536, 2688 ], [ 1920, 1920 ], [ 768, 5760 ], [ 1152, 3840 ], [ 1536, 2688 ], [ 1920, 2304 ], [ 768, 5760 ], [ 1152, 3840 ], [ 1536, 2688 ], [ 1920, 2304 ], [ 768, 6144 ], [ 1152, 3840 ], [ 1536, 3072 ], [ 1920, 2304 ], [ 768, 6144 ], [ 1152, 4224 ], [ 1536, 3072 ], [ 1920, 2304 ], [ 768, 6528 ], [ 1152, 4224 ], [ 1536, 3072 ], [ 1920, 2304 ], [ 768, 6528 ], [ 1152, 4224 ], [ 1536, 3072 ], [ 1920, 2688 ], [ 768, 6912 ], [ 1152, 4608 ], [ 1536, 3456 ], [ 1920, 2688 ], [ 2304, 2304 ] ], "image_split_resolution": null, "initializer_range": 0.02, "intermediate_size": 18944, "max_position_embeddings": 32768, "max_window_layers": 28, "mf_split_init": false, "mm_hidden_size": 1152, "mm_patch_merge_type": "spatial_unpad", "mm_projector_lr": null, "mm_projector_type": "mlp2x_gelu", "mm_resampler_type": null, "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model", "mm_use_im_patch_token": false, "mm_use_im_start_end": false, "mm_vision_select_feature": "patch", "mm_vision_select_layer": -2, "mm_vision_tower": "google/siglip-so400m-patch14-384", "mm_vision_tower_lr": 2e-06, "model_args": { "add_time_token": true, "audio_visual": true, "beats_path": null, "do_rag": false, "flash_attn": false, "fps": 2, "freeze_backbone": true, "freeze_beats": true, "freeze_final_linear": true, "freeze_speech_QFormer": true, "freeze_whisper": true, "from_stage_1_5": false, "image_processor": "google/siglip-so400m-patch14-384", "lora_alpha": 256, "lora_dropout": 0.05, "lora_enable": true, "lora_r": 64, "mf_split_init": false, "mm_mask_drop_mode": "fixed", "mm_mask_drop_ratio": 0.25, "mm_mask_drop_ratio_lower": null, "mm_mask_drop_ratio_upper": null, "mm_mask_drop_skip_percentage": 0.0, "mm_newline_position": "grid", "mm_patch_merge_type": "spatial_unpad", "mm_perceiver_depth": 3, "mm_perceiver_ff_mult": 4, "mm_perceiver_latents": 32, "mm_perceiver_pretrained": null, "mm_pooling_position": "before", "mm_projector_type": "mlp2x_gelu", "mm_qformer_depth": 3, "mm_qformer_latents": 32, "mm_qformer_pretrained": null, "mm_resampler_type": "spatial_pool", "mm_spatial_pool_mode": "max", "mm_spatial_pool_out_channels": 1152, "mm_spatial_pool_stride": 2, "mm_use_im_patch_token": false, "mm_use_im_start_end": false, "mm_vision_select_feature": "patch", "mm_vision_select_layer": -2, "mm_vlmattention_bert_type": "qformer_pretrain", "mm_vlmattention_compress_type": null, "mm_vlmattention_num_query": 32, "mm_vlmattention_pretrained": null, "modality_max_length": "None", "model_name_or_path": "/mnt/bn/tiktok-mm-4/aiic/users/guangzhisun/llava-video/output/models/sft_finevideo_with_caption_qa_reasoning/checkpoint_93000_qwen", "multi_frame_num": 30, "multi_frame_projector": false, "niv_cnn_params": "[(10, 5), (3, 2), (3, 2), (3, 2), (3, 2), (2, 2), (2, 2)]", "niv_in_channels": 4, "niv_out_channels": 401, "num_speech_query_token": 1, "patchify_video_feature": false, "pretrain_mm_mlp_adapter": null, "rag_input_frames": 1, "rag_topk": 5, "rag_type": "direct", "salmonn_path": null, "second_per_window": 0.2, "second_stride": 0.2, "segmentation": -1, "spt_projector": false, "tune_mm_mlp_adapter": false, "tune_mm_vision_resampler": false, "unfreeze_mm_vision_tower": false, "use_final_linear": true, "use_flash_tower": true, "use_mfcnn": false, "use_mftrans": false, "use_niv": false, "use_speech_Qformer": true, "version": "qwen_1_5", "vision_tower": "google/siglip-so400m-patch14-384", "whisper_lora": false, "whisper_path": "openai/whisper-large-v3", "window_level_Qformer": true }, "model_type": "qwen2", "multi_frame_num": 30, "multi_frame_projector": false, "num_attention_heads": 28, "num_hidden_layers": 28, "num_key_value_heads": 4, "pos_skipping_range": 4096, "rms_norm_eps": 1e-06, "rope_scaling": null, "rope_theta": 1000000.0, "sliding_window": 131072, "tie_word_embeddings": false, "tokenizer_model_max_length": 32768, "tokenizer_padding_side": "right", "torch_dtype": "bfloat16", "transformers_version": "4.39.2", "use_cache": true, "use_flash_tower": true, "use_mfcnn": false, "use_mftrans": false, "use_mm_proj": true, "use_pos_skipping": false, "use_sliding_window": false, "vision_tower_pretrained": null, "vocab_size": 152064 }