|
import logging |
|
|
|
from transformers.configuration_utils import PretrainedConfig |
|
from transformers.models.llama.configuration_llama import LlamaConfig |
|
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD |
|
|
|
# Module-level logger, registered under the fixed name "kanana-1.5-v"
# (a hard-coded string, not the module's ``__name__``).
logger = logging.getLogger("kanana-1.5-v")
|
|
|
|
|
class KananaVVisionConfig(PretrainedConfig):
    """Configuration for the Kanana-V visual encoder.

    Each constructor argument is stored on the instance under the same name;
    any extra keyword arguments are forwarded to ``PretrainedConfig``.

    Args:
        depth: Stored as ``self.depth`` (presumably the number of encoder
            blocks -- confirm against the model code).
        embed_dim: Stored as ``self.embed_dim``.
        mlp_ratio: Stored as ``self.mlp_ratio``.
        num_heads: Stored as ``self.num_heads``.
        in_chans: Stored as ``self.in_chans``.
        hidden_size: Stored as ``self.hidden_size``.
        patch_size: Stored as ``self.patch_size``.
        spatial_merge_size: Stored as ``self.spatial_merge_size``.
        spatial_patch_size: Stored as ``self.spatial_patch_size``.
        temporal_patch_size: Stored as ``self.temporal_patch_size``.
        initializer_range: Stored as ``self.initializer_range``.
        image_size: Stored as ``self.image_size``; the default is the string
            ``"dynamic"`` rather than a number.
        image_mean: Stored as ``self.image_mean``; defaults to
            ``OPENAI_CLIP_MEAN``. A list value is copied before being stored.
        image_std: Stored as ``self.image_std``; defaults to
            ``OPENAI_CLIP_STD``. A list value is copied before being stored.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.

    NOTE(review): ``embed_dim``/``hidden_size`` (both 1280) and
    ``patch_size``/``spatial_patch_size`` (both 14) share defaults, but this
    class does not tie them together -- confirm which one the model reads.
    """

    model_type = "kanana-1.5-v-visual-encoder"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        mlp_ratio=4,
        num_heads=16,
        in_chans=3,
        hidden_size=1280,
        patch_size=14,
        spatial_merge_size=2,
        spatial_patch_size=14,
        temporal_patch_size=2,
        initializer_range=0.02,
        image_size="dynamic",
        image_mean=OPENAI_CLIP_MEAN,
        image_std=OPENAI_CLIP_STD,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_chans = in_chans
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.spatial_patch_size = spatial_patch_size
        self.temporal_patch_size = temporal_patch_size
        self.initializer_range = initializer_range
        self.image_size = image_size

        # The defaults are shared module-level constants. Storing them by
        # reference would let an in-place edit on one config (for example
        # ``cfg.image_mean[0] = ...``) leak into every other config and into
        # the constants themselves, so list inputs are copied. Non-list
        # values are stored untouched.
        self.image_mean = list(image_mean) if isinstance(image_mean, list) else image_mean
        self.image_std = list(image_std) if isinstance(image_std, list) else image_std
|
|
|
|
|
class KananaVVisualProjectorConfig(PretrainedConfig):
    """Settings container for the Kanana-V visual projector.

    The constructor records each of its named arguments on the instance
    under the same attribute name and hands any remaining keyword arguments
    to ``PretrainedConfig``.

    Defaults: ``depth=2``, ``encoder_hidden_size=1280``,
    ``feature_layer_index=-1``, ``hidden_size=1024``, ``merge_size=2``,
    ``mlp_depth=2``, ``num_eos_tokens=0``, ``output_hidden_size=2048``,
    ``pos_emb=True``, ``pos_emb_size=576``, ``prenorm=False``,
    ``projector_type="dynamic-c-abs"``.
    """

    model_type = "kanana-1.5-v-visual_projector"
    base_config_key = "projector_config"

    def __init__(
        self,
        depth=2,
        encoder_hidden_size=1280,
        feature_layer_index=-1,
        hidden_size=1024,
        merge_size=2,
        mlp_depth=2,
        num_eos_tokens=0,
        output_hidden_size=2048,
        pos_emb=True,
        pos_emb_size=576,
        prenorm=False,
        projector_type="dynamic-c-abs",
        **kwargs,
    ):
        # Base class consumes everything this signature does not name.
        super().__init__(**kwargs)

        # Size / depth related values.
        self.depth = depth
        self.encoder_hidden_size = encoder_hidden_size
        self.feature_layer_index = feature_layer_index
        self.hidden_size = hidden_size
        self.merge_size = merge_size
        self.mlp_depth = mlp_depth
        self.num_eos_tokens = num_eos_tokens
        self.output_hidden_size = output_hidden_size

        # Positional-embedding, normalisation and projector-kind switches.
        self.pos_emb = pos_emb
        self.pos_emb_size = pos_emb_size
        self.prenorm = prenorm
        self.projector_type = projector_type
|
|
|
|
|
class KananaLanguageConfig(LlamaConfig):
    """``LlamaConfig`` subclass used as the text configuration of Kanana-V.

    It introduces no fields of its own: only the class-level ``model_type``
    and ``base_config_key`` differ, and every keyword argument goes straight
    to ``LlamaConfig``.
    """

    model_type = "kanana-1.5-3b-instruct"
    base_config_key = "text_config"

    def __init__(self, **kwargs):
        # Keyword-only pass-through; nothing extra is stored here.
        super().__init__(**kwargs)
|
|
|
|
|
class KananaVConfig(PretrainedConfig):
    """Top-level Kanana-V configuration made of three sub-configurations.

    Holds a ``KananaVVisionConfig``, a ``KananaVVisualProjectorConfig`` and a
    ``KananaLanguageConfig`` as ``vision_config``, ``projector_config`` and
    ``text_config`` respectively.

    Args:
        vision_config: Keyword arguments for ``KananaVVisionConfig`` as a
            dict, an already-built ``KananaVVisionConfig``, or ``None`` for
            the defaults.
        projector_config: Same, for ``KananaVVisualProjectorConfig``.
        text_config: Same, for ``KananaLanguageConfig``.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "kanana-1.5-v"
    is_composition = True

    def __init__(
        self,
        vision_config: "dict | KananaVVisionConfig | None" = None,
        projector_config: "dict | KananaVVisualProjectorConfig | None" = None,
        text_config: "dict | KananaLanguageConfig | None" = None,
        **kwargs,
    ):
        # ``None`` replaces the former ``{}`` defaults: a mutable default is
        # shared between calls, and an explicit ``None`` previously failed on
        # ``**`` unpacking.
        super().__init__(**kwargs)

        self.vision_config = self._build_sub_config(KananaVVisionConfig, vision_config)
        self.projector_config = self._build_sub_config(
            KananaVVisualProjectorConfig, projector_config
        )
        self.text_config = self._build_sub_config(KananaLanguageConfig, text_config)

    @staticmethod
    def _build_sub_config(config_cls, value):
        """Return ``value`` as a ``config_cls`` instance.

        ``None`` gives a default-constructed config, an existing instance is
        returned as-is, and anything else is unpacked as keyword arguments.
        """
        if value is None:
            return config_cls()
        if isinstance(value, config_cls):
            return value
        return config_cls(**value)

    @property
    def num_visual_tokens(self):
        """Always the string ``"dynamic"``; no fixed count is stored."""
        return "dynamic"

    @property
    def hidden_size(self):
        """Hidden size of the text sub-configuration."""
        return self.text_config.hidden_size
|
|