# kanana-1.5-v-3b-instruct / configuration.py
# peterroh: Upload folder using huggingface_hub (commit 4ce7387, verified)
import logging
from transformers.configuration_utils import PretrainedConfig
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
# Module-level logger; it is registered under the fixed name "kanana-1.5-v"
# rather than the module's __name__.
logger = logging.getLogger("kanana-1.5-v")
class KananaVVisionConfig(PretrainedConfig):
    """Configuration for the Kanana-V visual encoder.

    Every constructor argument is stored as an attribute of the same name;
    any remaining keyword arguments are forwarded to ``PretrainedConfig``.

    Args:
        depth, embed_dim, mlp_ratio, num_heads, in_chans, hidden_size:
            Encoder size hyper-parameters.
            NOTE(review): their exact meaning is defined by the modeling
            code, which is not visible from this file.
        patch_size, spatial_merge_size, spatial_patch_size, temporal_patch_size:
            Patch/merge geometry settings (semantics likewise defined by the
            modeling code -- confirm there).
        initializer_range: Stored as-is; presumably the weight-init std.
        image_size: Defaults to the string ``"dynamic"``.
        image_mean, image_std: Normalisation statistics; default to the
            OpenAI CLIP constants. List values are shallow-copied.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.
    """

    model_type = "kanana-1.5-v-visual-encoder"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        embed_dim=1280,
        mlp_ratio=4,
        num_heads=16,
        in_chans=3,
        hidden_size=1280,
        patch_size=14,
        spatial_merge_size=2,
        spatial_patch_size=14,
        temporal_patch_size=2,
        initializer_range=0.02,
        image_size="dynamic",
        image_mean=OPENAI_CLIP_MEAN,
        image_std=OPENAI_CLIP_STD,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.depth = depth
        self.embed_dim = embed_dim
        self.mlp_ratio = mlp_ratio
        self.num_heads = num_heads
        self.in_chans = in_chans
        self.hidden_size = hidden_size
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.spatial_patch_size = spatial_patch_size
        self.temporal_patch_size = temporal_patch_size
        self.initializer_range = initializer_range
        self.image_size = image_size
        # The default mean/std are module-level list constants shared by every
        # importer. Store a copy so that editing one config in place cannot
        # change the constant or any other config built from the defaults.
        # Non-list values (e.g. tuples or scalars) are kept untouched.
        self.image_mean = (
            list(image_mean) if isinstance(image_mean, list) else image_mean
        )
        self.image_std = (
            list(image_std) if isinstance(image_std, list) else image_std
        )
class KananaVVisualProjectorConfig(PretrainedConfig):
    """Configuration for the Kanana-V visual projector.

    Each constructor argument is copied verbatim onto the instance under the
    same attribute name. Unrecognised keyword arguments are handed to
    ``PretrainedConfig``.

    NOTE(review): the meaning of the individual fields is defined by the
    projector implementation, which is not part of this file.
    """

    model_type = "kanana-1.5-v-visual_projector"
    base_config_key = "projector_config"

    def __init__(
        self,
        depth: int = 2,
        encoder_hidden_size: int = 1280,
        feature_layer_index: int = -1,
        hidden_size: int = 1024,
        merge_size: int = 2,
        mlp_depth: int = 2,
        num_eos_tokens: int = 0,
        output_hidden_size: int = 2048,
        pos_emb: bool = True,
        pos_emb_size: int = 576,
        prenorm: bool = False,
        projector_type: str = "dynamic-c-abs",
        **kwargs,
    ) -> None:
        # Base-class bookkeeping first, exactly as before.
        super().__init__(**kwargs)

        # Layer counts, widths and input-feature selection.
        self.depth = depth
        self.encoder_hidden_size = encoder_hidden_size
        self.feature_layer_index = feature_layer_index
        self.hidden_size = hidden_size
        self.merge_size = merge_size
        self.mlp_depth = mlp_depth
        self.num_eos_tokens = num_eos_tokens
        self.output_hidden_size = output_hidden_size

        # Positional embedding, normalisation and projector variant.
        self.pos_emb = pos_emb
        self.pos_emb_size = pos_emb_size
        self.prenorm = prenorm
        self.projector_type = projector_type
class KananaLanguageConfig(LlamaConfig):
    """Llama-derived configuration for the Kanana language model.

    Adds no fields of its own. It overrides ``model_type`` and
    ``base_config_key`` and accepts keyword arguments only, all of which are
    forwarded to ``LlamaConfig``.
    """

    model_type = "kanana-1.5-3b-instruct"
    base_config_key = "text_config"

    def __init__(self, **kwargs):
        # Keyword-only pass-through to the Llama configuration.
        super().__init__(**kwargs)
class KananaVConfig(PretrainedConfig):
    """Top-level Kanana-V configuration composed of three sub-configs.

    Args:
        vision_config: Settings for ``KananaVVisionConfig``. May be a dict of
            keyword arguments, an existing ``KananaVVisionConfig`` instance,
            or ``None`` to use that class's defaults.
        projector_config: Same, for ``KananaVVisualProjectorConfig``.
        text_config: Same, for ``KananaLanguageConfig``.
        **kwargs: Passed through to ``PretrainedConfig.__init__``.

    Attributes:
        vision_config: The built ``KananaVVisionConfig``.
        projector_config: The built ``KananaVVisualProjectorConfig``.
        text_config: The built ``KananaLanguageConfig``.
    """

    model_type = "kanana-1.5-v"
    is_composition = True

    def __init__(
        self,
        vision_config=None,
        projector_config=None,
        text_config=None,
        **kwargs,
    ):
        # ``None`` defaults replace the former shared ``{}`` default objects;
        # omitting an argument still yields a default-constructed sub-config.
        super().__init__(**kwargs)
        # Vision config
        self.vision_config = self._build_sub_config(
            KananaVVisionConfig, vision_config
        )
        # Visual projector config
        self.projector_config = self._build_sub_config(
            KananaVVisualProjectorConfig, projector_config
        )
        # Language model config
        self.text_config = self._build_sub_config(
            KananaLanguageConfig, text_config
        )

    @staticmethod
    def _build_sub_config(config_cls, value):
        """Return a ``config_cls`` instance built from ``value``.

        ``None`` gives a default instance, an existing ``config_cls``
        instance is returned as-is, and anything else is unpacked as keyword
        arguments.
        """
        if value is None:
            return config_cls()
        if isinstance(value, config_cls):
            return value
        return config_cls(**value)

    @property
    def num_visual_tokens(self):
        """Always the string ``"dynamic"``."""
        return "dynamic"

    @property
    def hidden_size(self):
        """Hidden size of the language model, read from ``text_config``."""
        return self.text_config.hidden_size