sarashina2-vision-14b / configuration_sarashina2_vision.py

update

073ed96 verified 3 months ago

2.92 kB

	# coding=utf-8
	# Copyright 2025 the SB Intuitions.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""Sarashina2Vision model configuration"""

	from typing import Any, Optional

	from transformers import LlamaConfig, PretrainedConfig
	from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig
	from transformers.utils import logging

	# Module-level logger obtained from the `transformers.utils.logging` helper.
	logger = logging.get_logger(__name__)


	class Sarashina2VisionConfig(PretrainedConfig):
	    """
	    This is the configuration class to store the configuration of a [`Sarashina2VisionModel`]. It is used to
	    instantiate a Sarashina2Vision model according to the specified arguments, defining the model architecture.

	    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
	    documentation from [`PretrainedConfig`] for more information.

	    Args:
	        vision_config (`dict` or `Qwen2VLVisionConfig`, *optional*):
	            The config for the visual encoder initialization. A `dict` is expanded into
	            `Qwen2VLVisionConfig(**vision_config)`, an existing `Qwen2VLVisionConfig` instance is stored as is,
	            and `None` yields a default-constructed `Qwen2VLVisionConfig()`.
	        text_config (`dict` or `LlamaConfig`, *optional*):
	            The config for the text decoder initialization. A `dict` is expanded into
	            `LlamaConfig(**text_config)`, an existing `LlamaConfig` instance is stored as is, and `None` yields
	            a default-constructed `LlamaConfig()`.
	        image_token_index (`int`, *optional*, defaults to 14):
	            image token id.
	        start_image_token_index (`int`, *optional*, defaults to 102397):
	            start image token id.
	        end_image_token_index (`int`, *optional*, defaults to 102398):
	            end image token id.
	        **kwargs:
	            Remaining keyword arguments, forwarded unchanged to `PretrainedConfig.__init__`.

	    Raises:
	        TypeError: If `vision_config` or `text_config` is neither `None`, a `dict`, nor an instance of the
	            expected config class.
	    """

	    model_type = "sarashina2_vision"

	    def __init__(
	        self,
	        vision_config: Optional[dict[str, Any]] = None,
	        text_config: Optional[dict[str, Any]] = None,
	        image_token_index: int = 14,
	        start_image_token_index: int = 102397,
	        end_image_token_index: int = 102398,
	        **kwargs,
	    ):
	        # Resolve both sub-configs through one helper so an unsupported argument
	        # fails immediately instead of leaving the attribute silently unset.
	        self.text_config = self._build_sub_config(text_config, LlamaConfig, "text_config")
	        self.vision_config = self._build_sub_config(vision_config, Qwen2VLVisionConfig, "vision_config")

	        self.image_token_index = image_token_index
	        self.start_image_token_index = start_image_token_index
	        self.end_image_token_index = end_image_token_index

	        # The base initializer runs last, after the sub-configs and token ids are set.
	        super().__init__(**kwargs)

	    @staticmethod
	    def _build_sub_config(value, config_cls, arg_name):
	        """Return `value` normalized to an instance of `config_cls`.

	        Args:
	            value: `None`, a `dict` of constructor keyword arguments, or an instance of `config_cls`.
	            config_cls: The config class the result must be an instance of.
	            arg_name: Name of the constructor argument, used only in the error message.

	        Returns:
	            A default `config_cls()` for `None`, `config_cls(**value)` for a `dict`, or `value` itself when it
	            already is an instance of `config_cls`.

	        Raises:
	            TypeError: If `value` is of any other type.
	        """
	        if value is None:
	            return config_cls()
	        if isinstance(value, dict):
	            return config_cls(**value)
	        if isinstance(value, config_cls):
	            return value
	        raise TypeError(
	            f"`{arg_name}` must be a dict, a `{config_cls.__name__}` instance, or None, "
	            f"but got {type(value).__name__}."
	        )