Spaces:

rifatramadhani
/

wip-test

Running

App Files Files Community

wip-test / models_config.py

rifatramadhani

wip

231d431 10 days ago

raw

history blame contribute delete

2.1 kB

	# models_config.py

	# Registry of supported models, keyed by canonical short name.
	# Every entry carries "name", "dimension", "requires_remote_code" and
	# "max_tokens"; the instruction-prefix keys appear only on entries that
	# declare them (currently just nomic-embed-text-v1.5).
	# NOTE(review): "name" looks like a Hugging Face repo id and "dimension"
	# like the embedding vector size -- confirm against the code that loads them.
	CANONICAL_MODELS = {
	    "all-MiniLM-L6-v2": {
	        "name": "sentence-transformers/all-MiniLM-L6-v2",
	        "dimension": 384,
	        "requires_remote_code": False,
	        "max_tokens": 512,
	    },
	    "gte-multilingual-base": {
	        "name": "Alibaba-NLP/gte-multilingual-base",
	        "dimension": 768,
	        "requires_remote_code": True,
	        "max_tokens": 8192,
	    },
	    "nomic-embed-text-v1.5": {
	        "name": "nomic-ai/nomic-embed-text-v1.5",
	        "dimension": 768,
	        "requires_remote_code": True,
	        "max_tokens": 8192,
	        "instruction_prefix_required": True,
	        "default_instruction_prefix": "search_document:",
	        "known_instruction_prefixes": [
	            "search_document:",
	            "search_query:",
	            "clustering:",
	            "classification:",
	        ],
	    },
	    "all-mpnet-base-v2": {
	        "name": "sentence-transformers/all-mpnet-base-v2",
	        "dimension": 768,
	        "requires_remote_code": False,
	        "max_tokens": 384,
	    },
	}

	# Mapping of aliases to their canonical model names.
	# Every value must be a key of CANONICAL_MODELS; MODELS below indexes
	# CANONICAL_MODELS with it and would raise KeyError at import otherwise.
	MODEL_ALIASES = {
	    "all-minilm": "all-MiniLM-L6-v2",
	    "text-embedding-3-small": "all-MiniLM-L6-v2",
	    "text-embedding-3-large": "gte-multilingual-base",
	    "nomic-embed-text": "nomic-embed-text-v1.5",
	}

	# This global MODELS dictionary is used for listing available models and
	# validation. It merges canonical names and aliases into one flat lookup.
	# Both sides are unpacked with ** so the result is a dict; writing
	# {CANONICAL_MODELS, {...}} would be a set literal of dicts and raise
	# TypeError (unhashable type: 'dict') on import.
	# Alias entries reference the same config dict object as their canonical
	# entry (no copy is made).
	MODELS = {
	    **CANONICAL_MODELS,
	    **{alias: CANONICAL_MODELS[canonical] for alias, canonical in MODEL_ALIASES.items()},
	}

	def get_model_config(requested_model_name: str) -> dict:
	    """
	    Look up the configuration dict for a model name or alias.

	    The name is first translated through MODEL_ALIASES; a name with no
	    alias entry is used unchanged. The resulting canonical name is then
	    looked up in CANONICAL_MODELS.

	    Returns the entry stored in CANONICAL_MODELS (the stored dict itself,
	    not a copy).

	    Raises ValueError when the resolved name is not a key of
	    CANONICAL_MODELS.
	    """
	    resolved = MODEL_ALIASES.get(requested_model_name, requested_model_name)

	    # Happy path first: known canonical name -> hand back its config.
	    if resolved in CANONICAL_MODELS:
	        return CANONICAL_MODELS[resolved]

	    raise ValueError(
	        f"Model '{requested_model_name}' (canonical: '{resolved}') is not a recognized model."
	    )