# models_config.py
"""Registry of embedding-model configurations and alias resolution."""

# Canonical model key -> settings. Every entry carries "name", "dimension",
# "requires_remote_code" and "max_tokens"; an entry may add instruction-prefix
# settings (see "nomic-embed-text-v1.5").
CANONICAL_MODELS = {
    "all-MiniLM-L6-v2": {
        "name": "sentence-transformers/all-MiniLM-L6-v2",
        "dimension": 384,
        "requires_remote_code": False,
        "max_tokens": 512,
    },
    "gte-multilingual-base": {
        "name": "Alibaba-NLP/gte-multilingual-base",
        "dimension": 768,
        "requires_remote_code": True,
        "max_tokens": 8192,
    },
    "nomic-embed-text-v1.5": {
        "name": "nomic-ai/nomic-embed-text-v1.5",
        "dimension": 768,
        "requires_remote_code": True,
        "max_tokens": 8192,
        "instruction_prefix_required": True,
        "default_instruction_prefix": "search_document:",
        "known_instruction_prefixes": [
            "search_document:",
            "search_query:",
            "clustering:",
            "classification:",
        ],
    },
    "all-mpnet-base-v2": {
        "name": "sentence-transformers/all-mpnet-base-v2",
        "dimension": 768,
        "requires_remote_code": False,
        "max_tokens": 384,
    },
}

# Mapping of aliases to their canonical model names.
MODEL_ALIASES = {
    "all-minilm": "all-MiniLM-L6-v2",
    "text-embedding-3-small": "all-MiniLM-L6-v2",
    "text-embedding-3-large": "gte-multilingual-base",
    "nomic-embed-text": "nomic-embed-text-v1.5",
}

# This global MODELS dictionary is used for listing available models and for
# validation. It combines canonical names and aliases for easy lookup. Alias
# entries reference the same dict objects as their canonical entries, and a
# KeyError is raised at import time if an alias targets an unknown canonical
# name.
MODELS = {
    **CANONICAL_MODELS,
    **{
        alias: CANONICAL_MODELS[canonical]
        for alias, canonical in MODEL_ALIASES.items()
    },
}


def get_model_config(requested_model_name: str) -> dict:
    """Resolve a model name (possibly an alias) to its canonical configuration.

    Args:
        requested_model_name: A key of ``CANONICAL_MODELS`` or of
            ``MODEL_ALIASES``. Matching is exact (case-sensitive).

    Returns:
        The configuration dict stored in ``CANONICAL_MODELS``. This is the
        shared registry entry, not a copy, so callers should treat it as
        read-only.

    Raises:
        ValueError: If the name is neither a canonical model nor an alias,
            or if it is not a string at all.
    """
    if isinstance(requested_model_name, str):
        canonical_name = MODEL_ALIASES.get(
            requested_model_name, requested_model_name
        )
        config = CANONICAL_MODELS.get(canonical_name)
    else:
        # Non-string input can never match a key. Skipping the dict lookups
        # keeps unhashable values (e.g. a list) from escaping as TypeError
        # instead of the documented ValueError.
        canonical_name = requested_model_name
        config = None

    if config is None:
        raise ValueError(
            f"Model '{requested_model_name}' (canonical: '{canonical_name}') "
            f"is not a recognized model."
        )
    return config