# Spaces: Running
# models_config.py

# Registry of supported embedding models, keyed by canonical short name.
# Every entry carries:
#   "name"                 - full model identifier (presumably the Hugging Face
#                            Hub repository id; confirm against the loader)
#   "dimension"            - integer size associated with the model's output
#   "requires_remote_code" - whether the model is flagged as needing remote code
#   "max_tokens"           - integer token limit for the model
# An entry may also carry instruction-prefix settings
# ("instruction_prefix_required", "default_instruction_prefix",
# "known_instruction_prefixes"); only "nomic-embed-text-v1.5" does so here.
CANONICAL_MODELS = {
    "all-MiniLM-L6-v2": {
        "name": "sentence-transformers/all-MiniLM-L6-v2",
        "dimension": 384,
        "requires_remote_code": False,
        "max_tokens": 512,
    },
    "gte-multilingual-base": {
        "name": "Alibaba-NLP/gte-multilingual-base",
        "dimension": 768,
        "requires_remote_code": True,
        "max_tokens": 8192,
    },
    "nomic-embed-text-v1.5": {
        "name": "nomic-ai/nomic-embed-text-v1.5",
        "dimension": 768,
        "requires_remote_code": True,
        "max_tokens": 8192,
        "instruction_prefix_required": True,
        "default_instruction_prefix": "search_document:",
        "known_instruction_prefixes": [
            "search_document:",
            "search_query:",
            "clustering:",
            "classification:",
        ],
    },
    "all-mpnet-base-v2": {
        "name": "sentence-transformers/all-mpnet-base-v2",
        "dimension": 768,
        "requires_remote_code": False,
        "max_tokens": 384,
    },
}
# Mapping of alias names to the canonical model key each one resolves to.
# Several aliases may point at the same canonical key.
MODEL_ALIASES = {
    "all-minilm": "all-MiniLM-L6-v2",
    "text-embedding-3-small": "all-MiniLM-L6-v2",
    "text-embedding-3-large": "gte-multilingual-base",
    "nomic-embed-text": "nomic-embed-text-v1.5",
}
# This global MODELS dictionary will be used for listing available models and validation.
# It combines canonical names and aliases for easy lookup.
# Notes:
#   - Each alias key maps to the very same dict object as its canonical entry
#     (no copy is made), so mutating one is visible through the other.
#   - Alias entries are unpacked last, so an alias that shares a key with a
#     canonical name would take precedence.
#   - An alias that targets an unknown canonical key raises KeyError here, at
#     import time.
MODELS = {
    **CANONICAL_MODELS,
    **{
        alias: CANONICAL_MODELS[canonical]
        for alias, canonical in MODEL_ALIASES.items()
    },
}
def get_model_config(requested_model_name: str) -> dict:
    """
    Resolve a requested model name to its canonical configuration.

    The name is first looked up in MODEL_ALIASES; if it is not an alias it is
    treated as a canonical name as-is. Matching is exact (case-sensitive).

    Args:
        requested_model_name: A canonical model key or one of its aliases.

    Returns:
        The configuration dict stored in CANONICAL_MODELS for the resolved
        name. This is the stored object itself, not a copy.

    Raises:
        ValueError: If the resolved name is not a key of CANONICAL_MODELS.
    """
    # Fall back to the requested name itself when it is not a known alias.
    canonical_name = MODEL_ALIASES.get(requested_model_name, requested_model_name)
    if canonical_name not in CANONICAL_MODELS:
        raise ValueError(f"Model '{requested_model_name}' (canonical: '{canonical_name}') is not a recognized model.")
    return CANONICAL_MODELS[canonical_name]