"""
ZamAI Embeddings Model Setup
This script sets up the Multilingual ZamAI Embeddings model and vector database.
"""
import os
import chromadb
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore


def setup_embedding_model(corpus_path="data/text_corpus/",
                          db_path="./models/embeddings/chroma_db",
                          model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
"""
Set up the embedding model and vector database for multilingual document retrieval.
Args:
corpus_path: Path to the text corpus directory
db_path: Path where the ChromaDB database will be stored
model_name: Name of the HuggingFace embedding model to use
Returns:
query_engine: A query engine for searching the indexed documents
"""
    # Ensure directories exist
    os.makedirs(corpus_path, exist_ok=True)
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    # Load documents if the corpus directory has files
    if os.listdir(corpus_path):
        text_docs = SimpleDirectoryReader(corpus_path).load_data()
    else:
        print(f"Warning: No documents found in {corpus_path}")
        text_docs = []

    # Initialize embedding model
    embed_model = HuggingFaceEmbedding(model_name=model_name)
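    # Optional sanity check (a sketch, left commented out; the sample sentence
    # is illustrative only). paraphrase-multilingual-MiniLM-L12-v2 produces
    # 384-dimensional vectors.
    # vec = embed_model.get_text_embedding("ZamAI test sentence")
    # assert len(vec) == 384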

    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection("zamAI_collection")
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Build the index from documents, or attach to the existing vector store
    if text_docs:
        index = VectorStoreIndex.from_documents(
            text_docs, storage_context=storage_context, embed_model=embed_model
        )
    else:
        # No documents yet: wrap the (possibly pre-populated) vector store.
        # from_vector_store builds its own storage context, so the one above
        # is only used on the from_documents path.
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=embed_model,
        )

    # Create a query engine
    query_engine = index.as_query_engine()

    return {
        "index": index,
        "query_engine": query_engine,
        "embed_model": embed_model,
        "vector_store": vector_store,
    }


if __name__ == "__main__":
    # Example usage
    embedding_components = setup_embedding_model()
    print("Embedding model and vector store setup complete!")
    print("You can now use embedding_components['query_engine'] to search your documents.")