""" ZamAI Embeddings Model Setup This script sets up the Multilingual ZamAI Embeddings model and vector database. """ import os import chromadb from llama_index.embeddings.huggingface import HuggingFaceEmbedding from llama_index.vector_stores.chroma import ChromaVectorStore from llama_index.core import StorageContext, VectorStoreIndex from llama_index.readers.file import SimpleDirectoryReader def setup_embedding_model(corpus_path="data/text_corpus/", db_path="./models/embeddings/chroma_db", model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"): """ Set up the embedding model and vector database for multilingual document retrieval. Args: corpus_path: Path to the text corpus directory db_path: Path where the ChromaDB database will be stored model_name: Name of the HuggingFace embedding model to use Returns: query_engine: A query engine for searching the indexed documents """ # Ensure directories exist os.makedirs(corpus_path, exist_ok=True) os.makedirs(os.path.dirname(db_path), exist_ok=True) # Load documents if corpus directory has files if os.listdir(corpus_path): text_docs = SimpleDirectoryReader(corpus_path).load_data() else: print(f"Warning: No documents found in {corpus_path}") text_docs = [] # Initialize embedding model embed_model = HuggingFaceEmbedding(model_name=model_name) # Initialize ChromaDB chroma_client = chromadb.PersistentClient(path=db_path) collection = chroma_client.get_or_create_collection("zamAI_collection") vector_store = ChromaVectorStore(chroma_collection=collection) storage_context = StorageContext.from_defaults(vector_store=vector_store) # Build or load index if we have documents if text_docs: index = VectorStoreIndex.from_documents( text_docs, storage_context=storage_context, embed_model=embed_model ) else: # If no documents yet, just initialize the index with the embedding model index = VectorStoreIndex.from_vector_store( vector_store=vector_store, embed_model=embed_model, storage_context=storage_context ) # Create a query engine query_engine = index.as_query_engine() return { "index": index, "query_engine": query_engine, "embed_model": embed_model, "vector_store": vector_store } if __name__ == "__main__": # Example usage embedding_components = setup_embedding_model() print("Embedding model and vector store setup complete!") print("You can now use the embedding_components['query_engine'] to search your documents.")