"""
ZamAI Embeddings Model Setup
This script sets up the Multilingual ZamAI Embeddings model and vector database.
"""
import os
import chromadb
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore


def setup_embedding_model(corpus_path="data/text_corpus/",
                          db_path="./models/embeddings/chroma_db",
                          model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
"""
Set up the embedding model and vector database for multilingual document retrieval.
Args:
corpus_path: Path to the text corpus directory
db_path: Path where the ChromaDB database will be stored
model_name: Name of the HuggingFace embedding model to use
Returns:
query_engine: A query engine for searching the indexed documents
"""
    # Ensure directories exist
    os.makedirs(corpus_path, exist_ok=True)
    os.makedirs(os.path.dirname(db_path), exist_ok=True)

    # Load documents if the corpus directory has files
    if os.listdir(corpus_path):
        text_docs = SimpleDirectoryReader(corpus_path).load_data()
    else:
        print(f"Warning: No documents found in {corpus_path}")
        text_docs = []

    # Initialize embedding model
    embed_model = HuggingFaceEmbedding(model_name=model_name)
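    # Optional sanity check (a sketch, left commented out; the sample sentence
    # is illustrative only). paraphrase-multilingual-MiniLM-L12-v2 produces
    # 384-dimensional vectors.
    # vec = embed_model.get_text_embedding("ZamAI test sentence")
    # assert len(vec) == 384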

    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection("zamAI_collection")
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # Build the index from documents, or attach to the existing vector store
    if text_docs:
        index = VectorStoreIndex.from_documents(
            text_docs, storage_context=storage_context, embed_model=embed_model
        )
    else:
        # No documents yet: wrap the (possibly pre-populated) vector store.
        # from_vector_store builds its own storage context, so the one above
        # is only used on the from_documents path.
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=embed_model,
        )

    # Create a query engine
    query_engine = index.as_query_engine()

    return {
        "index": index,
        "query_engine": query_engine,
        "embed_model": embed_model,
        "vector_store": vector_store,
    }


if __name__ == "__main__":
    # Example usage
    embedding_components = setup_embedding_model()
    print("Embedding model and vector store setup complete!")
    print("You can now use embedding_components['query_engine'] to search your documents.")