"""
ZamAI Embeddings Model Setup

This script sets up the Multilingual ZamAI Embeddings model and vector database.
"""
|
|
import os
|
|
import chromadb
|
|
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
|
|
from llama_index.vector_stores.chroma import ChromaVectorStore
|
|
from llama_index.core import StorageContext, VectorStoreIndex
|
|
from llama_index.readers.file import SimpleDirectoryReader
|
|
|
|
def setup_embedding_model(corpus_path="data/text_corpus/",
|
|
db_path="./models/embeddings/chroma_db",
|
|
model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
|
|
"""
|
|
Set up the embedding model and vector database for multilingual document retrieval.
|
|
|
|
Args:
|
|
corpus_path: Path to the text corpus directory
|
|
db_path: Path where the ChromaDB database will be stored
|
|
model_name: Name of the HuggingFace embedding model to use
|
|
|
|
Returns:
|
|
query_engine: A query engine for searching the indexed documents
|
|
"""
|
|
|
|
os.makedirs(corpus_path, exist_ok=True)
|
|
os.makedirs(os.path.dirname(db_path), exist_ok=True)
|
|
|
|
|
|
if os.listdir(corpus_path):
|
|
text_docs = SimpleDirectoryReader(corpus_path).load_data()
|
|
else:
|
|
print(f"Warning: No documents found in {corpus_path}")
|
|
text_docs = []
|
|
|
|
|
|
embed_model = HuggingFaceEmbedding(model_name=model_name)
|
|
|
|
|
|
chroma_client = chromadb.PersistentClient(path=db_path)
|
|
collection = chroma_client.get_or_create_collection("zamAI_collection")
|
|
vector_store = ChromaVectorStore(chroma_collection=collection)
|
|
storage_context = StorageContext.from_defaults(vector_store=vector_store)
|
|
|
|
|
|
if text_docs:
|
|
index = VectorStoreIndex.from_documents(
|
|
text_docs, storage_context=storage_context, embed_model=embed_model
|
|
)
|
|
else:
|
|
|
|
index = VectorStoreIndex.from_vector_store(
|
|
vector_store=vector_store,
|
|
embed_model=embed_model,
|
|
storage_context=storage_context
|
|
)
|
|
|
|
|
|
query_engine = index.as_query_engine()
|
|
|
|
return {
|
|
"index": index,
|
|
"query_engine": query_engine,
|
|
"embed_model": embed_model,
|
|
"vector_store": vector_store
|
|
}
|
|
|
|
if __name__ == "__main__":
|
|
|
|
embedding_components = setup_embedding_model()
|
|
print("Embedding model and vector store setup complete!")
|
|
print("You can now use the embedding_components['query_engine'] to search your documents.")
|
|
|