"""

ZamAI Embeddings Model Setup

This script sets up the Multilingual ZamAI Embeddings model and vector database.

"""
import os
import chromadb
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex


def setup_embedding_model(corpus_path="data/text_corpus/",
                          db_path="./models/embeddings/chroma_db",
                          model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
    """

    Set up the embedding model and vector database for multilingual document retrieval.

    

    Args:

        corpus_path: Path to the text corpus directory

        db_path: Path where the ChromaDB database will be stored

        model_name: Name of the HuggingFace embedding model to use

    

    Returns:

        query_engine: A query engine for searching the indexed documents

    """
    # Ensure directories exist
    os.makedirs(corpus_path, exist_ok=True)
    os.makedirs(os.path.dirname(db_path), exist_ok=True)
    
    # Load documents if corpus directory has files
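    # (SimpleDirectoryReader auto-detects common formats such as .txt, .md,
    # .pdf, and .docx and loads each file as a document)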
    if os.listdir(corpus_path):
        text_docs = SimpleDirectoryReader(corpus_path).load_data()
    else:
        print(f"Warning: No documents found in {corpus_path}")
        text_docs = []
    
    # Initialize embedding model
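    # paraphrase-multilingual-MiniLM-L12-v2 produces 384-dimensional embeddings
    # and covers 50+ languages, which keeps the index compact while remaining
    # multilingual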
    embed_model = HuggingFaceEmbedding(model_name=model_name)
    
    # Initialize ChromaDB
    chroma_client = chromadb.PersistentClient(path=db_path)
    collection = chroma_client.get_or_create_collection("zamAI_collection")
    vector_store = ChromaVectorStore(chroma_collection=collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
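    # The collection is persisted on disk under db_path, so later runs can
    # reattach to the same index without re-embedding documents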
    
    # Build or load index if we have documents
    if text_docs:
        index = VectorStoreIndex.from_documents(
            text_docs, storage_context=storage_context, embed_model=embed_model
        )
    else:
        # If no documents yet, attach the index to the existing vector store.
        # from_vector_store builds its own storage context internally, so
        # passing storage_context here would raise a duplicate-argument error.
        index = VectorStoreIndex.from_vector_store(
            vector_store=vector_store,
            embed_model=embed_model
        )
    
    # Create a query engine (uses the globally configured LLM, which
    # LlamaIndex defaults to OpenAI)
    query_engine = index.as_query_engine()
    
    return {
        "index": index,
        "query_engine": query_engine,
        "embed_model": embed_model,
        "vector_store": vector_store
    }

if __name__ == "__main__":
    # Example usage
    embedding_components = setup_embedding_model()
    print("Embedding model and vector store setup complete!")
    print("You can now use the embedding_components['query_engine'] to search your documents.")