File size: 3,766 Bytes
1f49ee0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e4b78a
1f49ee0
 
 
 
 
 
5e4b78a
 
 
 
 
1f49ee0
5e4b78a
 
 
2edf2fb
5e4b78a
 
1f49ee0
 
5e4b78a
1f49ee0
 
 
5e4b78a
2edf2fb
 
1f49ee0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e4b78a
1f49ee0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5e4b78a
2edf2fb
 
1f49ee0
5e4b78a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest
from langchain_qdrant import QdrantVectorStore
import hashlib
import defaults

# Module-level configuration read from the project's `defaults` module.
# `embedding_model` is passed as the `embedding` argument when the vector store is built,
# and `qdrant_url` is the URL used to reach the Qdrant server.
embedding_model = defaults.default_embedding_model
qdrant_url = defaults.default_url

"""
This code creates a hash for every chunk and checks to see if that chunk already exists in the
vector database.  We only want one collection in Qdrant, but want to make sure that if a user
selects a document that has already been embedded and stored, it does not get stored again.  We
also add metadata for the document title, so that we can make our retriever focus on documents of
interest.  For example, after some usage, the application might have 20 documents for the user to
select from.  We want the retriever to draw only on the documents that they selected.

This is also useful when several versions of a document exist: the large vectorstore does not have
to be recreated, and the user can simply select the most recent version.
"""

def get_document_hash(doc_content):
    """Return the hexadecimal MD5 digest of ``doc_content``.

    The text is encoded with ``str.encode()`` (UTF-8 by default) before hashing,
    so identical text always yields the same 32-character hex string.
    """
    hasher = hashlib.md5()
    hasher.update(doc_content.encode())
    return hasher.hexdigest()

def _fetch_existing_hashes(client, collection_name, candidate_hashes, page_size=256):
    """Return the members of ``candidate_hashes`` already stored in the collection.

    Args:
        client: ``QdrantClient`` used to query the collection.
        collection_name: Name of the Qdrant collection to inspect.
        candidate_hashes: Set of ``content_hash`` strings to look up.
        page_size: Number of points requested per scroll page.

    Returns:
        A set with every candidate hash found in a point's
        ``metadata.content_hash`` payload field.
    """
    # Nothing to look up; also avoids sending an empty match list to Qdrant.
    if not candidate_hashes:
        return set()

    scroll_filter = rest.Filter(
        must=[
            rest.FieldCondition(
                key="metadata.content_hash",
                # One "match any" condition instead of one condition per chunk.
                match=rest.MatchAny(any=sorted(candidate_hashes)),
            )
        ]
    )

    found = set()
    offset = None
    while True:
        # The collection may contain several points with the same hash, so a
        # single page is not guaranteed to cover every distinct hash: keep
        # following the returned offset until the scroll is exhausted.
        points, offset = client.scroll(
            collection_name=collection_name,
            scroll_filter=scroll_filter,
            limit=page_size,
            offset=offset,
            with_payload=True,
            with_vectors=False,
        )
        for point in points:
            metadata = (point.payload or {}).get('metadata') or {}
            stored_hash = metadata.get('content_hash')
            if stored_hash is not None:
                found.add(stored_hash)
        # Stop at the last page, or as soon as every candidate has been seen.
        if offset is None or found >= candidate_hashes:
            break
    return found


def getVectorstore(document, file_name):
    """Return the Qdrant vector store for ``document``, adding only unseen chunks.

    Every chunk in ``document`` is tagged in place with three metadata entries:
    ``content_hash`` (see ``get_document_hash``), ``document_title`` (set to
    ``file_name``) and ``source`` (``"source_<index>"``).  All chunks live in
    the single ``protocol_collection`` collection:

    * if the collection does not exist, it is created from all chunks;
    * if it exists, only chunks whose ``content_hash`` is not already stored
      are added.

    Args:
        document: Sequence of chunk objects exposing ``page_content`` and a
            mutable ``metadata`` mapping.
        file_name: Title stored on every chunk as ``document_title``.

    Returns:
        The ``QdrantVectorStore`` bound to the collection.
    """
    # Tag each chunk with its hash, document title and positional source id.
    for i, doc in enumerate(document):
        doc.metadata['content_hash'] = get_document_hash(doc.page_content)
        doc.metadata['document_title'] = file_name
        doc.metadata['source'] = f"source_{i}"

    collection_name = "protocol_collection"
    client = QdrantClient(url=qdrant_url)

    if not client.collection_exists(collection_name):
        # No collection yet, so there is nothing to de-duplicate against.
        print("Collection does not exist")
        return QdrantVectorStore.from_documents(
            documents=document,
            embedding=embedding_model,
            collection_name=collection_name,
            url=qdrant_url,
        )

    print("Collection exists")
    qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
        embedding=embedding_model,
        collection_name=collection_name,
        url=qdrant_url,
    )

    # Only store chunks whose content hash is not already in the collection.
    candidate_hashes = {doc.metadata['content_hash'] for doc in document}
    existing_hashes = _fetch_existing_hashes(client, collection_name, candidate_hashes)
    new_docs = [
        doc for doc in document
        if doc.metadata['content_hash'] not in existing_hashes
    ]

    if new_docs:
        qdrant_vectorstore.add_documents(new_docs)

    print(f"Added {len(new_docs)} new documents")
    # Count the chunks actually skipped, not the number of distinct hashes found.
    print(f"Skipped {len(document) - len(new_docs)} existing documents")
    return qdrant_vectorstore