Spaces:
Runtime error
Runtime error
from qdrant_client import QdrantClient | |
from qdrant_client.http import models as rest | |
from langchain_qdrant import QdrantVectorStore | |
import hashlib | |
import defaults | |
# Embedding model handed to QdrantVectorStore for every read and write;
# taken from the `defaults` module.
embedding_model = defaults.default_embedding_model
# URL of the Qdrant server, used both by the raw client and the vector store.
qdrant_url = defaults.default_url
""" | |
This module creates a hash for every chunk and checks whether that chunk already exists in
the vector database. We keep a single collection in Qdrant, and we want to make sure that if
a user selects a document that has already been embedded and stored, it is not stored again.
We also add the document title to each chunk's metadata so that the retriever can be
restricted to the documents of interest. For example, after some usage the application might
offer 20 documents for the user to select from, and the retriever should return results only
from the documents they selected.
This is also useful when several versions of a document exist: we do not want to recreate a
large vector store, but the user can still select the most recent version.
""" | |
def get_document_hash(doc_content: str) -> str:
    """Return the hexadecimal MD5 digest of ``doc_content``.

    The digest is used purely as a content fingerprint for de-duplicating
    chunks, not for anything security-sensitive. The text is encoded as
    UTF-8 before hashing, so identical text always yields the same digest.
    """
    return hashlib.md5(doc_content.encode("utf-8")).hexdigest()
def _find_existing_hashes(client, collection_name, content_hashes, page_size=256):
    """Return the subset of ``content_hashes`` already stored in the collection.

    Args:
        client: ``QdrantClient`` connected to the server holding the collection.
        collection_name: Name of the collection to search.
        content_hashes: Iterable of hash strings to look up under the
            ``metadata.content_hash`` payload key.
        page_size: Maximum number of points requested per scroll call.

    Returns:
        A set containing every requested hash that at least one stored point
        carries.
    """
    wanted = set(content_hashes)
    found = set()
    if not wanted:
        # Nothing to look up; avoids a scroll with an empty filter.
        return found

    # One "match any" condition over the unique hashes instead of one
    # condition per chunk.
    scroll_filter = rest.Filter(
        must=[
            rest.FieldCondition(
                key="metadata.content_hash",
                match=rest.MatchAny(any=sorted(wanted)),
            )
        ]
    )

    # Page through every match: several stored points may share one hash, so
    # a single page capped at the number of chunks can hide other hashes.
    offset = None
    while True:
        points, offset = client.scroll(
            collection_name=collection_name,
            scroll_filter=scroll_filter,
            limit=page_size,
            offset=offset,
        )
        for point in points:
            metadata = (point.payload or {}).get('metadata') or {}
            found.add(metadata.get('content_hash'))
        # Stop at the last page, or as soon as every hash has been seen.
        if offset is None or found >= wanted:
            break

    # Drop anything that is not one of the requested hashes (e.g. None).
    return found & wanted


def getVectorstore(document, file_name):
    """Store the not-yet-seen chunks of a document in Qdrant and return the store.

    Every chunk in ``document`` is tagged in place with three metadata keys:
    ``content_hash`` (digest of its text), ``document_title`` (``file_name``)
    and ``source`` (``"source_<index>"``). If the ``protocol_collection``
    collection already exists, only chunks whose hash is not yet stored are
    added; otherwise the collection is created from all chunks.

    Args:
        document: Sized sequence of chunk objects exposing ``page_content``
            and a mutable ``metadata`` dict (presumably LangChain
            ``Document`` instances -- confirm against the caller).
        file_name: Title written to each chunk's ``document_title`` metadata.

    Returns:
        The ``QdrantVectorStore`` bound to ``protocol_collection``.
    """
    # Tag every chunk with its hash, its document title and a positional source.
    for index, doc in enumerate(document):
        doc.metadata['content_hash'] = get_document_hash(doc.page_content)
        doc.metadata['document_title'] = file_name
        doc.metadata['source'] = f"source_{index}"

    collection_name = "protocol_collection"

    # This client is only needed for the existence check and the hash lookup;
    # the vector store opens its own connection, so release it afterwards.
    client = QdrantClient(url=qdrant_url)
    try:
        if not client.collection_exists(collection_name):
            print("Collection does not exist")  # So we go ahead and just add the documents
            return QdrantVectorStore.from_documents(
                documents=document,
                embedding=embedding_model,
                collection_name=collection_name,
                url=qdrant_url,
            )

        # The collection exists, so find out which chunks are already stored
        # to avoid storing them again.
        print("Collection exists")
        existing_hashes = _find_existing_hashes(
            client,
            collection_name,
            (doc.metadata['content_hash'] for doc in document),
        )
    finally:
        client.close()

    qdrant_vectorstore = QdrantVectorStore.from_existing_collection(
        embedding=embedding_model,
        collection_name=collection_name,
        url=qdrant_url,
    )

    new_docs = [
        doc for doc in document
        if doc.metadata['content_hash'] not in existing_hashes
    ]
    if new_docs:
        qdrant_vectorstore.add_documents(new_docs)

    print(f"Added {len(new_docs)} new documents")
    # Count the chunks actually skipped, not the number of distinct hashes found.
    print(f"Skipped {len(document) - len(new_docs)} existing documents")
    return qdrant_vectorstore