bstraehle committed on
Commit
800f223
·
verified ·
1 Parent(s): 497da77

Delete mongodb_.py

Browse files
Files changed (1) hide show
  1. mongodb_.py +0 -113
mongodb_.py DELETED
@@ -1,113 +0,0 @@
1
import os
import time

from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel
5
-
6
# Names of the database and collection used by get_db_collection().
database_name = "airbnb_dataset"
collection_name = "listings_reviews"


def get_db_collection(listings):
    """Connect to the cluster and reload the listings collection.

    The connection string is read from the ``MONGODB_ATLAS_CLUSTER_URI``
    environment variable. All documents currently in the collection are
    deleted before ``listings`` is inserted, so the collection ends up
    holding exactly the documents passed in.

    Args:
        listings: Documents to insert. When empty, the collection is only
            cleared and nothing is inserted.

    Returns:
        tuple: ``(db, collection)`` - the database handle and the reloaded
        collection.

    Raises:
        KeyError: If ``MONGODB_ATLAS_CLUSTER_URI`` is not set.
    """
    mongo_client = MongoClient(
        os.environ["MONGODB_ATLAS_CLUSTER_URI"], appname="advanced-rag"
    )
    db = mongo_client.get_database(database_name)
    collection = db.get_collection(collection_name)

    # Drop the previous contents so repeated runs do not accumulate duplicates.
    collection.delete_many({})

    # insert_many() rejects an empty document list, so skip the call when
    # there is nothing to insert instead of failing after the delete.
    if listings:
        collection.insert_many(listings)

    return db, collection
19
-
20
def create_vector_search_index(collection):
    """Create the search index over the text-embedding field if it is missing.

    The index is named ``vector_index_text`` and maps the ``text_embeddings``
    field as a 1536-dimension ``knnVector`` compared with cosine similarity.
    Progress and errors are reported with ``print``; failures are not
    re-raised (best effort).

    Args:
        collection: The collection on which to create the search index.

    Returns:
        None
    """
    text_embedding_field_name = "text_embeddings"
    vector_search_index_name_text = "vector_index_text"

    vector_search_index_model = SearchIndexModel(
        definition={
            # Describes how fields in the documents are indexed and stored.
            "mappings": {
                # Automatically index new fields that appear in a document.
                "dynamic": True,
                # Properties of the explicitly indexed fields.
                "fields": {
                    text_embedding_field_name: {
                        "dimensions": 1536,  # size of the vector
                        "similarity": "cosine",  # vector similarity measure
                        "type": "knnVector",
                    }
                },
            }
        },
        name=vector_search_index_name_text,  # identifier of the search index
    )

    try:
        # Search indexes are reported by list_search_indexes();
        # list_indexes() only covers regular indexes and would never
        # contain this index, so the existence check must use the former.
        index_exists = False
        for index in collection.list_search_indexes():
            print(index)
            if index["name"] == vector_search_index_name_text:
                index_exists = True
                break

        if index_exists:
            print(f"Index '{vector_search_index_name_text}' already exists.")
            return

        result = collection.create_search_index(model=vector_search_index_model)
        print("Creating index...")
        # Give the new index time to complete its initial sync before use.
        time.sleep(20)
        print("Index created successfully:", result)
        print("Wait a few minutes before conducting search with index to ensure index intialization")
    except Exception as e:
        # Best effort: report the failure and let the caller continue.
        print(f"Error creating vector search index: {str(e)}")
61
-
62
def vector_search(user_query, db, collection, vector_index="vector_index_text"):
    """Run a ``$vectorSearch`` aggregation for the user query.

    The query is embedded with ``get_embedding`` and matched against the
    ``text_embeddings`` field. The same pipeline is then sent through the
    ``explain`` command so the server-side execution time can be printed.

    Args:
        user_query (str): The user's query string.
        db: Database object used to issue the ``explain`` command.
        collection: Collection to search.
        vector_index (str): Name of the search index to query.

    Returns:
        list: The matching documents (at most 20). If no embedding could be
        produced, the string ``"Invalid query or embedding generation
        failed."`` is returned instead.
    """
    # Field holding the document vectors; must match the path indexed by
    # create_vector_search_index(). Defined here because that function's
    # variable of the same name is local to it and not visible in this scope.
    text_embedding_field_name = "text_embeddings"

    # NOTE(review): get_embedding is neither defined nor imported in this
    # module - confirm where it is expected to come from.
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    vector_search_stage = {
        "$vectorSearch": {
            "index": vector_index,  # index to use for the search
            "queryVector": query_embedding,  # vector representing the query
            "path": text_embedding_field_name,  # field containing the vectors
            "numCandidates": 150,  # number of candidate matches to consider
            "limit": 20,  # return the top 20 matches
        }
    }

    pipeline = [vector_search_stage]

    # Execute the search.
    results = collection.aggregate(pipeline)

    # Ask the server how it executes the same pipeline so that the
    # execution time of the vector search stage can be reported.
    explain_query_execution = db.command(
        'explain',
        {
            'aggregate': collection.name,  # collection the pipeline runs on
            'pipeline': pipeline,  # the aggregation pipeline to analyze
            'cursor': {},  # use default cursor behavior
        },
        verbosity='executionStats',  # include per-stage execution statistics
    )

    vector_search_explain = explain_query_execution['stages'][0]['$vectorSearch']
    millis_elapsed = vector_search_explain['explain']['collectStats']['millisElapsed']

    print(f"Total time for the execution to complete on the database server: {millis_elapsed} milliseconds")

    return list(results)