Spaces:

bstraehle
/

advanced-rag

Sleeping

App Files Files Community

bstraehle commited on Jul 28, 2024

Commit

800f223

verified ·

1 Parent(s): 497da77

Delete mongodb_.py

Browse files

Files changed (1) hide show

mongodb_.py +0 -113

mongodb_.py DELETED Viewed

@@ -1,113 +0,0 @@
-import time
-from pymongo.mongo_client import MongoClient
-from pymongo.operations import SearchIndexModel
-database_name = "airbnb_dataset"
-collection_name = "listings_reviews"
-def get_db_collection(listings):
-    mongo_client = MongoClient(os.environ["MONGODB_ATLAS_CLUSTER_URI"], appname="advanced-rag")
-    db = mongo_client.get_database(database_name)
-    collection = db.get_collection(collection_name)
-    collection.delete_many({})
-    collection.insert_many(listings)
-    return db, collection
-def create_vector_search_index(collection):
-    text_embedding_field_name = "text_embeddings"
-    vector_search_index_name_text = "vector_index_text"
-    vector_search_index_model = SearchIndexModel(
-        definition={
-            "mappings": { # describes how fields in the database documents are indexed and stored
-                "dynamic": True, # automatically index new fields that appear in the document
-                "fields": { # properties of the fields that will be indexed.
-                    text_embedding_field_name: {
-                        "dimensions": 1536, # size of the vector.
-                        "similarity": "cosine", # algorithm used to compute the similarity between vectors
-                        "type": "knnVector",
-                    }
-                },
-            }
-        },
-        name=vector_search_index_name_text, # identifier for the vector search index
-    )
-    # Check if the index already exists
-    index_exists = False
-    for index in collection.list_indexes():
-        print(index)
-        if index['name'] == vector_search_index_name_text:
-            index_exists = True
-            break
-    # Create the index if it doesn't exist
-    if not index_exists:
-        try:
-            result = collection.create_search_index(model=vector_search_index_model)
-            print("Creating index...")
-            time.sleep(20)  # Sleep for 20 seconds, adding sleep to ensure vector index has compeleted inital sync before utilization
-            print("Index created successfully:", result)
-            print("Wait a few minutes before conducting search with index to ensure index intialization")
-        except Exception as e:
-            print(f"Error creating vector search index: {str(e)}")
-    else:
-        print(f"Index '{vector_search_index_name_text}' already exists.")
-def vector_search(user_query, db, collection, vector_index="vector_index_text"):
-    """
-    Perform a vector search in the MongoDB collection based on the user query.
-    Args:
-    user_query (str): The user's query string.
-    db (MongoClient.database): The database object.
-    collection (MongoCollection): The MongoDB collection to search.
-    additional_stages (list): Additional aggregation stages to include in the pipeline.
-    Returns:
-    list: A list of matching documents.
-    """
-    # Generate embedding for the user query
-    query_embedding = get_embedding(user_query)
-    if query_embedding is None:
-        return "Invalid query or embedding generation failed."
-    # Define the vector search stage
-    vector_search_stage = {
-        "$vectorSearch": {
-            "index": vector_index, # specifies the index to use for the search
-            "queryVector": query_embedding, # the vector representing the query
-            "path": text_embedding_field_name, # field in the documents containing the vectors to search against
-            "numCandidates": 150, # number of candidate matches to consider
-            "limit": 20 # return top 20 matches
-        }
-    }
-    # Define the aggregate pipeline with the vector search stage and additional stages
-    pipeline = [vector_search_stage]
-    # Execute the search
-    results = collection.aggregate(pipeline)
-    explain_query_execution = db.command( # sends a database command directly to the MongoDB server
-        'explain', { # return information about how MongoDB executes a query or command without actually running it
-            'aggregate': collection.name, # specifies the name of the collection on which the aggregation is performed
-            'pipeline': pipeline, # the aggregation pipeline to analyze
-            'cursor': {} # indicates that default cursor behavior should be used
-        },
-        verbosity='executionStats') # detailed statistics about the execution of each stage of the aggregation pipeline
-    vector_search_explain = explain_query_execution['stages'][0]['$vectorSearch']
-    millis_elapsed = vector_search_explain['explain']['collectStats']['millisElapsed']
-    print(f"Total time for the execution to complete on the database server: {millis_elapsed} milliseconds")
-    return list(results)