Spaces:
Sleeping
Sleeping
| import openai, os, time | |
| from datasets import load_dataset | |
| from pymongo.mongo_client import MongoClient | |
# Name of the MongoDB database that connect_to_database() opens.
DB_NAME = "airbnb_dataset"

# Name of the collection inside DB_NAME that stores the listing documents.
COLLECTION_NAME = "listings_reviews"
def connect_to_database():
    """Create a MongoDB client and return the configured database and collection.

    The connection string is read from the ``MONGODB_ATLAS_CLUSTER_URI``
    environment variable; a missing variable raises ``KeyError``.

    Returns:
        A ``(db, collection)`` tuple for ``DB_NAME`` and ``COLLECTION_NAME``.
    """
    cluster_uri = os.environ["MONGODB_ATLAS_CLUSTER_URI"]
    client = MongoClient(cluster_uri, appname="advanced-rag")
    database = client.get_database(DB_NAME)
    return database, database.get_collection(COLLECTION_NAME)
def rag_ingestion(collection):
    """Replace the collection's documents with the pre-embedded Airbnb dataset.

    The ``train`` split of the dataset is opened in streaming mode, every
    existing document in ``collection`` is deleted, and the dataset is then
    inserted.

    Args:
        collection: Target collection; must provide ``delete_many`` and
            ``insert_many``.

    Returns:
        A reminder message that the vector search index has to be created
        manually.
    """
    listings = load_dataset(
        "bstraehle/airbnb-san-francisco-202403-embed",
        split="train",
        streaming=True,
    )

    # Wipe the collection first so that re-running ingestion does not
    # accumulate duplicate listings.
    collection.delete_many({})
    collection.insert_many(listings)

    return "Manually create a vector search index (in free tier, this feature is not available via SDK)"
def rag_retrieval_naive(
    openai_api_key,
    prompt,
    accomodates,
    bedrooms,
    db,
    collection,
    vector_index="vector_index",
):
    """Run the naive RAG retrieval: a plain semantic (vector) search.

    Args:
        openai_api_key: API key forwarded to the embedding call.
        prompt: Free-text query to search for.
        accomodates: Guest capacity the results are filtered on.
        bedrooms: Bedroom count the results are filtered on.
        db: Database handle forwarded to the search.
        collection: Collection handle forwarded to the search.
        vector_index: Name of the vector search index to query.

    Returns:
        Whatever ``vector_search_naive`` returns, or the string
        ``"No results found."`` when that value is falsy (e.g. an empty list).
    """
    hits = vector_search_naive(
        openai_api_key,
        prompt,
        accomodates,
        bedrooms,
        db,
        collection,
        vector_index,
    )
    return hits if hits else "No results found."
def rag_retrieval_advanced(
    openai_api_key,
    prompt,
    accomodates,
    bedrooms,
    db,
    collection,
    vector_index="vector_index",
):
    """Run the advanced RAG retrieval: semantic search plus review ranking.

    On top of the vector search (which filters on ``accomodates`` and
    ``bedrooms``), three extra aggregation stages are appended: compute the
    average review score and review count, combine them into a weighted
    score, and sort by that score in descending order.

    Args:
        openai_api_key: API key forwarded to the embedding call.
        prompt: Free-text query to search for.
        accomodates: Guest capacity used as a search filter.
        bedrooms: Bedroom count used as a search filter.
        db: Database handle forwarded to the search.
        collection: Collection handle forwarded to the search.
        vector_index: Name of the vector search index to query.

    Returns:
        Whatever ``vector_search_advanced`` returns, or the string
        ``"No results found."`` when that value is falsy (e.g. an empty list).
    """
    # Ranking stages, in the order they must run: derive the inputs, weight
    # them, then sort on the weighted value.
    ranking_stages = [
        get_stage_average_review_and_review_count(),
        get_stage_weighting(),
        get_stage_sorting(),
    ]

    hits = vector_search_advanced(
        openai_api_key,
        prompt,
        accomodates,
        bedrooms,
        db,
        collection,
        ranking_stages,
        vector_index,
    )
    return hits if hits else "No results found."
def inference(openai_api_key, prompt):
    """Ask the LLM to answer ``prompt`` directly, without retrieved context.

    Args:
        openai_api_key: API key forwarded to ``invoke_llm``.
        prompt: The user's question; interpolated into the message text.

    Returns:
        The value returned by ``invoke_llm`` for the assembled message.
    """
    instructions = (
        "Answer the question.\n"
        "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
        "Keep the answer as concise as possible.\n\n"
    )
    content = f"{instructions}Question: {prompt}\nHelpful Answer: "
    return invoke_llm(openai_api_key, content)
def rag_inference(openai_api_key, prompt, retrieval_result):
    """Ask the LLM to answer ``prompt`` using ``retrieval_result`` as context.

    Args:
        openai_api_key: API key forwarded to ``invoke_llm``.
        prompt: The user's question; interpolated into the message text.
        retrieval_result: Retrieved context; its string form is embedded in
            the message between the instructions and the question.

    Returns:
        The value returned by ``invoke_llm`` for the assembled message.
    """
    instructions = (
        "Use the following pieces of context to answer the question at the end.\n"
        "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
        "Keep the answer as concise as possible.\n\n"
    )
    content = (
        f"{instructions}"
        f"{retrieval_result}\n\n"
        f"Question: {prompt}\n"
        "Helpful Answer: "
    )
    return invoke_llm(openai_api_key, content)
def invoke_llm(openai_api_key, content):
    """Send ``content`` as a user message to the chat model and return the reply.

    Note that this assigns ``openai.api_key`` at module level, so the key
    stays set for later calls made through the ``openai`` module.

    Args:
        openai_api_key: API key assigned to ``openai.api_key``.
        content: Text of the user message.

    Returns:
        The message content of the first choice in the completion.
    """
    openai.api_key = openai_api_key

    system_message = {
        "role": "system",
        "content": "You are an AirBnB listing recommendation system.",
    }
    user_message = {"role": "user", "content": content}

    response = openai.chat.completions.create(
        model="gpt-4o",
        messages=[system_message, user_message],
        temperature=0.01,
    )
    first_choice = response.choices[0]
    return first_choice.message.content
def vector_search_naive(
    openai_api_key,
    prompt,
    accomodates,
    bedrooms,
    db,
    collection,
    vector_index="vector_index",
):
    """Vector search whose results are filtered after retrieval.

    The pipeline is: ``$vectorSearch`` on ``description_embedding``
    (150 candidates, limit 25), a projection of the listing fields, and a
    ``$match`` on ``accomodates`` / ``bedrooms``.

    Args:
        openai_api_key: API key used to embed ``prompt``.
        prompt: Free-text query to embed and search for.
        accomodates: Guest capacity the results must equal.
        bedrooms: Bedroom count the results must equal.
        db: Database handle, forwarded to ``invoke_search``.
        collection: Collection handle, forwarded to ``invoke_search``.
        vector_index: Name of the vector search index to query.

    Returns:
        The result of ``invoke_search``, or an error message string when no
        embedding could be produced for ``prompt``.
    """
    embedding = get_text_embedding(openai_api_key, prompt)
    if embedding is None:
        return "Invalid query or embedding generation failed."

    search_spec = {
        "index": vector_index,
        "queryVector": embedding,
        "path": "description_embedding",
        "numCandidates": 150,
        "limit": 25,
    }
    stages = [
        {"$vectorSearch": search_spec},
        get_stage_include_fields(),
        get_stage_filter_result(accomodates, bedrooms),
    ]
    return invoke_search(db, collection, stages)
def vector_search_advanced(
    openai_api_key,
    prompt,
    accommodates,
    bedrooms,
    db,
    collection,
    additional_stages=None,
    vector_index="vector_index",
):
    """Vector search that filters inside ``$vectorSearch`` and runs extra stages.

    The pipeline is: ``$vectorSearch`` on ``description_embedding``
    (150 candidates, limit 25) with an equality filter on ``accommodates``
    and ``bedrooms``, a projection of the listing fields, and then every
    stage from ``additional_stages`` in order.

    Args:
        openai_api_key: API key used to embed ``prompt``.
        prompt: Free-text query to embed and search for.
        accommodates: Guest capacity the candidates must equal.
        bedrooms: Bedroom count the candidates must equal.
        db: Database handle, forwarded to ``invoke_search``.
        collection: Collection handle, forwarded to ``invoke_search``.
        additional_stages: Optional iterable of aggregation stages appended
            after the projection. ``None`` (the default) appends nothing.
        vector_index: Name of the vector search index to query.

    Returns:
        The result of ``invoke_search``, or an error message string when no
        embedding could be produced for ``prompt``.
    """
    # A ``None`` default replaces the former ``[]`` default: a mutable default
    # is a single list shared by every call.
    if additional_stages is None:
        additional_stages = ()

    query_embedding = get_text_embedding(openai_api_key, prompt)
    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    vector_search_and_filter_stage = {
        "$vectorSearch": {
            "index": vector_index,
            "queryVector": query_embedding,
            "path": "description_embedding",
            "numCandidates": 150,
            "limit": 25,
            "filter": {
                "$and": [
                    {"accommodates": {"$eq": accommodates}},
                    {"bedrooms": {"$eq": bedrooms}},
                ]
            },
        }
    }

    # Unpacking builds a fresh list and never touches the caller's object.
    pipeline = [
        vector_search_and_filter_stage,
        get_stage_include_fields(),
        *additional_stages,
    ]
    return invoke_search(db, collection, pipeline)
def get_stage_exclude_fields():
    """Return an ``$unset`` stage that removes ``description_embedding``."""
    excluded_field = "description_embedding"
    return {"$unset": excluded_field}
def get_stage_include_fields():
    """Return a ``$project`` stage that includes the listing fields.

    Every field below is projected with the value ``1``;
    ``description_embedding`` is not in the list. A new dict is built on
    each call.
    """
    included_fields = (
        # Listing identity and description
        "id",
        "listing_url",
        "name",
        "description",
        "neighborhood_overview",
        "picture_url",
        # Host
        "host_id",
        "host_url",
        "host_name",
        "host_since",
        "host_location",
        "host_about",
        "host_response_time",
        "host_response_rate",
        "host_acceptance_rate",
        "host_is_superhost",
        "host_thumbnail_url",
        "host_picture_url",
        "host_neighbourhood",
        "host_listings_count",
        "host_total_listings_count",
        "host_verifications",
        "host_has_profile_pic",
        "host_identity_verified",
        # Location
        "neighbourhood",
        "neighbourhood_cleansed",
        "neighbourhood_group_cleansed",
        "latitude",
        "longitude",
        # Property
        "property_type",
        "room_type",
        "accommodates",
        "bathrooms",
        "bathrooms_text",
        "bedrooms",
        "beds",
        "amenities",
        "price",
        # Stay length
        "minimum_nights",
        "maximum_nights",
        "minimum_minimum_nights",
        "maximum_minimum_nights",
        "minimum_maximum_nights",
        "maximum_maximum_nights",
        "minimum_nights_avg_ntm",
        "maximum_nights_avg_ntm",
        # Availability
        "calendar_updated",
        "has_availability",
        "availability_30",
        "availability_60",
        "availability_90",
        "availability_365",
        # Reviews
        "number_of_reviews",
        "number_of_reviews_ltm",
        "number_of_reviews_l30d",
        "first_review",
        "last_review",
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
        # Miscellaneous
        "license",
        "instant_bookable",
        "calculated_host_listings_count",
        "calculated_host_listings_count_entire_homes",
        "calculated_host_listings_count_private_rooms",
        "calculated_host_listings_count_shared_rooms",
        "reviews_per_month",
    )
    return {"$project": dict.fromkeys(included_fields, 1)}
def get_stage_filter_result(accomodates, bedrooms):
    """Return a ``$match`` stage requiring exact guest capacity and bedrooms.

    Args:
        accomodates: Value the document's ``accommodates`` field must equal.
        bedrooms: Value the document's ``bedrooms`` field must equal.
    """
    criteria = {
        "accommodates": {"$eq": accomodates},
        "bedrooms": {"$eq": bedrooms},
    }
    return {"$match": criteria}
def get_stage_average_review_and_review_count():
    """Return an ``$addFields`` stage adding ``averageReview`` and ``reviewCount``.

    ``averageReview`` is the sum of the seven ``review_scores_*`` fields
    divided by 7; ``reviewCount`` copies ``number_of_reviews``.
    """
    score_fields = (
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
    )
    # "$<name>" is the aggregation syntax for referencing a document field.
    score_sum = {"$add": ["$" + field for field in score_fields]}
    average = {"$divide": [score_sum, len(score_fields)]}
    return {
        "$addFields": {
            "averageReview": average,
            "reviewCount": "$number_of_reviews",
        }
    }
def get_stage_weighting():
    """Return an ``$addFields`` stage adding ``weightedAverageReview``.

    The value is ``averageReview * 0.9 + reviewCount * 0.1``, so both fields
    must already exist on the documents reaching this stage.
    """
    review_term = {"$multiply": ["$averageReview", 0.9]}
    count_term = {"$multiply": ["$reviewCount", 0.1]}
    weighted_sum = {"$add": [review_term, count_term]}
    return {"$addFields": {"weightedAverageReview": weighted_sum}}
def get_stage_sorting():
    """Return a ``$sort`` stage ordering by ``weightedAverageReview``, descending."""
    descending = -1
    return {"$sort": {"weightedAverageReview": descending}}
def invoke_search(db, collection, pipeline):
    """Run ``pipeline`` on ``collection`` and return the documents as a list.

    As a diagnostic, the elapsed time reported by ``get_millis_elapsed`` is
    printed before the results are materialised. That helper issues its own
    ``explain`` command for the same pipeline.

    Args:
        db: Database handle, used only for the timing diagnostic.
        collection: Collection to aggregate over.
        pipeline: List of aggregation stages.

    Returns:
        A list of the documents produced by the aggregation.
    """
    cursor = collection.aggregate(pipeline)
    elapsed_ms = get_millis_elapsed(db, collection, pipeline)
    print(f"Vector search millis elapsed: {elapsed_ms}")
    return list(cursor)
def get_millis_elapsed(db, collection, pipeline):
    """Return the ``millisElapsed`` value from an ``explain`` of ``pipeline``.

    An ``explain`` command with ``executionStats`` verbosity is issued for an
    aggregation of ``pipeline`` over ``collection.name``. The value is read
    from the first stage's ``$vectorSearch`` entry, so the pipeline is
    expected to start with a ``$vectorSearch`` stage; a response without the
    expected keys raises ``KeyError`` / ``IndexError``.

    Args:
        db: Database handle providing ``command``.
        collection: Collection whose ``name`` is explained.
        pipeline: List of aggregation stages.

    Returns:
        The ``millisElapsed`` entry of the explain output's collector stats.
    """
    aggregate_command = {
        "aggregate": collection.name,
        "pipeline": pipeline,
        "cursor": {},
    }
    explanation = db.command(
        "explain",
        aggregate_command,
        verbosity="executionStats",
    )

    first_stage = explanation["stages"][0]
    collect_stats = first_stage["$vectorSearch"]["explain"]["collectStats"]
    return collect_stats["allCollectorStats"]["millisElapsed"]
def get_text_embedding(openai_api_key, text):
    """Return the embedding vector for ``text``, or ``None`` on failure.

    Note that this assigns ``openai.api_key`` at module level, so the key
    stays set for later calls made through the ``openai`` module.

    Args:
        openai_api_key: API key assigned to ``openai.api_key``.
        text: Text to embed; must be a non-empty ``str``.

    Returns:
        The embedding of the first item in the response (requested from
        ``text-embedding-3-small`` with 1536 dimensions), or ``None`` when
        ``text`` is empty or not a string, or when the request raises.
    """
    if not (text and isinstance(text, str)):
        return None

    openai.api_key = openai_api_key
    try:
        response = openai.embeddings.create(
            model="text-embedding-3-small",
            input=text,
            dimensions=1536,
        )
        return response.data[0].embedding
    except Exception as e:
        # Best effort: report the problem and let the caller handle ``None``.
        print(f"Error in get_embedding: {e}")
        return None