"""Central configuration for the RAG recommendation service (settings.py).

Every value below is a module-level constant. Values wrapped in
``os.getenv`` can be overridden through the process environment (or a
``.env`` file when python-dotenv is installed); the rest are fixed.
"""

# Import required libraries
import os

try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv only matters when a local .env file is used. Without it
    # the settings can still be resolved from the real process environment,
    # so degrade to a no-op instead of making this module unimportable.
    def load_dotenv(*args, **kwargs):
        """Fallback used when python-dotenv is missing; loads nothing."""
        print("WARNING: python-dotenv is not installed; skipping .env loading.")
        return False

# Load environment variables from .env file
load_dotenv()

# ---------------------------------------------------------------------------
# Fine-tuning output
# ---------------------------------------------------------------------------
# Path to the directory where the fine-tuned reranker model will be saved.
# The model is stored here after fine-tuning so it can be loaded later for
# inference.
FINE_TUNED_RERANKER_SAVE_PATH = "models/reranker_fine_tuned"

# ---------------------------------------------------------------------------
# MongoDB Configuration
# ---------------------------------------------------------------------------
# Connection string for MongoDB Atlas database.
# SECURITY: the fallback below embeds a real username and password in source
# control. It is kept unchanged only so existing deployments that rely on the
# fallback keep working.
# TODO: rotate this credential, supply MONGO_URI exclusively through the
# environment, and delete the hard-coded default.
MONGO_URI = os.getenv(
    "MONGO_URI",
    "mongodb+srv://sundram22verma:Inform12345@newsfeeddataset.hawox3o.mongodb.net/NewsDataSet?retryWrites=true&w=majority",
)
# Name of the MongoDB database
MONGO_DB_NAME = os.getenv("MONGO_DB_NAME", "NewsDataSet")
# Name of the collection storing news articles (parsed XML articles)
MONGO_NEWS_COLLECTION_NAME = os.getenv(
    "MONGO_NEWS_COLLECTION_NAME", "parsedXmlArticles"
)
# Name of the collection storing user session data
MONGO_SESSIONS_COLLECTION_NAME = os.getenv(
    "MONGO_SESSIONS_COLLECTION_NAME", "user_sessions"
)
# Name of the collection storing FAISS index metadata (like indexed IDs)
MONGO_FAISS_META_COLLECTION_NAME = os.getenv(
    "MONGO_FAISS_META_COLLECTION_NAME", "faiss_index_meta"
)
# Name of the collection storing user feedback/tracking data
MONGO_TRACKING_COLLECTION_NAME = os.getenv(
    "MONGO_TRACKING_COLLECTION_NAME", "user_feedback_tracking"
)

# ---------------------------------------------------------------------------
# Model Configuration
# ---------------------------------------------------------------------------
# Name of the embedding model used for text vectorization
EMBED_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Name of the text generation model for Indic languages
GENERATOR_MODEL_NAME = "ai4bharat/IndicBART"
# Name of the reranking model for improving search results
RERANKER_MODEL_NAME = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"

# ---------------------------------------------------------------------------
# File Paths
# ---------------------------------------------------------------------------
# Path to store the FAISS index for fast similarity search
INDEX_PATH = "DataEmbeddings.bin"
# Path to store the list of IDs corresponding to the FAISS index vectors
# (legacy, now primarily in MongoDB)
INDEX_IDS_PATH = "DataEmbeddings_ids.json"
# Path to store user interaction logs
INTERACTION_LOG_PATH = "logs/Hindi_User_Interactions.json"

# Path to Indic NLP resources for text processing.
# The default is the "indic_nlp_resources" directory three levels above this
# file (presumably the project root -- confirm against the repo layout).
_DEFAULT_INDIC_NLP_RESOURCES_PATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
    "indic_nlp_resources",
)
_ENV_SUPPLIED_INDIC_NLP_PATH = os.getenv("INDIC_NLP_RESOURCES_PATH")

# Resolution order:
#   1. the environment-supplied path, when it exists on disk;
#   2. the default path, when the environment path is set but missing and the
#      default exists (a warning is printed);
#   3. the environment-supplied path anyway, when neither exists, so that the
#      later failure names the path the operator actually configured;
#   4. the default path, when the environment variable is unset or empty.
if _ENV_SUPPLIED_INDIC_NLP_PATH:
    if os.path.exists(_ENV_SUPPLIED_INDIC_NLP_PATH):
        INDIC_NLP_RESOURCES_PATH = _ENV_SUPPLIED_INDIC_NLP_PATH
    elif os.path.exists(_DEFAULT_INDIC_NLP_RESOURCES_PATH):
        print(f"WARNING: INDIC_NLP_RESOURCES_PATH from environment ('{_ENV_SUPPLIED_INDIC_NLP_PATH}') not found or invalid.")
        print(f"Falling back to default path: '{_DEFAULT_INDIC_NLP_RESOURCES_PATH}'")
        INDIC_NLP_RESOURCES_PATH = _DEFAULT_INDIC_NLP_RESOURCES_PATH
    else:
        # Environment path is set but invalid, and default path is also
        # invalid. Let it fail with the env path.
        INDIC_NLP_RESOURCES_PATH = _ENV_SUPPLIED_INDIC_NLP_PATH
else:
    # Environment variable not set, use the default.
    INDIC_NLP_RESOURCES_PATH = _DEFAULT_INDIC_NLP_RESOURCES_PATH

# ---------------------------------------------------------------------------
# Column Names
# ---------------------------------------------------------------------------
# Column name for article headlines
HEADLINE_COL = "hl"
# Column name for SEO location / URL
SEOLOCATION_COL = "seolocation"
# Column name for deeplink
DEEPLINK_COL = "dl"
# Column name for last updated
LAST_UPDATED_COL = "lu"
# Column name for image ID
IMAGE_ID_COL = "imageid"
# Column name for image ratio
IMAGE_RATIO_COL = "imgratio"
# Column name for image size
IMAGE_SIZE_COL = "imgsize"
# NOTE(review): presumably the article synopsis column -- confirm
SYN_COL = "syn"
# NOTE(review): presumably the article keywords/key column -- confirm
KEY_COL = "key"
# Column name for article IDs
ID_COL = "id"
# Column name for article topics
TOPIC_COL = "tn"
# Column name for taxonomy (list of objects with 'name' and 'code')
TAXONOMY_COL = "tx"
# Column name for article source/property
PROPERTY_COL = "host"

# ---------------------------------------------------------------------------
# API Configuration
# ---------------------------------------------------------------------------
# Title of the API service
API_TITLE = "RAG Recommendation API"
# Description of the API service
API_DESCRIPTION = "API providing RAG-based recommendations for multi content, using MongoDB Atlas"
# Version of the API
API_VERSION = "1.0.0"

# ---------------------------------------------------------------------------
# Model Parameters
# ---------------------------------------------------------------------------
# Default number of recommendations to return
DEFAULT_K = 5
# Threshold for similarity matching
SIMILARITY_THRESHOLD = -8.0
# Multiplier for number of candidates to consider before reranking
CANDIDATE_MULTIPLIER = 3