# recommendation/src/config/settings.py
# Import required libraries
import os
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
# MongoDB Configuration
# Path to the directory where the fine-tuned reranker model will be saved
# This is used to store the model after fine-tuning, so it can be loaded later for inference
FINE_TUNED_RERANKER_SAVE_PATH = "models/reranker_fine_tuned"
# Connection string for MongoDB Atlas database
MONGO_URI = os.getenv("MONGO_URI", "mongodb+srv://sundram22verma:[email protected]/NewsDataSet?retryWrites=true&w=majority")
# Name of the MongoDB database
MONGO_DB_NAME = os.getenv("MONGO_DB_NAME", "NewsDataSet")
# Name of the collection storing news articles (parsed XML articles)
MONGO_NEWS_COLLECTION_NAME = os.getenv("MONGO_NEWS_COLLECTION_NAME", "parsedXmlArticles")
# Name of the collection storing user session data
MONGO_SESSIONS_COLLECTION_NAME = os.getenv("MONGO_SESSIONS_COLLECTION_NAME", "user_sessions")
# Name of the collection storing FAISS index metadata (like indexed IDs)
MONGO_FAISS_META_COLLECTION_NAME = os.getenv("MONGO_FAISS_META_COLLECTION_NAME", "faiss_index_meta")
# Name of the collection storing user feedback/tracking data
MONGO_TRACKING_COLLECTION_NAME = os.getenv("MONGO_TRACKING_COLLECTION_NAME", "user_feedback_tracking")
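# Illustrative usage (assumption, not part of this module): application code is expected to
# build a pymongo client from the settings above, roughly like so:
#   from pymongo import MongoClient
#   client = MongoClient(MONGO_URI)
#   db = client[MONGO_DB_NAME]
#   news_collection = db[MONGO_NEWS_COLLECTION_NAME]
#   sessions_collection = db[MONGO_SESSIONS_COLLECTION_NAME]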
# Model Configuration
# Name of the embedding model used for text vectorization
EMBED_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"
# Name of the text generation model for Indic languages
GENERATOR_MODEL_NAME = "ai4bharat/IndicBART"
# Name of the reranking model for improving search results
RERANKER_MODEL_NAME = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"
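# Illustrative usage (assumption): these model names are typically passed to
# sentence-transformers / transformers loaders elsewhere in the project, e.g.
#   from sentence_transformers import SentenceTransformer, CrossEncoder
#   embedder = SentenceTransformer(EMBED_MODEL_NAME)
#   reranker = CrossEncoder(RERANKER_MODEL_NAME)  # or FINE_TUNED_RERANKER_SAVE_PATH after fine-tuning
#   from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
#   generator_tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL_NAME)
#   generator_model = AutoModelForSeq2SeqLM.from_pretrained(GENERATOR_MODEL_NAME)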
# File Paths
# Path to store the FAISS index for fast similarity search
INDEX_PATH = "DataEmbeddings.bin"
# Path to store the list of IDs corresponding to the FAISS index vectors (legacy, now primarily in MongoDB)
INDEX_IDS_PATH = "DataEmbeddings_ids.json"
# Path to store user interaction logs
INTERACTION_LOG_PATH = "logs/Hindi_User_Interactions.json"
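# Illustrative usage (assumption): the FAISS index and its legacy ID list are typically
# persisted and restored with
#   import faiss, json
#   index = faiss.read_index(INDEX_PATH)        # load an existing index
#   faiss.write_index(index, INDEX_PATH)        # save after (re)building
#   with open(INDEX_IDS_PATH) as f:             # legacy ID list, now primarily kept in MongoDB
#       indexed_ids = json.load(f)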
# Path to Indic NLP resources for text processing
# Use relative path from project root
_DEFAULT_INDIC_NLP_RESOURCES_PATH = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), "indic_nlp_resources")
_ENV_SUPPLIED_INDIC_NLP_PATH = os.getenv("INDIC_NLP_RESOURCES_PATH")
if _ENV_SUPPLIED_INDIC_NLP_PATH:
    if os.path.exists(_ENV_SUPPLIED_INDIC_NLP_PATH):
        INDIC_NLP_RESOURCES_PATH = _ENV_SUPPLIED_INDIC_NLP_PATH
    elif os.path.exists(_DEFAULT_INDIC_NLP_RESOURCES_PATH):
        print(f"WARNING: INDIC_NLP_RESOURCES_PATH from environment ('{_ENV_SUPPLIED_INDIC_NLP_PATH}') not found or invalid.")
        print(f"Falling back to default path: '{_DEFAULT_INDIC_NLP_RESOURCES_PATH}'")
        INDIC_NLP_RESOURCES_PATH = _DEFAULT_INDIC_NLP_RESOURCES_PATH
    else:
        # Environment path is set but invalid, and the default path is also missing.
        # Keep the environment-supplied path so any downstream error points at it.
        INDIC_NLP_RESOURCES_PATH = _ENV_SUPPLIED_INDIC_NLP_PATH
else:
    # Environment variable not set; use the default path.
    INDIC_NLP_RESOURCES_PATH = _DEFAULT_INDIC_NLP_RESOURCES_PATH
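# Illustrative usage (assumption): the Indic NLP library is usually pointed at this
# directory before any of its tokenizers/normalizers are used, e.g.
#   from indicnlp import common, loader
#   common.set_resources_path(INDIC_NLP_RESOURCES_PATH)
#   loader.load()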
# Column Names
# Column name for article headlines
HEADLINE_COL = "hl"
# Column name for SEO location / URL
SEOLOCATION_COL = "seolocation"
# Column name for deeplink
DEEPLINK_COL = "dl"
# Column name for last updated
LAST_UPDATED_COL = "lu"
# Column name for image ID
IMAGE_ID_COL = "imageid"
# Column name for image ratio
IMAGE_RATIO_COL = "imgratio"
# Column name for image size
IMAGE_SIZE_COL = "imgsize"
# Column name for article synopsis/summary text
SYN_COL = "syn"
# Column name for article keywords
KEY_COL = "key"
# Column name for article IDs
ID_COL = "id"
# Column name for article topics
TOPIC_COL = "tn"
# Column name for taxonomy (list of objects with 'name' and 'code')
TAXONOMY_COL = "tx"
# Column name for article source/property
PROPERTY_COL = "host"
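# Illustrative usage (assumption): these field names are used when reading article
# documents from MongoDB, for example as a projection:
#   projection = {ID_COL: 1, HEADLINE_COL: 1, SYN_COL: 1, TOPIC_COL: 1,
#                 TAXONOMY_COL: 1, SEOLOCATION_COL: 1, LAST_UPDATED_COL: 1}
#   articles = news_collection.find({}, projection)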
# API Configuration
# Title of the API service
API_TITLE = "RAG Recommendation API"
# Description of the API service
API_DESCRIPTION = "API providing RAG-based recommendations for multiple content types, using MongoDB Atlas"
# Version of the API
API_VERSION = "1.0.0"
# Model Parameters
# Default number of recommendations to return
DEFAULT_K = 5
# Minimum similarity score for a match (reranker cross-encoder scores are logits, so negative values are valid)
SIMILARITY_THRESHOLD = -8.0
# Multiplier for number of candidates to consider before reranking
CANDIDATE_MULTIPLIER = 3
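# Illustrative flow (assumption) showing how these parameters are meant to interact:
#   n_candidates = DEFAULT_K * CANDIDATE_MULTIPLIER        # retrieve extra candidates from FAISS
#   scores = reranker.predict([(query, doc_text) for doc_text in candidate_texts])
#   ranked = sorted(zip(candidate_texts, scores), key=lambda x: x[1], reverse=True)
#   results = [doc for doc, score in ranked if score >= SIMILARITY_THRESHOLD][:DEFAULT_K]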