Spaces:
Sleeping
Sleeping
# Import required libraries | |
import os | |
from dotenv import load_dotenv | |
# Load environment variables from the .env file first, so that every
# os.getenv() lookup further down in this module can see them.
load_dotenv()
# settings.py

# Directory where the fine-tuned reranker model is saved after fine-tuning,
# so that it can be loaded again later for inference.
FINE_TUNED_RERANKER_SAVE_PATH = "models/reranker_fine_tuned"

# MongoDB Configuration
# Connection string for the MongoDB Atlas database. The MONGO_URI environment
# variable takes precedence; the literal below is only a fallback.
# SECURITY(review): the fallback embeds account details directly in source.
# Prefer supplying MONGO_URI through the environment / .env file and rotate
# anything that has been committed here.
# NOTE(review): the "[email protected]" fragment looks like an obfuscated
# "password@host" pair, so this fallback is probably not a usable URI as
# written -- confirm before relying on it.
MONGO_URI = os.getenv("MONGO_URI", "mongodb+srv://sundram22verma:[email protected]/NewsDataSet?retryWrites=true&w=majority")

# Name of the MongoDB database.
MONGO_DB_NAME = os.getenv("MONGO_DB_NAME", "NewsDataSet")

# Collection storing news articles (parsed XML articles).
MONGO_NEWS_COLLECTION_NAME = os.getenv("MONGO_NEWS_COLLECTION_NAME", "parsedXmlArticles")

# Collection storing user session data.
MONGO_SESSIONS_COLLECTION_NAME = os.getenv("MONGO_SESSIONS_COLLECTION_NAME", "user_sessions")

# Collection storing FAISS index metadata (such as the indexed IDs).
MONGO_FAISS_META_COLLECTION_NAME = os.getenv("MONGO_FAISS_META_COLLECTION_NAME", "faiss_index_meta")

# Collection storing user feedback / tracking data.
MONGO_TRACKING_COLLECTION_NAME = os.getenv("MONGO_TRACKING_COLLECTION_NAME", "user_feedback_tracking")
# Model Configuration

# Name of the embedding model used for text vectorization.
EMBED_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"

# Name of the text generation model for Indic languages.
GENERATOR_MODEL_NAME = "ai4bharat/IndicBART"

# Name of the reranking model used to improve search results.
RERANKER_MODEL_NAME = "cross-encoder/mmarco-mMiniLMv2-L12-H384-v1"
# File Paths

# Path of the FAISS index file used for fast similarity search.
INDEX_PATH = "DataEmbeddings.bin"

# Path of the list of IDs corresponding to the FAISS index vectors
# (legacy; this information now lives primarily in MongoDB).
INDEX_IDS_PATH = "DataEmbeddings_ids.json"

# Path of the user interaction log.
INTERACTION_LOG_PATH = "logs/Hindi_User_Interactions.json"
# Path to the Indic NLP resources used for text processing.
#
# Resolution order:
#   1. INDIC_NLP_RESOURCES_PATH from the environment, if set and existing.
#   2. The default directory, if the environment path is set but missing
#      (a warning is printed).
#   3. The environment path as given, if neither location exists, so that the
#      later failure reports the path the operator actually configured.
#   4. The default directory, if the environment variable is unset or empty.

# Default location: "indic_nlp_resources" three directory levels above this file.
_DEFAULT_INDIC_NLP_RESOURCES_PATH = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(__file__))),
    "indic_nlp_resources",
)
_ENV_SUPPLIED_INDIC_NLP_PATH = os.getenv("INDIC_NLP_RESOURCES_PATH")

if not _ENV_SUPPLIED_INDIC_NLP_PATH:
    # Environment variable not set (or empty): use the default.
    INDIC_NLP_RESOURCES_PATH = _DEFAULT_INDIC_NLP_RESOURCES_PATH
elif os.path.exists(_ENV_SUPPLIED_INDIC_NLP_PATH):
    INDIC_NLP_RESOURCES_PATH = _ENV_SUPPLIED_INDIC_NLP_PATH
elif os.path.exists(_DEFAULT_INDIC_NLP_RESOURCES_PATH):
    print(f"WARNING: INDIC_NLP_RESOURCES_PATH from environment ('{_ENV_SUPPLIED_INDIC_NLP_PATH}') not found or invalid.")
    print(f"Falling back to default path: '{_DEFAULT_INDIC_NLP_RESOURCES_PATH}'")
    INDIC_NLP_RESOURCES_PATH = _DEFAULT_INDIC_NLP_RESOURCES_PATH
else:
    # Environment path is set but invalid, and the default path is invalid
    # too. Keep the environment path and let the consumer fail with it.
    INDIC_NLP_RESOURCES_PATH = _ENV_SUPPLIED_INDIC_NLP_PATH
# Column Names

# Column name for article headlines.
HEADLINE_COL = "hl"

# Column name for SEO location / URL.
SEOLOCATION_COL = "seolocation"

# Column name for the deeplink.
DEEPLINK_COL = "dl"

# Column name for the last-updated value.
LAST_UPDATED_COL = "lu"

# Column name for the image ID.
IMAGE_ID_COL = "imageid"

# Column name for the image ratio.
IMAGE_RATIO_COL = "imgratio"

# Column name for the image size.
IMAGE_SIZE_COL = "imgsize"

# Column name for the "syn" field (presumably the synopsis -- TODO confirm).
SYN_COL = "syn"

# Column name for the "key" field (presumably keywords -- TODO confirm).
KEY_COL = "key"

# Column name for article IDs.
ID_COL = "id"

# Column name for article topics.
TOPIC_COL = "tn"

# Column name for taxonomy (list of objects with 'name' and 'code').
TAXONOMY_COL = "tx"

# Column name for the article source / property.
PROPERTY_COL = "host"
# API Configuration

# Title of the API service.
API_TITLE = "RAG Recommendation API"

# Description of the API service.
API_DESCRIPTION = "API providing RAG-based recommendations for multi content, using MongoDB Atlas"

# Version of the API.
API_VERSION = "1.0.0"
# Model Parameters

# Default number of recommendations to return.
DEFAULT_K = 5

# Threshold for similarity matching.
# NOTE(review): the negative value suggests this is compared against raw
# reranker scores rather than a 0..1 similarity -- confirm with the consumer.
SIMILARITY_THRESHOLD = -8.0

# Multiplier for the number of candidates to consider before reranking.
CANDIDATE_MULTIPLIER = 3