import os
import sys
import pickle

import dotenv
import numpy as np
import spacy  # Imported so a missing SciSpaCy install surfaces early; model load itself happens in Query_processing.
import streamlit as st

# Load environment variables (e.g. GROQ_API_KEY) from a local .env file.
dotenv.load_dotenv()

# --- Configuration ---
# NOTE: st.set_page_config() must be the FIRST Streamlit command in the app;
# calling any other st.* function before it raises StreamlitAPIException.
# It therefore lives here, before the CSS markdown and any log_message calls.
st.set_page_config(layout="wide", page_title="Drugbot!", page_icon="💊")

# --- Custom CSS for reduced whitespace and colors ---
st.markdown(
    """
    """,
    unsafe_allow_html=True
)

# --- Global message log ---
# This list stores (level, message) tuples to be displayed in the log expander
# at the bottom of the page.
app_messages = []


def log_message(level, message):
    """Append a message to the application log and surface errors immediately.

    Args:
        level: One of "info", "success", "warning", "error". Only "error"
            is rendered inline right away; everything else shows up in the
            "Application Logs" expander.
        message: Human-readable text to record.
    """
    app_messages.append((level, message))
    if level == "error":
        st.error(message)


# Add the 'Scripts' directory to the Python path so that Query_processing,
# Retrieval, and Answer_Generation can be imported as top-level modules.
script_dir = os.path.join(os.path.dirname(__file__), 'Scripts')
log_message("info", f"Attempting to add '{script_dir}' to Python path.")
if script_dir not in sys.path:
    sys.path.append(script_dir)
    log_message("info", f"'{script_dir}' added to sys.path.")
else:
    log_message("info", f"'{script_dir}' already in sys.path.")

# --- Debugging: Check if script files exist ---
script_files_to_check = {
    "Query_processing.py": False,
    "Retrieval.py": False,
    "Answer_Generation.py": False,
}
all_scripts_found = True
for script_name in script_files_to_check:
    script_path = os.path.join(script_dir, script_name)
    if os.path.exists(script_path):
        script_files_to_check[script_name] = True
    else:
        all_scripts_found = False
        log_message("error", f"Error: Script file not found at expected path: {script_path}")

if not all_scripts_found:
    log_message("error", "One or more essential script files are missing from the 'Scripts' directory. "
                         "Please ensure your project structure is correct.")
    st.stop()  # Stop execution if critical files are missing

# Import the core pipeline modules; abort the app with a visible error if the
# Scripts directory is present but malformed.
try:
    from Query_processing import preprocess_query
    from Retrieval import Retrieval_averagedQP
    from Answer_Generation import answer_generation
    log_message("success", "Core modules imported successfully!")
except ImportError as e:
    log_message("error", f"Error importing core modules. Make sure 'Scripts' directory is correctly structured and contains "
                         f"Query_processing.py, Retrieval.py, and Answer_Generation.py. Error: {e}")
    st.stop()

# Paths to the dataset and precomputed vector artifacts, relative to this file.
DATASET_PATH = os.path.join(os.path.dirname(__file__), 'Datasets', 'flattened_drug_dataset_cleaned.csv')
VECTORS_DIR = os.path.join(os.path.dirname(__file__), 'Vectors')
FAISS_INDEX_PATH = os.path.join(VECTORS_DIR, 'faiss_index.idx')
DOC_METADATA_PATH = os.path.join(VECTORS_DIR, 'doc_metadata.pkl')
DOC_VECTORS_PATH = os.path.join(VECTORS_DIR, 'doc_vectors.npy')


# --- Cached Resources ---
@st.cache_resource
def load_all_assets():
    """Verify that the knowledge-base artifacts exist before serving queries.

    Checks for the FAISS index, document metadata, and document vectors.
    Heavy model loading is deliberately deferred to the pipeline modules
    (Query_processing loads the SciSpaCy model itself). Cached via
    st.cache_resource, so this runs only once across all user sessions.

    Returns:
        True if all required files were found, False otherwise (with an
        error already logged for the missing file).
    """
    with st.spinner("Verifying medical knowledge base and models... This might take a moment."):
        try:
            # 1. Check for presence of FAISS and embedding files.
            required_files = [
                ("FAISS index", FAISS_INDEX_PATH),
                ("document metadata", DOC_METADATA_PATH),
                ("document vectors", DOC_VECTORS_PATH),
            ]
            if not os.path.exists(FAISS_INDEX_PATH):
                log_message("error", f"Missing FAISS index file: {FAISS_INDEX_PATH}")
                return False
            if not os.path.exists(DOC_METADATA_PATH):
                log_message("error", f"Missing document metadata file: {DOC_METADATA_PATH}")
                return False
            if not os.path.exists(DOC_VECTORS_PATH):
                log_message("error", f"Missing document vectors file: {DOC_VECTORS_PATH}")
                return False

            # 2. SciSpaCy model check. The actual spacy.load() is commented
            # out because Query_processing owns model loading (loading the
            # model twice wastes memory); the except branches are kept as a
            # template for re-enabling the explicit check.
            try:
                # nlp = spacy.load("en_core_sci_md")
                # del nlp  # Release the model if it's not needed globally here
                log_message("info", "SciSpaCy 'en_core_sci_md' model is expected to be loaded by Query_processing.")
            except OSError:
                # NOTE: unreachable while the spacy.load() above stays commented out.
                log_message("error", "SciSpaCy 'en_core_sci_md' model not found or linked. "
                                     "Please ensure it's installed correctly (e.g., `pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_md-0.5.4.tar.gz`).")
                return False
            except Exception as e:
                log_message("error", f"An unexpected error occurred while checking SciSpaCy model: {e}")
                return False

            log_message("success", "Medical knowledge base files verified. Models will be loaded as needed.")
            return True  # Indicate successful verification
        except Exception as e:
            log_message("error", f"Failed to verify assets. Please ensure all data and vector files are in their correct paths. Error: {e}")
            return False


# Load all assets at the start of the application.
assets_loaded = load_all_assets()

# --- Title and Header ---
st.title("💊 DrugBot")
st.markdown("---")

# --- Instructions ---
st.header("How to Use:")
st.write(
    """
    Welcome to DrugBot - Retrieval based Medical Drug QA Chatbot! You can ask questions about medical drugs,
    and I will retrieve information from a verified database to provide accurate answers.

    1.  **Select an example query** from the dropdown or **type your own question** in the text area below.
    2.  Click the **"Get Answer"** button.
    3.  Wait for the chatbot to process your query and generate an answer.
    """
)
st.markdown("---")

# --- Example Queries ---
st.header("Try These Examples:")
example_queries = [
    "Select an example query...",
    "What is the dosage for Azithromycin?",
    "What are the side effects of Ibuprofen?",
    "How should I take Amoxicillin?",
    "What are the precautions for Warfarin?",
    "What are the drug interactions for Metformin?",
    "What is Paracetamol used for?",
    "Can pregnant women take Aspirin?",
    "How does Prednisone work?",
    "What is the recommended dose for children for Tylenol?",
]

selected_example = st.selectbox(
    "Choose a pre-defined question:",
    example_queries
)

user_query = st.text_area(
    "Or type your question here:",
    value="" if selected_example == "Select an example query..." else selected_example,
    height=100,
    placeholder="e.g., What is the dosage for Azithromycin?"
)

# --- Chatbot Interaction ---
if st.button("Get Answer", type="primary"):
    if not assets_loaded:
        log_message("error", "Application assets failed to verify. Please check the console for errors.")
    elif not user_query.strip():
        log_message("warning", "Please enter a question or select an example query.")
    else:
        # The Groq API key is required by Answer_Generation; fail fast here.
        if "GROQ_API_KEY" not in os.environ:
            log_message("error", "GROQ_API_KEY environment variable not set. Please set it to use the chatbot.")
        else:
            with st.spinner("Thinking... Retrieving and generating answer..."):
                try:
                    # 1. Preprocess Query (Query_processing handles its own
                    #    spacy model loading).
                    (intent, sub_intent), entities = preprocess_query(user_query)
                    log_message("info", f"Detected Intent: {intent}, Sub-Intent: {sub_intent}, Entities: {entities}")

                    # 2. Retrieve Chunks (Retrieval_averagedQP loads the
                    #    FAISS index and vectors internally).
                    chunks = Retrieval_averagedQP(user_query, intent, entities)

                    if not chunks.empty:  # chunks is a DataFrame
                        # 3. Generate Answer
                        answer = answer_generation(user_query, chunks)
                        log_message("info", f"Generated Answer Content: {answer[:200]}...")  # Log first 200 chars

                        if not answer.strip():  # Empty after stripping whitespace
                            log_message("warning", "Answer generation returned an empty response.")
                            st.warning("Could not generate a clear answer for this query. Please try rephrasing.")
                        else:
                            log_message("success", "Answer generated successfully!")
                            st.success("Answer:")
                            st.write(answer)

                            with st.expander("See Retrieved Chunks (for debugging/transparency)"):
                                st.write("Top 3 Retrieved Chunks:")
                                # Display only the top 3 for brevity.
                                for i, chunk in enumerate(chunks.head(3).to_dict(orient='records')):
                                    st.write(f"**Chunk {i+1}:**")
                                    st.json(chunk)  # st.json renders dicts nicely
                                    st.markdown("---")
                    else:
                        log_message("warning", "No relevant information found for your query. Please try rephrasing.")
                except Exception as e:
                    log_message("error", f"An error occurred while processing your request: {e}")
                    st.info("Please try again or rephrase your question.")  # User-friendly message

st.markdown("---")

# --- About Section ---
st.header("About This Project")
with st.expander("Learn More About the Medical Drug QA Chatbot"):
    st.markdown(
        """
        This project implements a **Retrieval-Based Question Answering (QA) system** designed to answer user queries about medical drugs.
        It aims to provide accurate and factually grounded information by retrieving relevant details from a verified database.

        ### Purpose
        With the rapid increase in approved medications, ensuring factual accuracy in medical information is critical.
        Traditional Large Language Models (LLMs) can sometimes "hallucinate" or provide untraceable answers.
        Our system addresses this by grounding its responses in a curated database, ensuring factual consistency and increasing user trust.

        ### Methodology
        The system follows a multi-stage pipeline:

        1.  **Data Acquisition & Preprocessing:** Information about 2,755 drugs was web-scraped from MayoClinic.com, cleaned, and flattened into a structured CSV dataset.
        2.  **Embedding Generation:** The dataset content is embedded using the **MiniLM-V6** model, and indexed with **FAISS** (Facebook AI Similarity Search) for efficient similarity-based retrieval.
        3.  **Query Processing:** User queries undergo **intent and sub-intent classification** (e.g., identifying if the user is asking about "side effects" or "dosage") and **Named Entity Recognition (NER)** using SciSpaCy to improve retrieval precision.
        4.  **Retrieval Pipeline:**
            *   **Query Vectorization:** The user query is vectorized using MiniLM-V6, incorporating weighted intent vectors.
            *   **Initial Retrieval:** FAISS is used to retrieve the top 10 most similar document chunks.
            *   **Reranking:** The retrieved chunks are then reranked using **Sentence-BioBERT**, which excels at capturing biomedical contexts, significantly improving the relevance of the final selected documents.
        5.  **Answer Generation:** The top 3 reranked context chunks, along with the original query, are fed to the **LLaMA-4 model** (via Groq API). The LLM is prompted to generate an answer *strictly based on the provided context*, minimizing hallucination.

        ### Models Used
        *   **MiniLM-L6-v2:** For FAISS-based vector retrieval.
        *   **Sentence-BioBERT:** For reranking candidate chunks.
        *   **LLaMA-4:** For final answer generation (accessed via Groq API).
        *   **SciSpaCy:** For Named Entity Recognition and intent classification.

        This project was developed by Niranjan Sathish and Hariharan Chandrasekar.
        """
    )

# --- Repository Link Button (Placeholder) ---
st.markdown("---")
st.write("### Project Resources")
st.markdown(
    """
    Once the project is hosted, you'll find links to the repository or Hugging Face Space here.
    """
)
# Placeholder for the actual button. You can uncomment and update this later.
# if st.button("Go to GitHub Repository"):
#     st.markdown("[GitHub Repository Link](YOUR_GITHUB_REPO_URL_HERE)")
# if st.button("Go to Hugging Face Space"):
#     st.markdown("[Hugging Face Space Link](YOUR_HUGGING_FACE_SPACE_URL_HERE)")

# --- Application Logs Section ---
st.markdown("---")
st.header("Application Logs")
with st.expander("Show/Hide Logs"):
    if app_messages:
        # Replay the accumulated log with the widget matching each level.
        for msg_type, msg_content in app_messages:
            if msg_type == "info":
                st.info(msg_content)
            elif msg_type == "success":
                st.success(msg_content)
            elif msg_type == "warning":
                st.warning(msg_content)
            elif msg_type == "error":
                st.error(msg_content)
    else:
        st.write("No application messages yet.")