Spaces:

rishi002
/

medVedaReportAnalysis

Paused

App Files Files Community

rishi002 committed on Apr 21

Commit

16e6135

verified ·

1 Parent(s): 4326e7e

Update app.py

Browse files

Files changed (1) hide show

app.py +224 -441

app.py CHANGED Viewed

@@ -1,482 +1,265 @@
 import os
-import shutil
-import tempfile
-import io
-import re
-from pathlib import Path
 import gradio as gr
-import torch
-from langchain.chains import RetrievalQA
-from langchain.document_loaders import PyPDFLoader, DirectoryLoader, TextLoader
 from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
-from langchain.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.vectorstores import FAISS
 from collections import OrderedDict
-import fitz  # PyMuPDF for more robust PDF handling
-from fastapi import FastAPI, File, UploadFile, HTTPException, Request
-from fastapi.responses import JSONResponse
-from fastapi.middleware.cors import CORSMiddleware
 # Constants
-KNOWLEDGE_DIR = "medical_knowledge"
-VECTOR_STORE_PATH = "vectorstore"
-MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"  # Gated model requiring authentication
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-EMBEDDING_MODEL = "rishi002/all-MiniLM-L6-v2"  # Using the embedding model from chatbot code
 CACHE_DIR = "/tmp/models_cache"
-# Get HF token from environment variables (set in HF Spaces secrets)
-HF_TOKEN = os.environ.get("HF_TOKEN")
-if not HF_TOKEN:
-    print("Warning: HF_TOKEN not found in environment variables. You may not be able to access gated models.")
-# Create cache directory
 os.makedirs(CACHE_DIR, exist_ok=True)
-class MedicalReportAnalyzer:
-    def __init__(self):
-        self.vector_store = None
-        self.llm = None
-        self.qa_chain = None
-        self.user_report_data = "No report data available."  # Default value
-        self.original_report_data = "No original report data available."  # Store original data
-        # Initialize everything
-        self._load_or_create_vector_store()
-        self._initialize_llm()
-        self._setup_qa_chain()
-    def _load_or_create_vector_store(self):
-        """Load existing vector store or create a new one from knowledge documents"""
-        embeddings = HuggingFaceEmbeddings(
-            model_name=EMBEDDING_MODEL,
-            cache_folder=CACHE_DIR
-        )
-        # Check if vector store exists
-        if os.path.exists(VECTOR_STORE_PATH):
-            print("Loading existing vector store...")
-            self.vector_store = FAISS.load_local(VECTOR_STORE_PATH, embeddings, allow_dangerous_deserialization=True)
-        else:
-            print("Creating new vector store from documents...")
-            # Create knowledge directory if it doesn't exist
-            os.makedirs(KNOWLEDGE_DIR, exist_ok=True)
-            # Check if there are documents to process
-            if len(os.listdir(KNOWLEDGE_DIR)) == 0:
-                print(f"Warning: No documents found in {KNOWLEDGE_DIR}. Please add medical PDFs.")
-                # Initialize empty vector store
-                self.vector_store = FAISS.from_texts(["No medical knowledge available yet."], embeddings)
-                self.vector_store.save_local(VECTOR_STORE_PATH)
-                return
-            # Load all PDFs from the knowledge directory
-            try:
-                # First try with DirectoryLoader
-                loader = DirectoryLoader(KNOWLEDGE_DIR, glob="**/*.pdf", loader_cls=PyPDFLoader)
-                documents = loader.load()
-                # Split documents into chunks
-                text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=1000,
-                    chunk_overlap=200,
-                    length_function=len
-                )
-                chunks = text_splitter.split_documents(documents)
-                # Create and save the vector store
-                self.vector_store = FAISS.from_documents(chunks, embeddings)
-                self.vector_store.save_local(VECTOR_STORE_PATH)
-            except Exception as e:
-                print(f"Error loading documents with DirectoryLoader: {str(e)}")
-                # Initialize with minimal data
-                self.vector_store = FAISS.from_texts(["No medical knowledge available yet."], embeddings)
-                self.vector_store.save_local(VECTOR_STORE_PATH)
-    def _initialize_llm(self):
-        """Initialize the language model using HuggingFaceEndpoint"""
-        print(f"Initializing LLM with {MODEL_NAME}...")
-        try:
-            self.llm = HuggingFaceEndpoint(
-                repo_id=MODEL_NAME,
-                task="text-generation",
-                temperature=0.5,
-                token=HF_TOKEN,
-                model_kwargs={"max_length": 512}
-            )
-        except Exception as e:
-            print(f"Error initializing HuggingFaceEndpoint: {str(e)}")
-            # Fallback to a simpler model if needed
-            fallback_model = "google/flan-t5-large"
-            print(f"Falling back to {fallback_model}")
-            self.llm = HuggingFaceEndpoint(
-                repo_id=fallback_model,
-                task="text-generation",
-                temperature=0.5
-            )
-    def _setup_qa_chain(self):
-        """Set up the question-answering chain"""
-        # Define a custom prompt template for medical analysis
-        template = """
-        You are a medical assistant analyzing patient medical reports. Use the following pieces of context to answer the question. If you don't know the answer, just say that you don't know, don't try to make up an answer.
-        Also summarize your answer strictly in not more than 350 words and keep the language of your answer simple and easy to understand. Make sure you use easy and simple terms for explanation. Each important point should be stated only once.
-        Patient Report Summary: {patient_data}
-        Context from medical knowledge base: {context}
-        Question: {question}
-        Start the answer directly:
-        """
-        # Create prompt with correct variable names
-        prompt = PromptTemplate(
-            template=template,
-            input_variables=["context", "question", "patient_data"]
-        )
-        # Setup the retriever
-        retriever = self.vector_store.as_retriever(search_kwargs={"k": 3})
-        # Create the QA chain with fixed parameters
-        self.qa_chain = RetrievalQA.from_chain_type(
-            llm=self.llm,
-            chain_type="stuff",
-            retriever=retriever,
-            return_source_documents=False,
-            chain_type_kwargs={"prompt": prompt}
         )
-    def remove_header_information(self, text):
-        """Remove header information from the report text"""
-        # Store the original text
-        self.original_report_data = text
-        # Split the text into lines to analyze
-        lines = text.split('\n')
-        # Define patterns to identify header information
-        header_patterns = [
-            r'(Name\s*:)',
-            r'(Patient\s*Name\s*:)',
-            r'(DOB|Date of Birth\s*:)',
-            r'(Age\s*:)',
-            r'(Gender\s*:)',
-            r'(Lab No\.|Laboratory Number\s*:)',
-            r'(Patient ID\s*:)',
-            r'(Report Status\s*:)',
-            r'(Ref By|Referred By\s*:)',
-            r'(Collected\s*:)',
-            r'(Reported\s*:)',
-            r'(A/c Status\s*:)',
-            r'(Processed at\s*:)',
-            r'(Collected at\s*:)',
-            r'(Address\s*:)',
-            r'(Phone|Mobile|Mob\s*:)',
-        ]
-        # Create a regex pattern that matches any of the header patterns
-        combined_pattern = '|'.join(header_patterns)
-        # Find where the actual test results begin
-        test_results_start = -1
-        for i, line in enumerate(lines):
-            if re.search(r'(Test\s*Report|Test\s*Name|Test\s*Results|Results|HEMOGRAM|ROUTINE|EXAMINATION)', line, re.IGNORECASE):
-                test_results_start = i
-                break
-        # If we couldn't find the start of test results, look for key medical terms
-        if test_results_start == -1:
-            for i, line in enumerate(lines):
-                # Look for common test result sections
-                if re.search(r'(Hemoglobin|Blood|Urine|CBC|Glucose|Cholesterol|Protein|RBC|WBC)', line, re.IGNORECASE):
-                    test_results_start = max(0, i-3)  # Start a few lines before the first test result
-                    break
-        # If we still couldn't find the start of test results, use a heuristic:
-        # Skip the first few lines which usually contain header information
-        if test_results_start == -1:
-            # Count lines with patient identifiable information
-            header_count = 0
-            for i, line in enumerate(lines):
-                if re.search(combined_pattern, line, re.IGNORECASE):
-                    header_count += 1
-            # If we found several header lines, skip those plus a few more
-            if header_count > 0:
-                test_results_start = min(header_count + 5, len(lines) // 3)
-            else:
-                # If no clear header pattern was found, just skip the first 10% of lines as a fallback
-                test_results_start = max(1, len(lines) // 10)
-        # Return text from the determined start point
-        clean_text = '\n'.join(lines[test_results_start:])
-        # If this dramatically shortened the text, use a less aggressive approach
-        if len(clean_text) < len(text) * 0.5:
-            print("Warning: Header removal may have removed too much content. Using alternative approach.")
-            # Alternative approach: Just remove lines with header patterns
-            filtered_lines = []
-            for line in lines:
-                if not re.search(combined_pattern, line, re.IGNORECASE):
-                    filtered_lines.append(line)
-            clean_text = '\n'.join(filtered_lines)
-        return clean_text
-    def extract_text_from_pdf_pymupdf(self, pdf_path):
-        """Extract text from PDF using PyMuPDF (more robust than PyPDF)"""
-        text = ""
-        try:
-            doc = fitz.open(pdf_path)
-            for page in doc:
-                text += page.get_text()
-            doc.close()
-            return text
-        except Exception as e:
-            print(f"PyMuPDF extraction error: {str(e)}")
-            return None
-    def extract_text_from_pdf_pypdf(self, pdf_path):
-        """Extract text using PyPDF as a backup method"""
-        try:
-            loader = PyPDFLoader(pdf_path)
-            pages = loader.load()
-            return "\n".join([page.page_content for page in pages])
-        except Exception as e:
-            print(f"PyPDF extraction error: {str(e)}")
-            return None
-    def process_user_report(self, report_file):
-        """Process the uploaded medical report with multiple fallback methods"""
-        if report_file is None:
-            return "No file uploaded. Please upload a medical report."
-        # Ensure the uploaded file is read as bytes
-        temp_dir = tempfile.mkdtemp()
-        try:
-            # Copy the uploaded file to the temp directory
-            temp_file_path = os.path.join(temp_dir, "user_report.pdf")
-            # Handle file based on its type
-            try:
-                if isinstance(report_file, str):  # If it's a file path
-                    shutil.copy(report_file, temp_file_path)
-                elif hasattr(report_file, 'name'):  # Gradio file object
-                    with open(temp_file_path, 'wb') as f:
-                        with open(report_file.name, 'rb') as source:
-                            f.write(source.read())
-                else:  # Try to handle as bytes or file-like object
-                    with open(temp_file_path, 'wb') as f:
-                        f.write(report_file.read() if hasattr(report_file, 'read') else report_file)
-            except Exception as e:
-                print(f"Error saving file: {str(e)}")
-                return f"Error saving the uploaded file: {str(e)}"
-            # Try multiple methods to extract text from the PDF
-            text = None
-            # Method 1: PyMuPDF
-            text = self.extract_text_from_pdf_pymupdf(temp_file_path)
-            # Method 2: PyPDF as fallback
-            if not text:
-                text = self.extract_text_from_pdf_pypdf(temp_file_path)
-            # Method 3: Last resort - try to read as raw text
-            if not text:
-                try:
-                    with open(temp_file_path, 'r', errors='ignore') as f:
-                        text = f.read()
-                except Exception as e:
-                    print(f"Raw text reading error: {str(e)}")
-            # If we got text, process it
-            if text and len(text.strip()) > 0:
-                # Remove header information from the text
-                cleaned_text = self.remove_header_information(text)
-                # Store the cleaned text
-                self.user_report_data = cleaned_text
-                # Split into chunks if needed
-                text_splitter = RecursiveCharacterTextSplitter(
-                    chunk_size=1000,
-                    chunk_overlap=200,
-                    length_function=len
-                )
-                chunks = text_splitter.split_text(cleaned_text)
-                # Check if too much text was removed
-                original_length = len(text.strip())
-                cleaned_length = len(cleaned_text.strip())
-                removal_percentage = (original_length - cleaned_length) / original_length * 100
-                if removal_percentage > 80:
-                    return f"Report processed successfully, but significant content may have been filtered. Original length: {original_length} chars. Cleaned length: {cleaned_length} chars. Extracted approximately {len(chunks)} text chunks."
-                else:
-                    return f"Report processed successfully. Removed approximately {removal_percentage:.1f}% of header content. Extracted {len(chunks)} text chunks."
-            else:
-                self.user_report_data = "Unable to extract text from the provided PDF. This is an empty report placeholder."
-                return "Warning: Could not extract text from the PDF. The file may be corrupted, password-protected, or contain only images. Processing will continue with limited data."
-        finally:
-            # Clean up the temporary directory and file
-            shutil.rmtree(temp_dir)
-    def answer_question(self, question):
-        """Answer a question based on the uploaded report and knowledge base"""
-        if not self.user_report_data or self.user_report_data == "No report data available.":
-            return "No report has been processed or text extraction failed. Please upload a medical report first."
-        # Check if question is about patient demographics or identification
-        demographic_patterns = [
-            r'(patient|name|age|gender|birth|dob|address|phone|contact|id|identification)',
-            r'(doctor|physician|referring|referred by)',
-            r'(date|time|collected|processed|reported)',
-            r'(lab|laboratory|number|id)'
-        ]
-        combined_demo_pattern = '|'.join(demographic_patterns)
-        # If question might be about demographics, check if we need to use original data
-        if re.search(combined_demo_pattern, question, re.IGNORECASE):
-            # For demographic questions, check if it's asking for specific identification
-            specific_id_patterns = [
-                r'(name of|patient name|who is|what is the name)',
-                r'(exact age|age of|how old)',
-                r'(address of|where|location|contact details)',
-                r'(doctor name|name of doctor|referring doctor|who referred)',
-                r'(date of|when was|time of|report date)',
-                r'(lab number|patient id|identification number)'
-            ]
-            specific_id_pattern = '|'.join(specific_id_patterns)
-            # If it's a direct question about patient identity, don't answer
-            if re.search(specific_id_pattern, question, re.IGNORECASE):
-                return "I'm unable to provide specific patient identification information. This feature is disabled to protect patient privacy. Please ask about medical test results or interpretations instead."
-        # Try using the QA chain with proper error handling
         try:
-            # Pass the query to the qa_chain along with the patient data
-            response = self.qa_chain({"query": question, "patient_data": self.user_report_data})
-            # Extract the answer from the response
-            if isinstance(response, dict) and "result" in response:
-                # Get the raw result
-                result = response["result"]
-                # Process like in the chatbot code - remove duplicates
-                sentences = [s.strip() for s in result.split('.') if s.strip()]
-                unique_sentences = list(OrderedDict.fromkeys(sentences))
-                cleaned_result = '. '.join(unique_sentences)
-                # Add period if needed
-                if cleaned_result and not cleaned_result.endswith('.'):
-                    cleaned_result += '.'
-                return cleaned_result
-            else:
-                return str(response)
-        except Exception as e:
-            print(f"Error in QA chain: {str(e)}")
-            # Log details about the error for debugging
-            print(f"QA chain type: {type(self.qa_chain).__name__}")
-            # Fallback to direct LLM call
-            try:
-                direct_prompt = f"""
-                Act as an expert doctor who performs medical report analysis accurately. Analyze the given patient data and provide me the answer to the question asked about the medical report in strictly less than 200 words.
-                NOTE : I ONLY WANT THE ANSWER FROM YOU, DO NOT GIVE ME THE PATIENT REPORT DETAILS AND THE QUESTIONS WHICH I ASKED IN YOUR ANSWERS.
-                Also use simple and easy to understand terms in your answer and keep your answer in easy to understand language.
-                Question about medical report: {question}
-                Patient data available: {self.user_report_data[:800]}... (truncated)
-                Please answer based on this information:
-                """
-                direct_result = self.llm(direct_prompt)
-                return f"Fallback answer {direct_result}"
-            except Exception as fallback_error:
-                print(f"Fallback also failed: {str(fallback_error)}")
-                return f"Error processing your question. Please try a different question or report."
-# Initialize the analyzer
-analyzer = MedicalReportAnalyzer()
-# FastAPI app
-app = FastAPI()
-# CORS support for frontend testing
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-@app.post("/process_user_report")
-async def process_user_report(report_file: UploadFile = File(...)):
-    try:
-        temp_dir = tempfile.mkdtemp()
-        temp_file_path = os.path.join(temp_dir, report_file.filename)
-        with open(temp_file_path, "wb") as f:
-            shutil.copyfileobj(report_file.file, f)
-        result = analyzer.process_user_report(temp_file_path)
-        return {"status": "success", "message": result}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-    finally:
-        report_file.file.close()
-        shutil.rmtree(temp_dir, ignore_errors=True)
-@app.post("/answer_question")
-async def answer_question(request: Request):
-    try:
-        data = await request.json()
-        question = data.get("question", "").strip()
-        if not question:
-            raise HTTPException(status_code=400, detail="Question is required")
-        answer = analyzer.answer_question(question)
-        return {"status": "success", "answer": answer}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=str(e))
-# Optional: Keep Gradio interface for debugging or UI testing
 if __name__ == "__main__":
-    with gr.Blocks(title="Medical Report Analyzer") as demo:
-        gr.Markdown("# Medical Report Analyzer")
-        gr.Markdown("Upload your medical report and ask questions about it. The system will analyze your report and provide answers based on medical knowledge.")
-        with gr.Row():
-            with gr.Column(scale=1):
-                report_file = gr.File(label="Upload Medical Report (PDF)")
-                upload_button = gr.Button("Process Report")
-                upload_output = gr.Textbox(label="Processing Status")
-            with gr.Column(scale=2):
-                question_input = gr.Textbox(label="Ask a question about your report")
-                answer_button = gr.Button("Get Answer")
-                answer_output = gr.Textbox(label="Answer")
-        upload_button.click(fn=analyzer.process_user_report, inputs=[report_file], outputs=[upload_output])
-        answer_button.click(fn=analyzer.answer_question, inputs=[question_input], outputs=[answer_output])
-    demo.launch(
-        share=True,
-        favicon_path="favicon.ico" if os.path.exists("favicon.ico") else None,
-        server_name="0.0.0.0",
-        server_port=7860
-    )

 import os
 import gradio as gr
+import requests
+import tempfile
+from fastapi import FastAPI, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from langchain_community.vectorstores import FAISS
 from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint
+from langchain.chains import RetrievalQA
+from langchain_core.prompts import PromptTemplate
+from langchain_community.document_loaders import PyPDFLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from collections import OrderedDict
+# Retrieve HF_TOKEN from environment
+HF_TOKEN = os.environ.get("HF_TOKEN")
 # Constants
 CACHE_DIR = "/tmp/models_cache"
+DB_FAISS_PATH = "/tmp/vectorstore/db_faiss"
+USER_REPORT_DB_PATH = "/tmp/vectorstore/user_report_db"
+HUGGINGFACE_REPO_ID = "microsoft/Phi-3-mini-4k-instruct"
+# Create directories
 os.makedirs(CACHE_DIR, exist_ok=True)
+os.makedirs(os.path.dirname(DB_FAISS_PATH), exist_ok=True)
+os.makedirs(os.path.dirname(USER_REPORT_DB_PATH), exist_ok=True)
+# Initialize FastAPI app
+app = FastAPI()
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Load the embedding model
+embedding_model = HuggingFaceEmbeddings(
+    model_name="rishi002/all-MiniLM-L6-v2",
+    cache_folder=CACHE_DIR
+)
+# Global variables to track report status and database
+user_report_processed = False
+user_report_db = None
+# Load LLM
+def load_llm():
+    """Create the text-generation endpoint client for HUGGINGFACE_REPO_ID.
+
+    The access token is supplied through the endpoint's own
+    ``huggingfacehub_api_token`` field rather than inside ``model_kwargs``:
+    ``model_kwargs`` is treated as extra generation parameters, so a token
+    placed there is not used for authentication and ends up alongside the
+    sampling settings.
+
+    Returns:
+        A HuggingFaceEndpoint configured with temperature 0.5.
+    """
+    return HuggingFaceEndpoint(
+        repo_id=HUGGINGFACE_REPO_ID,
+        task="text-generation",
+        temperature=0.5,
+        huggingfacehub_api_token=HF_TOKEN,
+        # NOTE(review): "max_length" is kept as originally written; confirm the
+        # serving backend honours it (the endpoint also exposes max_new_tokens).
+        model_kwargs={"max_length": 512}
+    )
+# Custom prompt template for medical report analysis
+MEDICAL_REPORT_PROMPT = """
+You are a helpful medical assistant analyzing a patient's medical report.
+Use only the information provided in the context to answer the user's question.
+If you don't know the answer based on the given context, simply state that you don't have enough information.
+Don't make up any medical information or conclusions not supported by the report.
+Provide concise, clear explanations in simple language that a patient can understand.
+Avoid using complex medical terminology unless necessary, and if used, briefly explain what it means.
+Keep your answer concise and focused on the question asked.
+Context: {context}
+Question: {question}
+Start the answer directly without repeating the question.
+"""
+# Function to download and process PDF from URL
+def process_pdf_from_url(pdf_url, timeout=30):
+    """Download a PDF, index its text with FAISS, and save the index to disk.
+
+    Args:
+        pdf_url: URL of the PDF report to download.
+        timeout: Seconds to wait for the HTTP request before giving up.
+            Without a timeout a stalled server would block this call forever.
+
+    Returns:
+        True when the index was written to USER_REPORT_DB_PATH, False when any
+        step (download, parsing, splitting, indexing) raised an exception.
+    """
+    temp_path = None
+    try:
+        # Download the PDF from the URL
+        response = requests.get(pdf_url, timeout=timeout)
+        response.raise_for_status()  # Raise exception for bad status codes
+        # Create a temporary file to save the PDF
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_pdf:
+            # Record the path before writing so the finally-clause can remove
+            # the file even if the write itself fails.
+            temp_path = temp_pdf.name
+            temp_pdf.write(response.content)
+        # Load the PDF
+        loader = PyPDFLoader(temp_path)
+        documents = loader.load()
+        # Split documents into chunks
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=200
         )
+        text_chunks = text_splitter.split_documents(documents)
+        # Create vector database from the text chunks
+        db = FAISS.from_documents(text_chunks, embedding_model)
+        db.save_local(USER_REPORT_DB_PATH)
+        return True
+    except Exception as e:
+        print(f"Error processing PDF: {str(e)}")
+        return False
+    finally:
+        # Clean up the temporary file on every path, not only on success;
+        # delete=False means nothing else will remove it.
+        if temp_path is not None and os.path.exists(temp_path):
+            os.unlink(temp_path)
+# Create QA chain for user report
+def create_user_report_qa_chain():
+    """Build a RetrievalQA chain over the FAISS index at USER_REPORT_DB_PATH.
+
+    Returns:
+        None when nothing exists at USER_REPORT_DB_PATH; otherwise a "stuff"
+        RetrievalQA chain that retrieves the top 3 chunks and formats them
+        with MEDICAL_REPORT_PROMPT. A new LLM client is created through
+        load_llm() on every call.
+    """
+    if not os.path.exists(USER_REPORT_DB_PATH):
+        return None
+    # NOTE(review): allow_dangerous_deserialization=True opts in to loading a
+    # serialized index from disk. process_pdf_from_url writes this path, but
+    # confirm nothing untrusted can write under /tmp/vectorstore.
+    db = FAISS.load_local(USER_REPORT_DB_PATH, embedding_model, allow_dangerous_deserialization=True)
+    prompt = PromptTemplate(template=MEDICAL_REPORT_PROMPT, input_variables=["context", "question"])
+    return RetrievalQA.from_chain_type(
+        llm=load_llm(),
+        chain_type="stuff",
+        retriever=db.as_retriever(search_kwargs={'k': 3}),
+        return_source_documents=False,
+        chain_type_kwargs={'prompt': prompt}
+    )
+# API Models
+# Request body for /api/process-report: the location of the PDF to analyse.
+class ReportURL(BaseModel):
+    # URL passed unchanged to process_pdf_from_url.
+    url: str
+# Request body for /api/ask-question: the user's question about the report.
+class Question(BaseModel):
+    # Question text forwarded to the QA chain under the 'query' key.
+    query: str
+# Combined API endpoint to process a PDF report from a URL and return status
+@app.post("/api/process-report")
+async def process_report(report_data: ReportURL):
+    # Downloads and indexes the PDF at report_data.url, then rebuilds the
+    # module-level QA chain. Both outcomes are returned as a normal JSON body;
+    # the "status" and "processed" fields tell the caller which one occurred.
+    # NOTE(review): process_pdf_from_url is synchronous (blocking HTTP download
+    # and embedding) and is called directly from this async handler.
+    global user_report_processed, user_report_db
+    # Process the PDF from the URL
+    success = process_pdf_from_url(report_data.url)
+    if success:
+        user_report_processed = True
+        # Despite its name, user_report_db holds the RetrievalQA chain (or None).
+        user_report_db = create_user_report_qa_chain()
+        return {
+            "status": "success",
+            "message": "Medical report data extracted successfully",
+            "processed": True
+        }
+    else:
+        # user_report_db is left as-is; the flag alone blocks further questions.
+        user_report_processed = False
+        return {
+            "status": "error",
+            "message": "Failed to process the medical report",
+            "processed": False
+        }
+# API endpoint to ask questions about the processed report
+@app.post("/api/ask-question")
+async def ask_question(question_data: Question):
+    # Answers question_data.query with the QA chain built from the last
+    # successfully processed report and returns {"answer": <text>}.
+    # Raises HTTP 400 when no report has been processed and HTTP 500 when the
+    # chain call fails.
+    global user_report_db, user_report_processed
+    if not user_report_processed or user_report_db is None:
+        raise HTTPException(status_code=400, detail="No medical report has been processed yet")
+    import re  # local import: the module's import block does not provide `re`
+    try:
+        # Get answer from the QA chain
+        response = user_report_db.invoke({'query': question_data.query})
+        # Get the raw result
+        result = response["result"]
+        # Remove repeated sentences. Split only where sentence-ending
+        # punctuation is followed by whitespace: splitting on every '.' would
+        # break decimal values such as "13.5 g/dL" into "13. 5 g/dL".
+        seen = set()
+        unique_sentences = []
+        for sentence in re.split(r'(?<=[.!?])\s+', result.strip()):
+            sentence = sentence.strip()
+            # Compare without trailing punctuation so "X." and "X" count as one.
+            key = sentence.rstrip('.!?').strip()
+            if key and key not in seen:
+                seen.add(key)
+                unique_sentences.append(sentence)
+        cleaned_result = ' '.join(unique_sentences)
+        # Keep the original guarantee that a non-empty answer ends with punctuation.
+        if cleaned_result and cleaned_result[-1] not in '.!?':
+            cleaned_result += '.'
+        return {"answer": cleaned_result}
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Error processing question: {str(e)}")
+# Gradio Interface
+with gr.Blocks() as iface:
+    # Two-step UI: first index a report from a URL, then ask questions about it.
+    gr.Markdown("# Medical Report Analysis")
+    with gr.Row():
+        with gr.Column():
+            pdf_url_input = gr.Textbox(label="Enter PDF Report URL")
+            process_button = gr.Button("Analyze Report")
+            status_text = gr.Textbox(label="Status", interactive=False)
+    with gr.Row():
+        with gr.Column():
+            query_input = gr.Textbox(label="Ask a question about your report")
+            query_button = gr.Button("Submit Question")
+            answer_output = gr.Textbox(label="Answer", interactive=False)
+    def process_report_gradio(url):
+        """Index the PDF at *url* and return a status message for the UI."""
+        global user_report_processed, user_report_db
+        if not url:
+            return "Please enter a valid URL"
+        success = process_pdf_from_url(url)
+        if success:
+            user_report_processed = True
+            user_report_db = create_user_report_qa_chain()
+            return "Medical report data extracted successfully. You can now ask questions about your report."
+        else:
+            user_report_processed = False
+            return "Failed to process the medical report. Please check the URL and try again."
+    def ask_question_gradio(query):
+        """Answer *query* from the processed report; errors are returned as text."""
+        global user_report_db, user_report_processed
+        if not user_report_processed or user_report_db is None:
+            return "No medical report has been processed yet. Please upload and analyze a report first."
+        import re  # local import: the module's import block does not provide `re`
         try:
+            # Get answer from the QA chain
+            response = user_report_db.invoke({'query': query})
+            # Get the raw result
+            result = response["result"]
+            # Remove repeated sentences. Split only where sentence-ending
+            # punctuation is followed by whitespace: splitting on every '.'
+            # would break decimal values such as "13.5 g/dL" into "13. 5 g/dL".
+            seen = set()
+            unique_sentences = []
+            for sentence in re.split(r'(?<=[.!?])\s+', result.strip()):
+                sentence = sentence.strip()
+                # Compare without trailing punctuation so "X." and "X" count as one.
+                key = sentence.rstrip('.!?').strip()
+                if key and key not in seen:
+                    seen.add(key)
+                    unique_sentences.append(sentence)
+            cleaned_result = ' '.join(unique_sentences)
+            # Keep the original guarantee that a non-empty answer ends with punctuation.
+            if cleaned_result and cleaned_result[-1] not in '.!?':
+                cleaned_result += '.'
+            return cleaned_result
+        except Exception as e:
+            return f"Error: {str(e)}"
+    process_button.click(
+        fn=process_report_gradio,
+        inputs=pdf_url_input,
+        outputs=status_text
+    )
+    query_button.click(
+        fn=ask_question_gradio,
+        inputs=query_input,
+        outputs=answer_output
+    )
+# Mount the Gradio app to FastAPI
+app = gr.mount_gradio_app(app, iface, path="/")
+# Run the app with uvicorn
 if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)