# -*- coding: utf-8 -*-
"""CiPE_Streamlit

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jACLFXfsdWM59lrfTQGcZVsTIHBO92R8
"""

# Om Maa

!pip install langchain predictionguard lancedb html2text sentence-transformers PyPDF2
!pip install huggingface_hub
!pip install transformers
!pip install sentencepiece
!pip install streamlit

import os
import re
import urllib.request

import html2text
import lancedb
import numpy as np
import pandas as pd
import predictionguard as pg
import streamlit as st
from lancedb.embeddings import with_embeddings
from langchain import PromptTemplate, FewShotPromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

os.environ['PREDICTIONGUARD_TOKEN'] = "q1VuOjnffJ3NO2oFN8Q9m8vghYc84ld13jaqdF7E"

# --- Streamlit app ---

# Sidebar inputs for the patient's name, age, gender, and ethnicity
name = st.sidebar.text_input('Name')
age = st.sidebar.number_input('Age', min_value=0, max_value=120, step=1)
gender = st.sidebar.selectbox('Gender', ['Male', 'Female', 'Other'])
ethnicity = st.sidebar.text_input('Ethnicity')

# Main container
with st.form(key='patient_form'):
    # Text input for the disease
    disease = st.text_area('DISEASE', height=100)
    # Text input for prescriptions (the source of the drug names)
    prescriptions = st.text_area('PRESCRIPTIONS', height=100)
    # Text input for additional information
    additional_info = st.text_area('ADDITIONAL INFO', height=100)
    # Submit button for the form
    submit_button = st.form_submit_button(label='Predict Drug Effects')

# --- PDF ingestion ---

# Replace with the path to your PDF file
pdf_path = '/content/drug_side_effects_summary_cleaned.pdf'
reader = PdfReader(pdf_path)

# Accumulate the text of every page. extract_text() can return None for
# pages without an extractable text layer, so fall back to an empty string.
text = ''
for page in reader.pages:
    text += (page.extract_text() or "") + "\n"

# Now `text` contains the text content of the PDF.
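# Illustrative guard (an addition, safe to remove): an empty string here
# usually means the PDF is a scan with no extractable text layer, in which
# case the RAG pipeline below would have nothing to index.
if not text.strip():
    raise ValueError(f"No extractable text found in {pdf_path}")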
print(text[:500])  # Example: print the first 500 characters to understand the structure

# --- Text cleaning ---

def clean_text(text):
    """Correct unwanted line breaks and whitespace in the extracted text."""
    text = re.sub(r'-\n', '', text)   # Remove hyphenation
    text = re.sub(r'\n', ' ', text)   # Replace new lines with spaces
    text = re.sub(r'\s+', ' ', text)  # Collapse repeated whitespace
    return text.strip()               # Remove leading and trailing spaces

cleaned_text = clean_text(text)
print(cleaned_text[:500])  # Inspect a portion of the cleaned text to verify the cleaning

# --- Chunking ---

def chunk_text(text, chunk_size=700, overlap=50):
    """Chunk text with the specified size and overlap using standard Python."""
    chunks = []
    start = 0
    while start < len(text):
        # After the first chunk, step back 'overlap' characters for context
        if start > 0:
            start -= overlap
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size
    return chunks

# Chunk the cleaned text into smaller pieces for LLM input
docs = chunk_text(cleaned_text, chunk_size=700, overlap=50)

# Display the first few chunks to verify the result
chunks_to_display = 3
print(docs[:chunks_to_display])

# Reformat the chunks to avoid prompt template conflicts
docs = [x.replace('#', '-') for x in docs]

# --- Embeddings setup ---

# Use a dedicated variable for the model name: reusing `name` here would
# clobber the patient name collected in the sidebar above.
model_name = "all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

# Embedding functions
def embed_batch(batch):
    return [model.encode(sentence, show_progress_bar=True) for sentence in batch]

def embed(sentence):
    return model.encode(sentence)

# --- LanceDB setup ---

# Create the LanceDB directory if it does not already exist
lancedb_dir = ".lancedb"
if not os.path.exists(lancedb_dir):
    os.mkdir(lancedb_dir)
db = lancedb.connect(lancedb_dir)

# Prepare metadata for embedding: one row per chunk
metadata = [[i, chunk] for i, chunk in enumerate(docs)]
doc_df = pd.DataFrame(metadata, columns=["chunk", "text"])

# Embed the documents
data = with_embeddings(embed_batch, doc_df)

# Create the table on the first run and reuse it afterwards, so the same
# rows are not inserted twice. Adjust "pdf_data" to match your dataset/table name.
if "pdf_data" not in db.table_names():
    table = db.create_table("pdf_data", data=data)
else:
    table = db.open_table("pdf_data")

# Smoke-test the retrieval
message = "What are the side effects of doxycycline for treating Acne?"
results = table.search(embed(message)).limit(5).to_pandas()
# print(results.head())
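# Illustrative inspection (an addition): in this version of lancedb the
# search result frame carries the embedded columns plus a `_distance`
# similarity score, so you can eyeball which chunks were retrieved and how
# close they are to the query.
print(results[['chunk', '_distance']].head())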
# --- RAG answer functions ---

# Parse the prescriptions field to extract the drug names. Splitting on
# commas is one example; the actual extraction depends on how the
# prescriptions are entered. `disease` already comes from the form above.
drug_names = [d.strip() for d in prescriptions.split(',') if d.strip()]

def rag_answer_drug_side_effects(name, drug_names, disease):
    # Formulate a question about drug side effects
    drug_list = ', '.join(drug_names)
    message = (
        f"What are the potential side effects of using {drug_list} for treating "
        f"{disease}? Please provide a list of side effects specific to the use "
        f"of these drugs for this disease in a patient like {name}."
    )

    # Search the database for relevant context
    results = table.search(embed(message)).limit(10).to_pandas()
    results.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = results['text'].iloc[0]  # Use the most relevant document

    # Define the prompt template
    template = """### Instruction:
Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.
Read the input context below and respond with a mid-length answer to the given question.
If you cannot find an exact answer, then look up something close to the medication and disease.

### Input:
Context: {context}

Question: {question}

### Response:
"""

    # Augment the prompt with the retrieved context
    prompt = template.format(name=name, context=context, question=message)

    # Get a response from the LLM
    result = pg.Completion.create(
        model="Neural-Chat-7B",
        prompt=prompt,
    )
    return result['choices'][0]['text']

def rag_answer_drug_benefit_effects(name, drug_names, disease):
    # Formulate a question about drug benefits
    drug_list = ', '.join(drug_names)
    message = (
        f"What are the potential benefits of using {drug_list} for treating "
        f"{disease}? Please provide a list of benefits specific to the use "
        f"of these drugs for this disease in a patient like {name}."
    )

    # Search the database for relevant context
    results = table.search(embed(message)).limit(10).to_pandas()
    results.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = results['text'].iloc[0]  # Use the most relevant document

    # Define the prompt template
    template = """### Instruction:
Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.
Read the input context below and respond with a mid-length answer to the given question.
If you cannot find an exact answer, then look up something close to the medication and disease.

### Input:
Context: {context}

Question: {question}

### Response:
"""

    # Augment the prompt with the retrieved context
    prompt = template.format(name=name, context=context, question=message)

    # Get a response from the LLM
    result = pg.Completion.create(
        model="Neural-Chat-7B",
        prompt=prompt,
    )
    return result['choices'][0]['text']
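# The two helpers above are identical except for the aspect they ask about
# ("side effects" vs. "benefits"). A minimal consolidation sketch — an
# optional refactor, not part of the original app; `rag_answer` and its
# `aspect` parameter are names introduced here for illustration only:
def rag_answer(name, drug_names, disease, aspect):
    """Shared retrieval-and-prompt path; `aspect` is 'side effects' or 'benefits'."""
    drug_list = ', '.join(drug_names)
    question = (
        f"What are the potential {aspect} of using {drug_list} for treating "
        f"{disease}? Please provide a list of {aspect} specific to the use "
        f"of these drugs for this disease in a patient like {name}."
    )
    hits = table.search(embed(question)).limit(10).to_pandas()
    hits.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = hits['text'].iloc[0]
    prompt = (
        "### Instruction:\n"
        f"Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.\n"
        "Read the input context below and respond with a mid-length answer to the given question.\n\n"
        "### Input:\n"
        f"Context: {context}\n\n"
        f"Question: {question}\n\n"
        "### Response:\n"
    )
    result = pg.Completion.create(model="Neural-Chat-7B", prompt=prompt)
    return result['choices'][0]['text']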
# The form's submit button returns True on the run in which it was clicked
if submit_button:
    # Validate input and handle errors/exceptions as necessary
    try:
        side_effects_response = rag_answer_drug_side_effects(name, drug_names, disease)
        benefits_response = rag_answer_drug_benefit_effects(name, drug_names, disease)
        st.write("Side Effects:", side_effects_response)
        st.write("Benefits:", benefits_response)
    except Exception as e:
        st.error(f"An error occurred: {e}")

# --- Hugging Face Hub (optional) ---

from huggingface_hub import notebook_login, Repository

notebook_login()

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Repo id of the fine-tuned checkpoint on the Hub
checkpoint_path = "filius-Dei/CiPE"

# # Load the model (note: uncommenting this rebinds `model`, which is also
# # the name of the SentenceTransformer used by embed() above)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
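# To launch the app (standard Streamlit usage; the filename is an assumption
# about where you save this script):
#   streamlit run cipe_streamlit.py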