# -*- coding: utf-8 -*-
"""CiPE_Streamlit

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jACLFXfsdWM59lrfTQGcZVsTIHBO92R8
"""

# Om Maa

!pip install langchain predictionguard lancedb html2text sentence-transformers PyPDF2
!pip install huggingface_hub
!pip install transformers
!pip install sentencepiece
!pip install streamlit

import os
import re
import urllib.request

import html2text
import lancedb
import numpy as np
import pandas as pd
import predictionguard as pg
import streamlit as st
from lancedb.embeddings import with_embeddings
from langchain import PromptTemplate, FewShotPromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer

os.environ['PREDICTIONGUARD_TOKEN'] = "q1VuOjnffJ3NO2oFN8Q9m8vghYc84ld13jaqdF7E"

# --- Streamlit app ---

# Sidebar inputs for the patient's name, age, gender, and ethnicity
name = st.sidebar.text_input('Name')
age = st.sidebar.number_input('Age', min_value=0, max_value=120, step=1)
gender = st.sidebar.selectbox('Gender', ['Male', 'Female', 'Other'])
ethnicity = st.sidebar.text_input('Ethnicity')

# Main container
with st.form(key='patient_form'):
    # Text input for the disease
    disease = st.text_area('DISEASE', height=100)
    # Text input for prescriptions (the source of the drug names)
    prescriptions = st.text_area('PRESCRIPTIONS', height=100)
    # Text input for additional information
    additional_info = st.text_area('ADDITIONAL INFO', height=100)
    # Submit button for the form
    submit_button = st.form_submit_button(label='Predict Drug Effects')

# --- PDF ingestion ---

# Replace with the path to your PDF file
pdf_path = '/content/drug_side_effects_summary_cleaned.pdf'
reader = PdfReader(pdf_path)

# Accumulate the text of every page. extract_text() can return None for
# pages without an extractable text layer, so fall back to an empty string.
text = ''
for page in reader.pages:
    text += (page.extract_text() or "") + "\n"

# Now `text` contains the text content of the PDF.
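# Illustrative guard (an addition, safe to remove): an empty string here
# usually means the PDF is a scan with no extractable text layer, in which
# case the RAG pipeline below would have nothing to index.
if not text.strip():
    raise ValueError(f"No extractable text found in {pdf_path}")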
print(text[:500])  # Example: print the first 500 characters to understand the structure

# --- Text cleaning ---

def clean_text(text):
    """Correct unwanted line breaks and whitespace in the extracted text."""
    text = re.sub(r'-\n', '', text)   # Remove hyphenation
    text = re.sub(r'\n', ' ', text)   # Replace new lines with spaces
    text = re.sub(r'\s+', ' ', text)  # Collapse repeated whitespace
    return text.strip()               # Remove leading and trailing spaces

cleaned_text = clean_text(text)
print(cleaned_text[:500])  # Inspect a portion of the cleaned text to verify the cleaning

# --- Chunking ---

def chunk_text(text, chunk_size=700, overlap=50):
    """Chunk text with the specified size and overlap using standard Python."""
    chunks = []
    start = 0
    while start < len(text):
        # After the first chunk, step back 'overlap' characters for context
        if start > 0:
            start -= overlap
        end = start + chunk_size
        chunks.append(text[start:end])
        start += chunk_size
    return chunks

# Chunk the cleaned text into smaller pieces for LLM input
docs = chunk_text(cleaned_text, chunk_size=700, overlap=50)

# Display the first few chunks to verify the result
chunks_to_display = 3
print(docs[:chunks_to_display])

# Reformat the chunks to avoid prompt template conflicts
docs = [x.replace('#', '-') for x in docs]

# --- Embeddings setup ---

# Use a dedicated variable for the model name: reusing `name` here would
# clobber the patient name collected in the sidebar above.
model_name = "all-MiniLM-L12-v2"
model = SentenceTransformer(model_name)

# Embedding functions
def embed_batch(batch):
    return [model.encode(sentence, show_progress_bar=True) for sentence in batch]

def embed(sentence):
    return model.encode(sentence)

# --- LanceDB setup ---

# Create the LanceDB directory if it does not already exist
lancedb_dir = ".lancedb"
if not os.path.exists(lancedb_dir):
    os.mkdir(lancedb_dir)
db = lancedb.connect(lancedb_dir)

# Prepare metadata for embedding: one row per chunk
metadata = [[i, chunk] for i, chunk in enumerate(docs)]
doc_df = pd.DataFrame(metadata, columns=["chunk", "text"])

# Embed the documents
data = with_embeddings(embed_batch, doc_df)

# Create the table on the first run and reuse it afterwards, so the same
# rows are not inserted twice. Adjust "pdf_data" to match your dataset/table name.
if "pdf_data" not in db.table_names():
    table = db.create_table("pdf_data", data=data)
else:
    table = db.open_table("pdf_data")

# Smoke-test the retrieval
message = "What are the side effects of doxycycline for treating Acne?"
results = table.search(embed(message)).limit(5).to_pandas()
# print(results.head())
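# Illustrative inspection (an addition): in this version of lancedb the
# search result frame carries the embedded columns plus a `_distance`
# similarity score, so you can eyeball which chunks were retrieved and how
# close they are to the query.
print(results[['chunk', '_distance']].head())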
# --- RAG answer functions ---

# Parse the prescriptions field to extract the drug names. Splitting on
# commas is one example; the actual extraction depends on how the
# prescriptions are entered. `disease` already comes from the form above.
drug_names = [d.strip() for d in prescriptions.split(',') if d.strip()]

def rag_answer_drug_side_effects(name, drug_names, disease):
    # Formulate a question about drug side effects
    drug_list = ', '.join(drug_names)
    message = (
        f"What are the potential side effects of using {drug_list} for treating "
        f"{disease}? Please provide a list of side effects specific to the use "
        f"of these drugs for this disease in a patient like {name}."
    )

    # Search the database for relevant context
    results = table.search(embed(message)).limit(10).to_pandas()
    results.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = results['text'].iloc[0]  # Use the most relevant document

    # Define the prompt template
    template = """### Instruction:
Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.
Read the input context below and respond with a mid-length answer to the given question.
If you cannot find an exact answer, then look up something close to the medication and disease.

### Input:
Context: {context}

Question: {question}

### Response:
"""

    # Augment the prompt with the retrieved context
    prompt = template.format(name=name, context=context, question=message)

    # Get a response from the LLM
    result = pg.Completion.create(
        model="Neural-Chat-7B",
        prompt=prompt,
    )
    return result['choices'][0]['text']

def rag_answer_drug_benefit_effects(name, drug_names, disease):
    # Formulate a question about drug benefits
    drug_list = ', '.join(drug_names)
    message = (
        f"What are the potential benefits of using {drug_list} for treating "
        f"{disease}? Please provide a list of benefits specific to the use "
        f"of these drugs for this disease in a patient like {name}."
    )

    # Search the database for relevant context
    results = table.search(embed(message)).limit(10).to_pandas()
    results.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = results['text'].iloc[0]  # Use the most relevant document

    # Define the prompt template
    template = """### Instruction:
Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.
Read the input context below and respond with a mid-length answer to the given question.
If you cannot find an exact answer, then look up something close to the medication and disease.

### Input:
Context: {context}

Question: {question}

### Response:
"""

    # Augment the prompt with the retrieved context
    prompt = template.format(name=name, context=context, question=message)

    # Get a response from the LLM
    result = pg.Completion.create(
        model="Neural-Chat-7B",
        prompt=prompt,
    )
    return result['choices'][0]['text']
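# The two helpers above are identical except for the aspect they ask about
# ("side effects" vs. "benefits"). A minimal consolidation sketch — an
# optional refactor, not part of the original app; `rag_answer` and its
# `aspect` parameter are names introduced here for illustration only:
def rag_answer(name, drug_names, disease, aspect):
    """Shared retrieval-and-prompt path; `aspect` is 'side effects' or 'benefits'."""
    drug_list = ', '.join(drug_names)
    question = (
        f"What are the potential {aspect} of using {drug_list} for treating "
        f"{disease}? Please provide a list of {aspect} specific to the use "
        f"of these drugs for this disease in a patient like {name}."
    )
    hits = table.search(embed(question)).limit(10).to_pandas()
    hits.sort_values(by=['_distance'], inplace=True, ascending=True)
    context = hits['text'].iloc[0]
    prompt = (
        "### Instruction:\n"
        f"Start with Hi, {name}. Then give a compassionate answer in bullet points and lists.\n"
        "Read the input context below and respond with a mid-length answer to the given question.\n\n"
        "### Input:\n"
        f"Context: {context}\n\n"
        f"Question: {question}\n\n"
        "### Response:\n"
    )
    result = pg.Completion.create(model="Neural-Chat-7B", prompt=prompt)
    return result['choices'][0]['text']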
# The form's submit button returns True on the run in which it was clicked
if submit_button:
    # Validate input and handle errors/exceptions as necessary
    try:
        side_effects_response = rag_answer_drug_side_effects(name, drug_names, disease)
        benefits_response = rag_answer_drug_benefit_effects(name, drug_names, disease)
        st.write("Side Effects:", side_effects_response)
        st.write("Benefits:", benefits_response)
    except Exception as e:
        st.error(f"An error occurred: {e}")

# --- Hugging Face Hub (optional) ---

from huggingface_hub import notebook_login, Repository

notebook_login()

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Repo id of the fine-tuned checkpoint on the Hub
checkpoint_path = "filius-Dei/CiPE"

# # Load the model (note: uncommenting this rebinds `model`, which is also
# # the name of the SentenceTransformer used by embed() above)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint_path)

# # Load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
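# To launch the app (standard Streamlit usage; the filename is an assumption
# about where you save this script):
#   streamlit run cipe_streamlit.py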