Spaces:

rishi002
/

mediVedaLLM

Sleeping

App Files Community

rishi002 committed on Jun 17

Commit

930a98a

verified ·

1 Parent(s): d7ea88b

Update embeddings.py

Browse files

Files changed (1) hide show

embeddings.py +16 -126

embeddings.py CHANGED Viewed

@@ -3,161 +3,51 @@ from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
-from langchain.chains import RetrievalQA
-from langchain_core.prompts import PromptTemplate
-from langchain.llms.base import LLM
-from typing import Optional, List
-import google.generativeai as genai
 # Set Paths
 DATA_PATH = "dataFolder/"
 DB_FAISS_PATH = "/tmp/vectorstore/db_faiss"
-# Google AI API setup
-GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
-if not GOOGLE_API_KEY:
-    raise ValueError("GOOGLE_API_KEY environment variable is required!")
-genai.configure(api_key=GOOGLE_API_KEY)
-# Custom Gemini LLM wrapper for LangChain
-class GeminiLLM(LLM):
-    def __init__(self, model_name="gemini-2.0-flash"):
-        self.model = genai.GenerativeModel(model_name)
-    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
-        try:
-            response = self.model.generate_content(prompt)
-            return response.text
-        except Exception as e:
-            return f"Error generating response: {str(e)}"
-    @property
-    def _identifying_params(self):
-        return {"name": "gemini-flash"}
-    @property
-    def _llm_type(self):
-        return "gemini"
 # Step 1: Load PDF Files
-def load_pdf_files(data_path):
     loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
     documents = loader.load()
     return documents
 # Step 2: Create Chunks
 def create_chunks(documents):
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     text_chunks = text_splitter.split_documents(documents)
     return text_chunks
 # Step 3: Generate Embeddings
 def get_embedding_model():
     CACHE_DIR = "/tmp/models_cache"
     os.makedirs(CACHE_DIR, exist_ok=True)
     embedding_model = HuggingFaceEmbeddings(
         model_name="rishi002/all-MiniLM-L6-v2",
-        cache_folder="/tmp/models_cache"
     )
     return embedding_model
 # Step 4: Store Embeddings in FAISS
-def store_embeddings(text_chunks, embedding_model, db_path):
     db = FAISS.from_documents(text_chunks, embedding_model)
     db.save_local(db_path)
     return db
 # Step 5: Load FAISS Database
-def load_faiss_db(db_path, embedding_model):
-    return FAISS.load_local(db_path, embedding_model, allow_dangerous_deserialization=True)
-# Step 6: Load Gemini LLM Model
-def load_llm():
-    return GeminiLLM()
-# Step 7: Set Custom Prompt with Health Profile
-CUSTOM_PROMPT_TEMPLATE = """
-Use the provided context to answer the user's question.
-If the answer is unknown, say you don't know. Do not make up information.
-Only respond based on the context.
-Context: {context}
-User Health Profile: {health_info}
-Question: {question}
-Start your answer directly.
-"""
-def set_custom_prompt(template):
-    return PromptTemplate(template=template, input_variables=["context", "question", "health_info"])
-# Step 8: Create Retrieval QA Chain
-def create_qa_chain(llm, db):
-    return RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=db.as_retriever(search_kwargs={"k": 3}),
-        return_source_documents=False,
-        chain_type_kwargs={"prompt": set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)}
-    )
-# Function to get user health profile via API (placeholder)
-def get_user_health_profile():
-    """
-    This function should make an API call to get the user's health profile.
-    Replace this placeholder with your actual API implementation.
-    """
-    try:
-        # Placeholder - replace with your actual API call
-        return "No health profile available"
-    except Exception as e:
-        print(f"Error fetching health profile: {e}")
-        return "Health profile unavailable"
-# Create and load all models and FAISS (for Gradio)
-def prepare_qa_system():
-    # Load and process PDFs, create FAISS index, etc.
-    print("🔄 Loading PDFs...")
-    documents = load_pdf_files(DATA_PATH)
-    print("📄 Creating Chunks...")
-    text_chunks = create_chunks(documents)
-    print("🧠 Generating Embeddings...")
-    embedding_model = get_embedding_model()
-    print("💾 Storing in FAISS...")
-    db = store_embeddings(text_chunks, embedding_model, DB_FAISS_PATH)
-    print("🔄 Loading FAISS Database...")
-    db = load_faiss_db(DB_FAISS_PATH, embedding_model)
-    print("🤖 Loading Gemini LLM...")
-    llm = load_llm()
-    print("🔗 Creating QA Chain...")
-    qa_chain = create_qa_chain(llm, db)
-    return qa_chain
-# Create the QA system and get the chain ready
-qa_chain = prepare_qa_system()
-# Gradio Interface function
-def ask_question(query: str):
-    try:
-        # Get user's health profile via API
-        health_info = get_user_health_profile()
-        # Prepare inputs for the QA chain
-        qa_inputs = {
-            'query': query,
-            'health_info': health_info
-        }
-        response = qa_chain.invoke(qa_inputs)
-        return response["result"], []
-    except Exception as e:
-        return f"Error: {str(e)}", []

 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_community.vectorstores import FAISS
 # Set Paths
 DATA_PATH = "dataFolder/"
 DB_FAISS_PATH = "/tmp/vectorstore/db_faiss"
 # Step 1: Load PDF Files
+def load_pdf_files(data_path=DATA_PATH):
+    print("🔄 Loading PDFs from:", data_path)
     loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
     documents = loader.load()
+    print(f"✅ Loaded {len(documents)} document(s).")
     return documents
 # Step 2: Create Chunks
 def create_chunks(documents):
+    print("📄 Creating text chunks...")
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     text_chunks = text_splitter.split_documents(documents)
+    print(f"✅ Created {len(text_chunks)} chunk(s).")
     return text_chunks
 # Step 3: Generate Embeddings
 def get_embedding_model():
+    print("🧠 Loading embedding model...")
     CACHE_DIR = "/tmp/models_cache"
     os.makedirs(CACHE_DIR, exist_ok=True)
     embedding_model = HuggingFaceEmbeddings(
         model_name="rishi002/all-MiniLM-L6-v2",
+        cache_folder=CACHE_DIR
     )
+    print("✅ Embedding model loaded.")
     return embedding_model
 # Step 4: Store Embeddings in FAISS
+def store_embeddings(text_chunks, embedding_model, db_path=DB_FAISS_PATH):
+    print("💾 Storing embeddings in FAISS...")
     db = FAISS.from_documents(text_chunks, embedding_model)
     db.save_local(db_path)
+    print(f"✅ FAISS index saved to: {db_path}")
     return db
 # Step 5: Load FAISS Database
+def load_faiss_db(db_path=DB_FAISS_PATH, embedding_model=None):
+    print("📦 Loading FAISS database from:", db_path)
+    db = FAISS.load_local(db_path, embedding_model, allow_dangerous_deserialization=True)
+    print("✅ FAISS database loaded.")
+    return db