Spaces:

aamirhameed
/

xTwin

Running

App Files Community

aamirhameed committed 6 days ago

Commit

0ccee0d

verified ·

1 Parent(s): da51ddf

Update knowledge_engine.py

Browse files

Files changed (1) hide show

knowledge_engine.py +10 -15

knowledge_engine.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
 from langchain.vectorstores import FAISS
-from langchain.embeddings import HuggingFaceInstructEmbeddings
 from langchain.chains import RetrievalQA
 from langchain.llms import HuggingFacePipeline
 from transformers import pipeline
@@ -19,42 +19,37 @@ class KnowledgeManager:
         self._load_knowledge_base()
     def _initialize_llm(self):
-        # Initialize the HuggingFace pipeline for the LLM (FLAN-T5 small)
         local_pipe = pipeline("text2text-generation", model="google/flan-t5-small", max_length=256)
         self.llm = HuggingFacePipeline(pipeline=local_pipe)
     def _initialize_embeddings(self):
-        # Initialize the HuggingFace Instructor embeddings model
-        self.embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
     def _load_knowledge_base(self):
-        # Find all .txt files in the root directory
         txt_files = [f for f in os.listdir(self.root_dir) if f.endswith(".txt")]
         if not txt_files:
             raise FileNotFoundError("No .txt files found in root directory.")
-        # Read all txt files content
         all_texts = []
         for filename in txt_files:
             path = os.path.join(self.root_dir, filename)
             with open(path, "r", encoding="utf-8") as f:
-                content = f.read()
-                all_texts.append(content)
         full_text = "\n\n".join(all_texts)
-        # Split full text into smaller chunks for embedding
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
-        chunks = text_splitter.split_text(full_text)
-        # Create LangChain documents from chunks
-        docs = text_splitter.create_documents(chunks)
-        # Create FAISS vector store from documents and embeddings
         self.docsearch = FAISS.from_documents(docs, self.embeddings)
-        # Create the RetrievalQA chain with the LLM and retriever
         self.qa_chain = RetrievalQA.from_chain_type(
             llm=self.llm,
             chain_type="stuff",

 import os
 from langchain.vectorstores import FAISS
+from langchain.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
 from langchain.llms import HuggingFacePipeline
 from transformers import pipeline
         self._load_knowledge_base()
     def _initialize_llm(self):
+        # Load local text2text model using HuggingFace pipeline (FLAN-T5 small)
         local_pipe = pipeline("text2text-generation", model="google/flan-t5-small", max_length=256)
         self.llm = HuggingFacePipeline(pipeline=local_pipe)
     def _initialize_embeddings(self):
+        # Use general-purpose sentence transformer
+        self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     def _load_knowledge_base(self):
+        # Automatically find all .txt files in the root directory
         txt_files = [f for f in os.listdir(self.root_dir) if f.endswith(".txt")]
         if not txt_files:
             raise FileNotFoundError("No .txt files found in root directory.")
         all_texts = []
         for filename in txt_files:
             path = os.path.join(self.root_dir, filename)
             with open(path, "r", encoding="utf-8") as f:
+                all_texts.append(f.read())
         full_text = "\n\n".join(all_texts)
+        # Split text into chunks for embedding
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+        docs = text_splitter.create_documents([full_text])
+        # Create FAISS vector store
         self.docsearch = FAISS.from_documents(docs, self.embeddings)
+        # Build the QA chain
         self.qa_chain = RetrievalQA.from_chain_type(
             llm=self.llm,
             chain_type="stuff",