aamirhameed committed on
Commit
0ccee0d
·
verified ·
1 Parent(s): da51ddf

Update knowledge_engine.py

Browse files
Files changed (1) hide show
  1. knowledge_engine.py +10 -15
knowledge_engine.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
  from langchain.vectorstores import FAISS
3
- from langchain.embeddings import HuggingFaceInstructEmbeddings
4
  from langchain.chains import RetrievalQA
5
  from langchain.llms import HuggingFacePipeline
6
  from transformers import pipeline
@@ -19,42 +19,37 @@ class KnowledgeManager:
19
  self._load_knowledge_base()
20
 
21
  def _initialize_llm(self):
22
- # Initialize the HuggingFace pipeline for the LLM (FLAN-T5 small)
23
  local_pipe = pipeline("text2text-generation", model="google/flan-t5-small", max_length=256)
24
  self.llm = HuggingFacePipeline(pipeline=local_pipe)
25
 
26
  def _initialize_embeddings(self):
27
- # Initialize the HuggingFace Instructor embeddings model
28
- self.embeddings = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-large")
29
 
30
  def _load_knowledge_base(self):
31
- # Find all .txt files in the root directory
32
  txt_files = [f for f in os.listdir(self.root_dir) if f.endswith(".txt")]
33
 
34
  if not txt_files:
35
  raise FileNotFoundError("No .txt files found in root directory.")
36
 
37
- # Read all txt files content
38
  all_texts = []
39
  for filename in txt_files:
40
  path = os.path.join(self.root_dir, filename)
41
  with open(path, "r", encoding="utf-8") as f:
42
- content = f.read()
43
- all_texts.append(content)
44
 
45
  full_text = "\n\n".join(all_texts)
46
 
47
- # Split full text into smaller chunks for embedding
48
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
49
- chunks = text_splitter.split_text(full_text)
50
 
51
- # Create LangChain documents from chunks
52
- docs = text_splitter.create_documents(chunks)
53
-
54
- # Create FAISS vector store from documents and embeddings
55
  self.docsearch = FAISS.from_documents(docs, self.embeddings)
56
 
57
- # Create the RetrievalQA chain with the LLM and retriever
58
  self.qa_chain = RetrievalQA.from_chain_type(
59
  llm=self.llm,
60
  chain_type="stuff",
 
1
  import os
2
  from langchain.vectorstores import FAISS
3
+ from langchain.embeddings import HuggingFaceEmbeddings
4
  from langchain.chains import RetrievalQA
5
  from langchain.llms import HuggingFacePipeline
6
  from transformers import pipeline
 
19
  self._load_knowledge_base()
20
 
21
  def _initialize_llm(self):
22
+ # Load local text2text model using HuggingFace pipeline (FLAN-T5 small)
23
  local_pipe = pipeline("text2text-generation", model="google/flan-t5-small", max_length=256)
24
  self.llm = HuggingFacePipeline(pipeline=local_pipe)
25
 
26
  def _initialize_embeddings(self):
27
+ # Use general-purpose sentence transformer
28
+ self.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
29
 
30
  def _load_knowledge_base(self):
31
+ # Automatically find all .txt files in the root directory
32
  txt_files = [f for f in os.listdir(self.root_dir) if f.endswith(".txt")]
33
 
34
  if not txt_files:
35
  raise FileNotFoundError("No .txt files found in root directory.")
36
 
 
37
  all_texts = []
38
  for filename in txt_files:
39
  path = os.path.join(self.root_dir, filename)
40
  with open(path, "r", encoding="utf-8") as f:
41
+ all_texts.append(f.read())
 
42
 
43
  full_text = "\n\n".join(all_texts)
44
 
45
+ # Split text into chunks for embedding
46
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
47
+ docs = text_splitter.create_documents([full_text])
48
 
49
+ # Create FAISS vector store
 
 
 
50
  self.docsearch = FAISS.from_documents(docs, self.embeddings)
51
 
52
+ # Build the QA chain
53
  self.qa_chain = RetrievalQA.from_chain_type(
54
  llm=self.llm,
55
  chain_type="stuff",