Files changed (1)
  1. app.py +13 -3
app.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import zipfile
+import torch  # ✅ Import torch so empty_cache works
 import gradio as gr
 
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
@@ -17,12 +18,18 @@ if not os.path.exists("faiss_index") and os.path.exists("faiss_index.zip"):
 
 # --- Step 2: Load embedding and vectorstore ---
 embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
-vectordb = FAISS.load_local("faiss_index", embedding_model,allow_dangerous_deserialization=True)
+vectordb = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)
 
 # --- Step 3: Load the LLM ---
 model_id = "tiiuae/falcon-7b-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id)
+
+# ✅ Use device_map + float16 to save memory
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.float16
+)
 
 pipe = pipeline(
     "text-generation",
@@ -57,15 +64,18 @@ qa_chain = ConversationalRetrievalChain.from_llm(
 )
 
 UH_LOGO = "images/UH.png"
+
 # --- Step 5: Define chatbot logic ---
 def chat(message, history):
     result = qa_chain.invoke({"question": message})
     response = result.get("answer", "")
     response = response.split("Answer:")[-1].replace("<|assistant|>", "").strip()
+
+    # ✅ Actually clear unused GPU memory
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
+
     return response
-
 
 # --- Step 6: UI ---
 sample_questions = [
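
On the `torch.cuda.empty_cache()` call that the new import finally makes work: it hands cached allocator blocks back to the driver but does not free live tensors, so the per-request gain is modest. One way to observe its effect, as a sketch assuming a CUDA device is present:

```python
import torch

if torch.cuda.is_available():
    before = torch.cuda.memory_reserved()  # bytes held by PyTorch's caching allocator
    torch.cuda.empty_cache()               # return unused cached blocks to the driver
    after = torch.cuda.memory_reserved()
    print(f"released {(before - after) / 1e6:.0f} MB back to the driver")
    # memory_allocated() (live tensors) is unchanged by empty_cache()
```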
 
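
The diff cuts off before the Step 6 body, but a `chat(message, history)` function with this exact signature is what `gr.ChatInterface` expects, so the wiring presumably looks something like the following. This is an illustrative sketch continuing app.py's names (`chat`, `sample_questions`); the title string is hypothetical, not from the PR:

```python
import gradio as gr

demo = gr.ChatInterface(
    fn=chat,                      # chat(message, history) -> str, defined in Step 5
    examples=sample_questions,    # the list Step 6 begins defining
    title="UH Course Assistant",  # hypothetical title, not shown in the diff
)
demo.launch()
```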