Update app.py #4
by seanerons - opened

app.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import zipfile
+import torch  # ✅ Import torch so empty_cache works
 import gradio as gr
 
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
@@ -17,12 +18,18 @@ if not os.path.exists("faiss_index") and os.path.exists("faiss_index.zip"):
 
 # --- Step 2: Load embedding and vectorstore ---
 embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
-vectordb = FAISS.load_local("faiss_index", embedding_model,allow_dangerous_deserialization=True)
+vectordb = FAISS.load_local("faiss_index", embedding_model, allow_dangerous_deserialization=True)
 
 # --- Step 3: Load the LLM ---
 model_id = "tiiuae/falcon-7b-instruct"
 tokenizer = AutoTokenizer.from_pretrained(model_id)
-
+
+# ✅ Use device_map + float16 to save memory
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.float16
+)
 
 pipe = pipeline(
     "text-generation",
@@ -57,15 +64,18 @@ qa_chain = ConversationalRetrievalChain.from_llm(
 )
 
 UH_LOGO = "images/UH.png"
+
 # --- Step 5: Define chatbot logic ---
 def chat(message, history):
     result = qa_chain.invoke({"question": message})
     response = result.get("answer", "")
     response = response.split("Answer:")[-1].replace("<|assistant|>", "").strip()
+
+    # ✅ Actually clear unused GPU memory
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
+
    return response
-
 
 # --- Step 6: UI ---
 sample_questions = [
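The hunks above skip the Step 1 unzip block and the Step 4 chain setup, so the wiring between the Falcon pipeline, the FAISS retriever, and qa_chain is not visible in this diff. Below is a minimal sketch of how those pieces are commonly connected with LangChain and Gradio; the HuggingFacePipeline wrapper, the ConversationBufferMemory settings, and the gr.ChatInterface hookup are assumptions for illustration, not code taken from this Space.

# Sketch only: assumes pipe, vectordb, chat, and sample_questions from the diff above.
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFacePipeline
import gradio as gr

# Wrap the Transformers pipeline so LangChain can call it as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

# Keep chat history inside the chain, which matches the single-argument
# qa_chain.invoke({"question": message}) call in chat().
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=vectordb.as_retriever(),
    memory=memory,
)

# Step 6 presumably feeds chat() and the sample questions into a Gradio chat UI.
demo = gr.ChatInterface(fn=chat, examples=sample_questions)
demo.launch()

Note that the added device_map="auto" load path requires the accelerate package to be available in the Space, and loading falcon-7b-instruct in float16 still needs roughly 14 GB of memory for the weights.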