# HARRISON_GPT / app.py
import os
# ======================================================
# OPTION A: Use ephemeral /tmp cache to avoid 50 GB quota
# ======================================================
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
os.environ["HF_HOME"] = "/tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
# ======================================================
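# NOTE: these variables are set before importing transformers / sentence_transformers
# so that both libraries resolve their cache directories under /tmp.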
import torch
import gradio as gr
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
# ------------------------------------------------------
# 1️⃣ Model setup
# ------------------------------------------------------
GEN_MODEL = "hackergeek/qwen3-harrison-rag" # main generation model
EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2" # embedding model (lightweight)
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    cache_dir="/tmp/hf_cache",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    low_cpu_mem_usage=True,
)
embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
# ------------------------------------------------------
# 2️⃣ Load and index documents
# ------------------------------------------------------
DOCS_DIR = "docs"
os.makedirs(DOCS_DIR, exist_ok=True)
# create a small demo doc if none exists
if not os.listdir(DOCS_DIR):
    with open(os.path.join(DOCS_DIR, "example.txt"), "w") as f:
        f.write(
            "Qwen3-Harrison-RAG combines the Qwen3 language model with retrieval-augmented generation for context-aware responses."
        )
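# Read every file in docs/ into memory; the corpus is assumed to be small
# enough to hold as plain strings.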
docs = []
for fn in os.listdir(DOCS_DIR):
    with open(os.path.join(DOCS_DIR, fn), encoding="utf-8") as f:
        docs.append(f.read())
# simple fixed-size chunking
chunks = []
for doc in docs:
    for i in range(0, len(doc), 500):
        chunks.append(doc[i:i+500])
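# Fixed-size 500-character windows with no overlap; a fact that straddles a
# chunk boundary ends up split across two chunks.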
embs = embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=False)
index = faiss.IndexFlatL2(embs.shape[1])
index.add(embs)
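# IndexFlatL2 performs exact (brute-force) L2 search; the index lives in memory
# and is rebuilt from scratch every time the Space restarts.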
# ------------------------------------------------------
# 3️⃣ Retrieval + generation logic
# ------------------------------------------------------
def retrieve_context(query, k=5):
    q_emb = embedder.encode([query], convert_to_numpy=True)
    D, I = index.search(q_emb, k)
    return "\n\n".join([chunks[i] for i in I[0]])
def generate_response(query, history):
    context = retrieve_context(query)
    system_prompt = (
        "You are a helpful assistant that uses the retrieved context to answer questions.\n\n"
        f"Context:\n{context}\n\n"
        f"User: {query}\nAssistant:"
    )
    inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=300)
    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return output.split("Assistant:")[-1].strip()
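# Gradio calls chat_fn with the textbox value and the current Chatbot value,
# a list of (user, assistant) tuples as built below.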
def chat_fn(user_message, history):
    response = generate_response(user_message, history)
    history = history + [(user_message, response)]
    # return the updated history for the Chatbot and an empty string to clear the textbox
    return history, ""
# ------------------------------------------------------
# 4️⃣ Gradio UI
# ------------------------------------------------------
with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
    gr.Markdown(
        """
        # 🤖 Qwen3-Harrison-RAG Chatbot
        Ask me anything — I’ll retrieve relevant context and answer!
        """
    )
    chatbot = gr.Chatbot(height=400)
    with gr.Row():
        msg = gr.Textbox(placeholder="Type your message here...", scale=4)
        clear = gr.Button("Clear", scale=1)
    msg.submit(chat_fn, [msg, chatbot], [chatbot, msg])
    clear.click(lambda: None, None, chatbot, queue=False)
# ------------------------------------------------------
# 5️⃣ Launch for Hugging Face Spaces
# ------------------------------------------------------
if __name__ == "__main__":
    demo.launch()