Spaces:

hackergeek
/

HARRISON_GPT

Running

App Files Files Community

hackergeek committed on Nov 4

Commit

fb8bdfe

verified ·

1 Parent(s): 3a080d9

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -30

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from sentence_transformers import SentenceTransformer
 from inspect import signature
 # =====================================================
-# OPTION: Use ephemeral /tmp cache
 # =====================================================
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_HOME"] = "/tmp/hf_home"
@@ -16,7 +16,7 @@ os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
 os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
 # =====================================================
-# 1️⃣ Model setup
 # =====================================================
 GEN_MODEL_PRIVATE = "hackergeek/qwen3-harrison-rag"
 GEN_MODEL_PUBLIC = "Qwen/Qwen2.5-1.5B-Instruct"
@@ -33,7 +33,7 @@ except ImportError:
     accelerate_available = False
     print("⚠️ `accelerate` not installed. Large private models with device_map='auto' may fail.")
-# --- Helper to load model safely ---
 def load_model(model_name, token=None):
     dtype_value = torch.float16 if torch.cuda.is_available() else torch.float32
     try:
@@ -45,10 +45,8 @@ def load_model(model_name, token=None):
             "cache_dir": "/tmp/hf_cache",
             "low_cpu_mem_usage": True,
         }
         if accelerate_available:
             load_kwargs["device_map"] = "auto"
         if token:
             load_kwargs["token"] = token
@@ -58,7 +56,7 @@ def load_model(model_name, token=None):
     except Exception as e:
         raise RuntimeError(f"Failed to load model '{model_name}': {e}")
-# --- Attempt to load private model, fallback to public ---
 try:
     tokenizer, model = load_model(GEN_MODEL_PRIVATE, token=HF_TOKEN)
     print(f"✅ Loaded private model: {GEN_MODEL_PRIVATE}")
@@ -71,57 +69,81 @@ except Exception as e:
 embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
 # =====================================================
-# 2️⃣ Retrieval + generation logic (deterministic)
 # =====================================================
 index = faiss.IndexFlatL2(384)
 chunks = ["This is a sample context chunk. Replace with real documents."]
-def retrieve_context(query, max_k=5):
     q_emb = embedder.encode([query], convert_to_numpy=True)
     if index.ntotal == 0:
         return "No context available."
     D, I = index.search(q_emb, max_k)
-    # Sort by distance ascending to make retrieval deterministic
-    sorted_idx = [i for _, i in sorted(zip(D[0], I[0]))]
-    return "\n\n".join([chunks[i] for i in sorted_idx])
-def calculate_max_tokens(query, min_tokens=50, max_tokens=600, factor=3):
     query_tokens = len(tokenizer(query)["input_ids"])
     dynamic_tokens = query_tokens * factor
     return min(max(dynamic_tokens, min_tokens), max_tokens)
-def generate_response(query, history):
-    # Set fixed seeds for reproducibility
     torch.manual_seed(42)
     np.random.seed(42)
     context = retrieve_context(query)
-    system_prompt = (
         "You are a helpful assistant that uses the retrieved context to answer questions.\n\n"
         f"Context:\n{context}\n\n"
         f"User: {query}\nAssistant:"
     )
-    inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)
-    max_new_tokens = calculate_max_tokens(query)
-    # Deterministic generation: do_sample=False
-    output_ids = model.generate(
-        **inputs,
-        max_new_tokens=max_new_tokens,
-        do_sample=False,             # deterministic
-        pad_token_id=tokenizer.eos_token_id
-    )
-    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-    return output.split("Assistant:")[-1].strip()
 def chat_fn(user_message, history):
-    response = generate_response(user_message, history)
     history = history + [(user_message, response)]
     return history, history
 # =====================================================
-# 3️⃣ Gradio UI
 # =====================================================
 with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
     gr.Markdown("""
@@ -136,7 +158,7 @@ with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
     clear.click(lambda: None, None, chatbot, queue=False)
 # =====================================================
-# 4️⃣ Launch
 # =====================================================
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

 from inspect import signature
 # =====================================================
+# Cache setup
 # =====================================================
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_HOME"] = "/tmp/hf_home"
 os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
 # =====================================================
+# Model setup
 # =====================================================
 GEN_MODEL_PRIVATE = "hackergeek/qwen3-harrison-rag"
 GEN_MODEL_PUBLIC = "Qwen/Qwen2.5-1.5B-Instruct"
     accelerate_available = False
     print("⚠️ `accelerate` not installed. Large private models with device_map='auto' may fail.")
+# --- Load model helper ---
 def load_model(model_name, token=None):
     dtype_value = torch.float16 if torch.cuda.is_available() else torch.float32
     try:
             "cache_dir": "/tmp/hf_cache",
             "low_cpu_mem_usage": True,
         }
         if accelerate_available:
             load_kwargs["device_map"] = "auto"
         if token:
             load_kwargs["token"] = token
     except Exception as e:
         raise RuntimeError(f"Failed to load model '{model_name}': {e}")
+# --- Attempt private model, fallback to public ---
 try:
     tokenizer, model = load_model(GEN_MODEL_PRIVATE, token=HF_TOKEN)
     print(f"✅ Loaded private model: {GEN_MODEL_PRIVATE}")
 embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
 # =====================================================
+# Retrieval + generation logic
 # =====================================================
 index = faiss.IndexFlatL2(384)
 chunks = ["This is a sample context chunk. Replace with real documents."]
def retrieve_context(query, max_k=5, distance_threshold=0.5, max_tokens=1500):
    """Return the retrieved chunks most relevant to *query* as one string.

    The query is embedded with the module-level ``embedder`` and searched in
    the module-level FAISS ``index``.  Hits are visited nearest-first and
    collected until a hit is farther than ``distance_threshold`` or adding the
    next chunk would push the running token count (measured with the
    module-level ``tokenizer``) past ``max_tokens``.

    Args:
        query: User question to embed and search for.
        max_k: Maximum number of neighbours requested from the index.
        distance_threshold: Hits with a larger distance are not included.
        max_tokens: Token budget for the joined context.

    Returns:
        The selected chunks joined by blank lines.  When no hit passes the
        filters, the single nearest chunk is returned as a fallback (even if
        it is beyond the threshold or the budget).  When the index is empty,
        or none of the returned ids maps to an entry of ``chunks``, the
        string ``"No context available."`` is returned.
    """
    q_emb = embedder.encode([query], convert_to_numpy=True)
    if index.ntotal == 0:
        return "No context available."
    D, I = index.search(q_emb, max_k)

    # Keep every distance paired with its own id instead of looking it up
    # again by position.  Ids outside ``chunks`` are dropped: FAISS pads the
    # result with -1 when fewer than ``max_k`` vectors exist, and a negative
    # id would otherwise silently select ``chunks[-1]``.
    hits = sorted(
        (float(dist), int(idx))
        for dist, idx in zip(D[0], I[0])
        if 0 <= idx < len(chunks)
    )
    if not hits:
        return "No context available."

    context = []
    total_tokens = 0
    for dist, idx in hits:
        if dist > distance_threshold:
            # Hits are sorted ascending, so every remaining one is farther.
            break
        chunk_tokens = len(tokenizer(chunks[idx])["input_ids"])
        if total_tokens + chunk_tokens > max_tokens:
            break
        context.append(chunks[idx])
        total_tokens += chunk_tokens

    if context:
        return "\n\n".join(context)
    # Nothing qualified: fall back to the nearest chunk so the prompt is
    # never left without any context.
    return chunks[hits[0][1]]
def calculate_max_tokens(query, min_tokens=50, max_tokens=800, factor=3):
    """Derive a generation budget from the length of *query*.

    The query is tokenized with the module-level ``tokenizer``; its token
    count is multiplied by ``factor`` and the product is clamped so that it
    is at least ``min_tokens`` and at most ``max_tokens``.

    Args:
        query: Text whose token count drives the budget.
        min_tokens: Lower bound of the budget.
        max_tokens: Upper bound of the budget (wins if the bounds conflict).
        factor: Multiplier applied to the query's token count.

    Returns:
        The clamped token budget.
    """
    budget = factor * len(tokenizer(query)["input_ids"])
    # Raise to the floor first, then cap, so the ceiling has the last word.
    if budget < min_tokens:
        budget = min_tokens
    if budget > max_tokens:
        budget = max_tokens
    return budget
def generate_full_answer(query, history, max_rounds=4):
    """Generate an answer to *query*, continuing if it was cut off mid-sentence.

    A prompt is built from the retrieved context and the query, and the
    module-level ``model`` is run greedily (``do_sample=False``).  If the
    token budget ran out before the text reached sentence-ending punctuation,
    generation is resumed from the original prompt plus the text so far.

    Args:
        query: The user's question.
        history: Earlier chat turns.  Accepted for interface compatibility;
            it is not used when building the prompt.
        max_rounds: Upper bound on the number of ``generate`` calls, so a
            reply that never ends in ``.``, ``!`` or ``?`` cannot loop
            forever.  Values below 1 are treated as 1.

    Returns:
        The generated answer with surrounding whitespace removed.
    """
    torch.manual_seed(42)
    np.random.seed(42)
    context = retrieve_context(query)
    prompt = (
        "You are a helpful assistant that uses the retrieved context to answer questions.\n\n"
        f"Context:\n{context}\n\n"
        f"User: {query}\nAssistant:"
    )
    # The budget depends only on the query, so compute it once.
    max_new_tokens = calculate_max_tokens(query)
    eos_id = tokenizer.eos_token_id

    full_response = ""
    for _ in range(max(1, max_rounds)):
        # Always keep the original prompt in front of the partial answer so
        # the model still sees the context and the question when continuing.
        inputs = tokenizer(prompt + full_response, return_tensors="pt").to(model.device)
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # deterministic
            pad_token_id=eos_id,
        )
        # The returned sequence starts with the prompt tokens; decode only
        # what was newly generated so earlier text is never appended twice
        # and an "Assistant:" inside the context or answer cannot confuse us.
        prompt_len = inputs["input_ids"].shape[1]
        new_ids = output_ids[0][prompt_len:]
        piece = tokenizer.decode(new_ids, skip_special_tokens=True)
        full_response += piece

        n_new = new_ids.shape[0]
        # Fewer tokens than the budget means generation stopped by itself.
        stopped_early = n_new < max_new_tokens
        ended_on_eos = n_new > 0 and new_ids[-1].item() == eos_id
        if stopped_early or ended_on_eos or not piece.strip():
            break
        # Budget exhausted: stop only if the text already ends a sentence.
        if full_response.rstrip().endswith(('.', '!', '?')):
            break
    return full_response.strip()
 def chat_fn(user_message, history):
+    response = generate_full_answer(user_message, history)
     history = history + [(user_message, response)]
     return history, history
 # =====================================================
+# Gradio UI
 # =====================================================
 with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
     gr.Markdown("""
     clear.click(lambda: None, None, chatbot, queue=False)
 # =====================================================
+# Launch
 # =====================================================
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))