Spaces:

hackergeek
/

HARRISON_GPT

Running

App Files Files Community

hackergeek committed on Nov 4

Commit

80f88de

verified ·

1 Parent(s): 3fbdc08

Update app.py

Browse files

Files changed (1) hide show

app.py +74 -144

app.py CHANGED Viewed

@@ -1,14 +1,10 @@
 import os
 import torch
 import gradio as gr
-import faiss
-import numpy as np
 from transformers import AutoTokenizer, AutoModelForCausalLM
-from sentence_transformers import SentenceTransformer
-from inspect import signature
 # =====================================================
-# Cache setup
 # =====================================================
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_HOME"] = "/tmp/hf_home"
@@ -16,174 +12,108 @@ os.environ["HF_DATASETS_CACHE"] = "/tmp/hf_datasets"
 os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
 # =====================================================
-# Model setup
 # =====================================================
-GEN_MODEL_PRIVATE = "hackergeek/qwen3-harrison-rag"
-GEN_MODEL_PUBLIC = "Qwen/Qwen2.5-1.5B-Instruct"
-EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 HF_TOKEN = os.getenv("HF_TOKEN")
-if not HF_TOKEN:
-    print("⚠️ No Hugging Face token found. Private models may fail to load.")
-try:
-    import accelerate
-    accelerate_available = True
-except ImportError:
-    accelerate_available = False
-    print("⚠️ `accelerate` not installed. Large private models with device_map='auto' may fail.")
-# --- Load model helper ---
-def load_model(model_name, token=None):
     dtype_value = torch.float16 if torch.cuda.is_available() else torch.float32
     try:
-        param_names = signature(AutoModelForCausalLM.from_pretrained).parameters
-        dtype_arg = "dtype" if "dtype" in param_names else "torch_dtype"
-        load_kwargs = {
-            dtype_arg: dtype_value,
-            "cache_dir": "/tmp/hf_cache",
-            "low_cpu_mem_usage": True,
-        }
-        if accelerate_available:
-            load_kwargs["device_map"] = "auto"
-        if token:
-            load_kwargs["token"] = token
-        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
-        model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
-        return tokenizer, model
-    except Exception as e:
-        raise RuntimeError(f"Failed to load model '{model_name}': {e}")
-# --- Attempt private model, fallback to public ---
 try:
-    tokenizer, model = load_model(GEN_MODEL_PRIVATE, token=HF_TOKEN)
-    print(f"✅ Loaded private model: {GEN_MODEL_PRIVATE}")
 except Exception as e:
-    print(f"❌ {e}\n➡️ Falling back to public model: {GEN_MODEL_PUBLIC}")
-    tokenizer, model = load_model(GEN_MODEL_PUBLIC)
-    print(f"✅ Loaded public model: {GEN_MODEL_PUBLIC}")
-# --- Load embedding model ---
-embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
-# =====================================================
-# FAISS index setup
-# =====================================================
-# Example medical text; replace with full dataset
-documents = [
-    "Infliximab is a humanized monoclonal antibody used in rheumatoid arthritis. "
-    "It is administered intravenously at 3–5 mg/kg every 6–8 weeks.",
-    "Colitis ulcerum is a chronic inflammatory disorder of the colon characterized by ulcerated erosions.",
-    "COPD is a chronic obstructive pulmonary disease with progressive airflow limitation."
-]
-# Function to split documents into chunks
-def chunk_text(text, chunk_size=150):
-    words = text.split()
-    chunks = []
-    for i in range(0, len(words), chunk_size):
-        chunk = " ".join(words[i:i+chunk_size])
-        chunks.append(chunk)
-    return chunks
-# Create all chunks and embeddings
-chunks = []
-for doc in documents:
-    chunks.extend(chunk_text(doc))
-chunk_embeddings = embedder.encode(chunks, convert_to_numpy=True)
-index = faiss.IndexFlatL2(chunk_embeddings.shape[1])
-index.add(np.array(chunk_embeddings))
 # =====================================================
-# Retrieval + generation
 # =====================================================
-def retrieve_context(query, max_k=5, distance_threshold=0.7, max_tokens=1500):
-    q_emb = embedder.encode([query], convert_to_numpy=True)
-    if index.ntotal == 0:
-        return "No context available."
-    D, I = index.search(q_emb, max_k)
-    sorted_idx = [i for _, i in sorted(zip(D[0], I[0]))]  # deterministic
-    context = []
-    total_tokens = 0
-    for idx in sorted_idx:
-        if D[0][list(sorted_idx).index(idx)] > distance_threshold:
-            continue
-        chunk_tokens = len(tokenizer(chunks[idx])["input_ids"])
-        if total_tokens + chunk_tokens > max_tokens:
-            break
-        context.append(chunks[idx])
-        total_tokens += chunk_tokens
-    return "\n\n".join(context) if context else "No context available."
-def calculate_max_tokens(query, min_tokens=50, max_tokens=800, factor=3):
     query_tokens = len(tokenizer(query)["input_ids"])
     dynamic_tokens = query_tokens * factor
     return min(max(dynamic_tokens, min_tokens), max_tokens)
-def generate_full_answer(query, history, max_loops=3):
-    torch.manual_seed(42)
-    np.random.seed(42)
-    context = retrieve_context(query)
-    prompt = (
-        "You are a helpful assistant. ONLY use the retrieved context to answer questions.\n\n"
-        f"Context:\n{context}\n\n"
-        f"User: {query}\nAssistant:"
     )
-    full_response = ""
-    remaining_prompt = prompt
-    loop_count = 0
-    while loop_count < max_loops:
-        inputs = tokenizer(remaining_prompt, return_tensors="pt").to(model.device)
-        max_new_tokens = calculate_max_tokens(query)
-        output_ids = model.generate(
-            **inputs,
-            max_new_tokens=max_new_tokens,
-            do_sample=False,
-            pad_token_id=tokenizer.eos_token_id,
-            no_repeat_ngram_size=4
-        )
-        partial_answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-        partial_answer = partial_answer.split("Assistant:")[-1].strip()
-        new_content = partial_answer[len(full_response):].strip()
-        if not new_content:
-            break
-        full_response += new_content
-        if full_response.endswith(('.', '!', '?')):
-            break
-        remaining_prompt = full_response
-        loop_count += 1
-    return full_response
-def chat_fn(user_message, history):
-    response = generate_full_answer(user_message, history)
-    history = history + [(user_message, response)]
     return history, history
 # =====================================================
-# Gradio UI
 # =====================================================
 with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
     gr.Markdown("""
-    # 🤖 Qwen3-Harrison-RAG Chatbot
-    Ask me anything — I’ll retrieve relevant context and answer!
     """)
-    chatbot = gr.Chatbot(height=400)
     with gr.Row():
-        msg = gr.Textbox(placeholder="Type your message here...", scale=4)
         clear = gr.Button("Clear", scale=1)
-    msg.submit(chat_fn, [msg, chatbot], [chatbot, chatbot])
     clear.click(lambda: None, None, chatbot, queue=False)
 # =====================================================

 import os
 import torch
 import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 # =====================================================
+# Environment setup
 # =====================================================
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_HOME"] = "/tmp/hf_home"
 os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
 # =====================================================
+# Model configuration
 # =====================================================
+# Repository id of the generation model that is loaded further down.
+GEN_MODEL = "hackergeek/qwen3-harrison-rag"
+# Access token taken from the environment; None when HF_TOKEN is not set.
 HF_TOKEN = os.getenv("HF_TOKEN")
+# A missing token only produces a hint on stdout; nothing is raised here.
+if not HF_TOKEN:
+    print("⚠️ No Hugging Face token found. Set one using:")
+    print("   export HF_TOKEN='your_hf_token_here'")
+# =====================================================
+# Load private RAG model
+# =====================================================
+def load_private_model(model_name, token):
+    """Load the tokenizer and causal-LM weights for ``model_name``.
+
+    ``token`` is forwarded to both ``from_pretrained`` calls. Returns a
+    ``(tokenizer, model)`` tuple. Loading errors are not caught here and
+    propagate to the caller.
+    """
+    # float16 when CUDA is available, float32 otherwise.
     dtype_value = torch.float16 if torch.cuda.is_available() else torch.float32
+    load_kwargs = {
+        "dtype": dtype_value,
+        "cache_dir": "/tmp/hf_cache",
+        "low_cpu_mem_usage": True,
+    }
+    # device_map="auto" is requested only when `accelerate` can be imported;
+    # the import itself is just an availability probe.
     try:
+        import accelerate
+        load_kwargs["device_map"] = "auto"
+    except ImportError:
+        print("⚠️ `accelerate` not installed — using default device placement.")
+    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
+    model = AutoModelForCausalLM.from_pretrained(model_name, token=token, **load_kwargs)
+    return tokenizer, model
 try:
+    tokenizer, model = load_private_model(GEN_MODEL, token=HF_TOKEN)
+    print(f"✅ Loaded private RAG model: {GEN_MODEL}")
 except Exception as e:
+    raise RuntimeError(f"❌ Failed to load {GEN_MODEL}: {e}")
 # =====================================================
+# Dynamic token allocation
 # =====================================================
+def calculate_max_tokens(query, min_tokens=100, max_tokens=600, factor=3):
+    """Dynamically scale output length to input length."""
     query_tokens = len(tokenizer(query)["input_ids"])
     dynamic_tokens = query_tokens * factor
     return min(max(dynamic_tokens, min_tokens), max_tokens)
+# =====================================================
+# RAG-aware generation logic
+# =====================================================
+def generate_answer(query, history):
+    if not query.strip():
+        return history, history
+    # Step 1️⃣: Rephrase user query for optimal retrieval
+    rephrase_prompt = (
+        "You are a retrieval-augmented assistant.\n"
+        "Rephrase the following user query to maximize retrieval accuracy "
+        "by keeping key entities and medical terms intact:\n\n"
+        f"User query: {query}\n\n"
+        "Rephrased query:"
+    )
+    inputs = tokenizer(rephrase_prompt, return_tensors="pt").to(model.device)
+    rephrased_ids = model.generate(**inputs, max_new_tokens=80, do_sample=False)
+    rephrased_query = tokenizer.decode(rephrased_ids[0], skip_special_tokens=True).split("Rephrased query:")[-1].strip()
+    # Step 2️⃣: Main retrieval + generation
+    max_tokens = calculate_max_tokens(rephrased_query)
+    system_prompt = (
+        "You are a retrieval-augmented medical assistant. "
+        "You have access to internal knowledge and context retrieval. "
+        "Always provide clear, complete, and factual medical explanations.\n\n"
+        f"Optimized query for retrieval:\n{rephrased_query}\n\n"
+        "Answer using relevant retrieved context and your reasoning.\n\n"
+        "Assistant:"
     )
+    inputs = tokenizer(system_prompt, return_tensors="pt").to(model.device)
+    output_ids = model.generate(
+        **inputs,
+        max_new_tokens=max_tokens,
+        do_sample=False,
+        pad_token_id=tokenizer.eos_token_id,
+        no_repeat_ngram_size=4,
+        temperature=0.0,  # completely deterministic
+    )
+    output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+    answer = output.split("Assistant:")[-1].strip()
+    history = history + [(query, answer)]
     return history, history
 # =====================================================
+# Gradio interface
 # =====================================================
+# Chat UI: a Markdown header, a chatbot pane, a text input and a Clear button.
 with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
     gr.Markdown("""
+    # 🤖 Qwen3-Harrison-RAG Chatbot
+    Ask me anything — I’ll rephrase your question, retrieve the right context, and answer with complete reasoning.
     """)
+    chatbot = gr.Chatbot(height=420)
     with gr.Row():
+        msg = gr.Textbox(placeholder="Ask a medical or scientific question...", scale=4)
         clear = gr.Button("Clear", scale=1)
+    # Submitting the textbox calls generate_answer(msg, chatbot); both values
+    # it returns are routed to the chatbot component.
+    msg.submit(generate_answer, [msg, chatbot], [chatbot, chatbot])
+    # The Clear button sends the lambda's return value (None) to the chatbot.
     clear.click(lambda: None, None, chatbot, queue=False)
 # =====================================================