Spaces:

hackergeek
/

HARRISON_GPT

Sleeping

App Files Files Community

hackergeek committed on Nov 3

Commit

7ac1ec8

verified ·

1 Parent(s): 81619a9

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -33

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ from sentence_transformers import SentenceTransformer
 from inspect import signature
 # =====================================================
-# OPTION A: Use ephemeral /tmp cache to avoid 50 GB quota
 # =====================================================
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_HOME"] = "/tmp/hf_home"
@@ -18,54 +18,64 @@ os.environ["HF_MODULES_CACHE"] = "/tmp/hf_modules"
 # =====================================================
 # 1️⃣ Model setup
 # =====================================================
-GEN_MODEL = "hackergeek/qwen3-harrison-rag"        # private model
 EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-# --- Handle HF authentication ---
 HF_TOKEN = os.getenv("HF_TOKEN")
 if not HF_TOKEN:
-    print("⚠️  No Hugging Face token found. If the model is private, set HF_TOKEN in your environment.")
-    print("   Example: export HF_TOKEN=hf_yourtoken123 or add it in Hugging Face Space secrets.")
-else:
-    print("✅ Hugging Face token detected.")
-# --- Load model/tokenizer safely ---
 try:
-    param_names = signature(AutoModelForCausalLM.from_pretrained).parameters
-    dtype_arg = "dtype" if "dtype" in param_names else "torch_dtype"
     dtype_value = torch.float16 if torch.cuda.is_available() else torch.float32
-    load_kwargs = {
-        dtype_arg: dtype_value,
-        "cache_dir": "/tmp/hf_cache",
-        "device_map": "auto",
-        "low_cpu_mem_usage": True,
-        "token": HF_TOKEN,
-    }
-    tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, token=HF_TOKEN)
-    model = AutoModelForCausalLM.from_pretrained(GEN_MODEL, **load_kwargs)
 except Exception as e:
-    print(f"❌ Failed to load model '{GEN_MODEL}'. Reason: {e}")
-    print("➡️ Falling back to a public model: Qwen/Qwen2.5-1.5B-Instruct")
-    GEN_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
-    tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL)
-    model = AutoModelForCausalLM.from_pretrained(
-        GEN_MODEL,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        device_map="auto",
-        low_cpu_mem_usage=True,
-    )
 embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
 # =====================================================
 # 2️⃣ Retrieval + generation logic
 # =====================================================
-# Placeholder FAISS index and chunks (replace with your actual docs)
-index = faiss.IndexFlatL2(384)  # all-MiniLM-L6-v2 output dimension
 chunks = ["This is a sample context chunk. Replace with real documents."]
 def retrieve_context(query, k=5):
@@ -108,7 +118,7 @@ with gr.Blocks(title="Qwen3-Harrison-RAG Chatbot") as demo:
     clear.click(lambda: None, None, chatbot, queue=False)
 # =====================================================
-# 4️⃣ Launch for Hugging Face Spaces
 # =====================================================
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

 from inspect import signature
 # =====================================================
+# OPTION: Use ephemeral /tmp cache
 # =====================================================
 os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf_cache"
 os.environ["HF_HOME"] = "/tmp/hf_home"
 # =====================================================
 # 1️⃣ Model setup
 # =====================================================
+GEN_MODEL_PRIVATE = "hackergeek/qwen3-harrison-rag"
+GEN_MODEL_PUBLIC = "Qwen/Qwen2.5-1.5B-Instruct"
 EMB_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 HF_TOKEN = os.getenv("HF_TOKEN")
 if not HF_TOKEN:
+    print("⚠️  No Hugging Face token found. Private models may fail to load.")
+# --- Check if accelerate is available ---
 try:
+    import accelerate
+    accelerate_available = True
+except ImportError:
+    accelerate_available = False
+    print("⚠️  `accelerate` not installed. Large private models with device_map='auto' may fail.")
+# --- Helper to load model safely ---
+def load_model(model_name, token=None):
+    """Load the tokenizer and causal-LM weights for ``model_name``.
+
+    Args:
+        model_name: Hub repo id (or local path) passed to ``from_pretrained``.
+        token: Optional Hugging Face access token; forwarded to the model
+            loader only when truthy.
+
+    Returns:
+        A ``(tokenizer, model)`` tuple.
+
+    Raises:
+        RuntimeError: If the tokenizer or the model cannot be loaded. The
+            underlying exception is chained as ``__cause__``.
+    """
     dtype_value = torch.float16 if torch.cuda.is_available() else torch.float32
+    try:
+        # Use whichever dtype keyword this from_pretrained signature exposes.
+        param_names = signature(AutoModelForCausalLM.from_pretrained).parameters
+        dtype_arg = "dtype" if "dtype" in param_names else "torch_dtype"
+        load_kwargs = {
+            dtype_arg: dtype_value,
+            "cache_dir": "/tmp/hf_cache",
+        }
+        if accelerate_available:
+            # Both options depend on `accelerate`; passing either without it
+            # makes from_pretrained fail, which would also break the public
+            # fallback load.
+            load_kwargs["device_map"] = "auto"
+            load_kwargs["low_cpu_mem_usage"] = True
+        if token:
+            load_kwargs["token"] = token
+        tokenizer = AutoTokenizer.from_pretrained(model_name, token=token)
+        model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)
+        return tokenizer, model
+    except Exception as e:
+        # Chain the cause so the original traceback is not lost.
+        raise RuntimeError(f"Failed to load model '{model_name}': {e}") from e
+# --- Attempt to load private model, fallback to public ---
+try:
+    tokenizer, model = load_model(GEN_MODEL_PRIVATE, token=HF_TOKEN)
+    print(f"✅ Loaded private model: {GEN_MODEL_PRIVATE}")
 except Exception as e:
+    print(f"❌ {e}\n➡️ Falling back to public model: {GEN_MODEL_PUBLIC}")
+    tokenizer, model = load_model(GEN_MODEL_PUBLIC)
+    print(f"✅ Loaded public model: {GEN_MODEL_PUBLIC}")
+# --- Load embedding model ---
 embedder = SentenceTransformer(EMB_MODEL, cache_folder="/tmp/hf_cache")
 # =====================================================
 # 2️⃣ Retrieval + generation logic
 # =====================================================
+# Placeholder FAISS index and chunks (replace with your actual documents)
+index = faiss.IndexFlatL2(384)
 chunks = ["This is a sample context chunk. Replace with real documents."]
 def retrieve_context(query, k=5):
     clear.click(lambda: None, None, chatbot, queue=False)
 # =====================================================
+# 4️⃣ Launch
 # =====================================================
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))