Alfaxad committed
Commit 9e33695 · verified · Parent: b812900

Update app.py

Files changed (1): app.py (+60 −50)
app.py CHANGED
@@ -1,23 +1,24 @@
 import os, time, tarfile, shutil, threading, subprocess
 from urllib.request import urlopen
+
 import gradio as gr
 import ollama
 
-# ------------------------------------------------------------
-# Config (switch to Q8_0 if you really want, but Q4_K_S is saner on CPU)
-# ------------------------------------------------------------
+# ----------------------------------------
+# Config
+# ----------------------------------------
 MODEL_ID = os.environ.get(
     "MODEL_ID",
-    "hf.co/Nadhari/gemma-3n-swahili-E2B-it-gguf:Q8_0",
+    "hf.co/Nadhari/gemma-3n-swahili-E2B-it-gguf:Q8_0",  # swap to :Q8_0 if you insist
 )
 os.environ.setdefault("OLLAMA_HOST", "http://127.0.0.1:11434")
 
-ready = threading.Event()
+_ready = threading.Event()
 
-# ------------------------------------------------------------
-# Ollama bootstrap: user-space install + serve + pull model
-# ------------------------------------------------------------
-def ensure_ollama_bin() -> str:
+# ----------------------------------------
+# Ollama bootstrap (user-space install + serve + pull)
+# ----------------------------------------
+def _ensure_ollama_bin() -> str:
     found = shutil.which("ollama")
     if found:
         return found
@@ -45,83 +46,92 @@ def ensure_ollama_bin() -> str:
     os.chmod(bin_path, 0o755)
     return bin_path
 
-def is_server_up() -> bool:
+def _is_server_up() -> bool:
     try:
         ollama.list()
         return True
     except Exception:
         return False
 
-def boot_ollama():
-    bin_path = ensure_ollama_bin()
+def _boot_ollama():
+    bin_path = _ensure_ollama_bin()
     env = os.environ.copy()
     env.setdefault("OLLAMA_MODELS", os.path.abspath("./.ollama_models"))
     env.setdefault("OLLAMA_NUM_PARALLEL", "1")
     env.setdefault("OLLAMA_MAX_LOADED_MODELS", "1")
     os.makedirs(env["OLLAMA_MODELS"], exist_ok=True)
 
-    if not is_server_up():
+    if not _is_server_up():
         subprocess.Popen([bin_path, "serve"], env=env,
                          stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
-    # wait for server
     for _ in range(120):
-        if is_server_up():
+        if _is_server_up():
             break
         time.sleep(1)
 
-    # ensure model present (pull if missing)
+    # Ensure model is present
     try:
         ollama.show(model=MODEL_ID)
     except Exception:
         for _ in ollama.pull(model=MODEL_ID, stream=True):
             pass  # quiet pull
 
-    ready.set()
+    _ready.set()
 
-# start in background; UI will just show spinner until ready
-threading.Thread(target=boot_ollama, daemon=True).start()
+threading.Thread(target=_boot_ollama, daemon=True).start()
 
-# ------------------------------------------------------------
-# Chat handler (original UI style, now powered by Ollama)
-# ------------------------------------------------------------
+# ----------------------------------------
+# Chat (same UX as your original sample)
+# ----------------------------------------
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Block silently until model is ready (first run = download)
-    ready.wait()
+    # Wait silently until model is ready (first run = downloads)
+    _ready.wait()
 
-    # history may be tuples or messages; normalize to OpenAI-style
-    msgs = [{"role": "system", "content": system_message}]
-    if history and isinstance(history[0], (list, tuple)):
+    # Normalize history to OpenAI-style messages
+    messages = [{"role": "system", "content": system_message}]
+    if history and isinstance(history[0], (list, tuple)):  # tuples format
         for u, a in history:
-            if u: msgs.append({"role": "user", "content": u})
-            if a: msgs.append({"role": "assistant", "content": a})
-    else:
-        msgs.extend(history or [])
-    msgs.append({"role": "user", "content": message})
-
-    partial = ""
-    for chunk in ollama.chat(model=MODEL_ID, messages=msgs, stream=True):
-        piece = chunk.get("message", {}).get("content", "") or chunk.get("delta", "")
-        if piece:
-            partial += piece
-            yield partial
-
-# ------------------------------------------------------------
-# Gradio UI — same layout as your original
-# ------------------------------------------------------------
-chatbot = gr.Chatbot(type="messages", label="Chat")  # avoid deprecated tuples
-
+            if u:
+                messages.append({"role": "user", "content": u})
+            if a:
+                messages.append({"role": "assistant", "content": a})
+    else:  # already messages format
+        messages.extend(history or [])
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in ollama.chat(
+        model=MODEL_ID,
+        messages=messages,
+        stream=True,
+        options={
+            "num_predict": int(max_tokens),
+            "temperature": float(temperature),
+            "top_p": float(top_p),
+            "num_ctx": 4096,
+        },
+    ):
+        token = chunk.get("message", {}).get("content", "") or chunk.get("delta", "")
+        if token:
+            response += token
+            yield response
+
+# ----------------------------------------
+# UI (your original ChatInterface vibe)
+# ----------------------------------------
 demo = gr.ChatInterface(
     respond,
-    chatbot=chatbot,
+    type="messages",  # fix deprecation + align formats
     title="Gemma-3n-Swahili demo (E2B-it, GGUF)",
     description="Text-only interface for the Gemma-3n Swahili model. Uliza maswali kwa Kiswahili au Kiingereza!",
     additional_inputs=[
         gr.Textbox(value="Wewe ni msaidizi unaejua na kuongea Kiswahili.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05,label="Top-p"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
 )
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    # Disable SSR on Spaces to avoid the h11 Content-Length crash
+    demo.queue().launch(ssr_mode=False)
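
A note on the quieter fix above: the old code collected the slider values but never forwarded them, so max_tokens, temperature, and top_p now reach the model through ollama.chat's options dict (num_predict is Ollama's name for the new-token cap). A minimal sketch of the same call outside the app, assuming a local `ollama serve` is already up and the model tag has been pulled (both are assumptions; app.py normally performs that bootstrap itself):

import ollama

MODEL_ID = "hf.co/Nadhari/gemma-3n-swahili-E2B-it-gguf:Q8_0"

# Stream a short reply with the same sampling options app.py now forwards.
for chunk in ollama.chat(
    model=MODEL_ID,
    messages=[{"role": "user", "content": "Habari yako?"}],
    stream=True,
    options={"num_predict": 64, "temperature": 0.7, "top_p": 0.95, "num_ctx": 4096},
):
    # Dict-style access mirrors app.py; newer ollama-python clients also
    # expose the same data as chunk.message.content.
    print(chunk.get("message", {}).get("content", ""), end="", flush=True)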
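
The history normalization in respond can likewise be exercised directly, since it accepts both the legacy tuple pairs and the messages-format dicts that `type="messages"` now produces. A hypothetical smoke test, assuming app.py is importable as `app` (the module name is an assumption) and blocking on the first call until the Ollama bootstrap finishes:

from app import respond  # assumes app.py sits on the import path

tuple_history = [("Habari?", "Nzuri, asante!")]
messages_history = [
    {"role": "user", "content": "Habari?"},
    {"role": "assistant", "content": "Nzuri, asante!"},
]

for history in (tuple_history, messages_history):
    final = ""
    # respond is a generator yielding the growing reply; keep the last yield.
    for partial in respond(
        "Niambie kuhusu Tanzania.",
        history,
        "Wewe ni msaidizi unaejua na kuongea Kiswahili.",
        128,   # max_tokens
        0.7,   # temperature
        0.95,  # top_p
    ):
        final = partial
    print(final[:120])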