Alfaxad committed
Commit 9e33695 · verified · Parent: b812900

Update app.py

Files changed (1): app.py (+60 −50)
app.py CHANGED
@@ -1,23 +1,24 @@
 import os, time, tarfile, shutil, threading, subprocess
 from urllib.request import urlopen
+
 import gradio as gr
 import ollama
 
-# ------------------------------------------------------------
-# Config (switch to Q8_0 if you really want, but Q4_K_S is saner on CPU)
-# ------------------------------------------------------------
+# ----------------------------------------
+# Config
+# ----------------------------------------
 MODEL_ID = os.environ.get(
     "MODEL_ID",
-    "hf.co/Nadhari/gemma-3n-swahili-E2B-it-gguf:Q8_0",
+    "hf.co/Nadhari/gemma-3n-swahili-E2B-it-gguf:Q8_0",  # swap to :Q8_0 if you insist
 )
 os.environ.setdefault("OLLAMA_HOST", "http://127.0.0.1:11434")
 
-ready = threading.Event()
+_ready = threading.Event()
 
-# ------------------------------------------------------------
-# Ollama bootstrap: user-space install + serve + pull model
-# ------------------------------------------------------------
-def ensure_ollama_bin() -> str:
+# ----------------------------------------
+# Ollama bootstrap (user-space install + serve + pull)
+# ----------------------------------------
+def _ensure_ollama_bin() -> str:
     found = shutil.which("ollama")
     if found:
         return found
@@ -45,83 +46,92 @@ def ensure_ollama_bin() -> str:
     os.chmod(bin_path, 0o755)
     return bin_path
 
-def is_server_up() -> bool:
+def _is_server_up() -> bool:
     try:
         ollama.list()
         return True
     except Exception:
         return False
 
-def boot_ollama():
-    bin_path = ensure_ollama_bin()
+def _boot_ollama():
+    bin_path = _ensure_ollama_bin()
     env = os.environ.copy()
     env.setdefault("OLLAMA_MODELS", os.path.abspath("./.ollama_models"))
     env.setdefault("OLLAMA_NUM_PARALLEL", "1")
     env.setdefault("OLLAMA_MAX_LOADED_MODELS", "1")
     os.makedirs(env["OLLAMA_MODELS"], exist_ok=True)
 
-    if not is_server_up():
+    if not _is_server_up():
         subprocess.Popen([bin_path, "serve"], env=env,
                          stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
-    # wait for server
     for _ in range(120):
-        if is_server_up():
+        if _is_server_up():
             break
         time.sleep(1)
 
-    # ensure model present (pull if missing)
+    # Ensure model is present
     try:
         ollama.show(model=MODEL_ID)
     except Exception:
         for _ in ollama.pull(model=MODEL_ID, stream=True):
             pass  # quiet pull
 
-    ready.set()
+    _ready.set()
 
-# start in background; UI will just show spinner until ready
-threading.Thread(target=boot_ollama, daemon=True).start()
+threading.Thread(target=_boot_ollama, daemon=True).start()
 
-# ------------------------------------------------------------
-# Chat handler (original UI style, now powered by Ollama)
-# ------------------------------------------------------------
+# ----------------------------------------
+# Chat (same UX as your original sample)
+# ----------------------------------------
 def respond(message, history, system_message, max_tokens, temperature, top_p):
-    # Block silently until model is ready (first run = download)
-    ready.wait()
+    # Wait silently until model is ready (first run = downloads)
+    _ready.wait()
 
-    # history may be tuples or messages; normalize to OpenAI-style
-    msgs = [{"role": "system", "content": system_message}]
-    if history and isinstance(history[0], (list, tuple)):
+    # Normalize history to OpenAI-style messages
+    messages = [{"role": "system", "content": system_message}]
+    if history and isinstance(history[0], (list, tuple)):  # tuples format
         for u, a in history:
-            if u: msgs.append({"role": "user", "content": u})
-            if a: msgs.append({"role": "assistant", "content": a})
-    else:
-        msgs.extend(history or [])
-    msgs.append({"role": "user", "content": message})
-
-    partial = ""
-    for chunk in ollama.chat(model=MODEL_ID, messages=msgs, stream=True):
-        piece = chunk.get("message", {}).get("content", "") or chunk.get("delta", "")
-        if piece:
-            partial += piece
-            yield partial
-
-# ------------------------------------------------------------
-# Gradio UI — same layout as your original
-# ------------------------------------------------------------
-chatbot = gr.Chatbot(type="messages", label="Chat")  # avoid deprecated tuples
-
+            if u:
+                messages.append({"role": "user", "content": u})
+            if a:
+                messages.append({"role": "assistant", "content": a})
+    else:  # already messages format
+        messages.extend(history or [])
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+    for chunk in ollama.chat(
+        model=MODEL_ID,
+        messages=messages,
+        stream=True,
+        options={
+            "num_predict": int(max_tokens),
+            "temperature": float(temperature),
+            "top_p": float(top_p),
+            "num_ctx": 4096,
+        },
+    ):
+        token = chunk.get("message", {}).get("content", "") or chunk.get("delta", "")
+        if token:
+            response += token
+            yield response
+
+# ----------------------------------------
+# UI (your original ChatInterface vibe)
+# ----------------------------------------
 demo = gr.ChatInterface(
     respond,
-    chatbot=chatbot,
+    type="messages",  # fix deprecation + align formats
     title="Gemma-3n-Swahili demo (E2B-it, GGUF)",
     description="Text-only interface for the Gemma-3n Swahili model. Uliza maswali kwa Kiswahili au Kiingereza!",
     additional_inputs=[
         gr.Textbox(value="Wewe ni msaidizi unaejua na kuongea Kiswahili.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05,label="Top-p"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
 )
 
 if __name__ == "__main__":
-    demo.queue().launch()
+    # Disable SSR on Spaces to avoid the h11 Content-Length crash
+    demo.queue().launch(ssr_mode=False)
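
A note on the quieter fix above: the old code collected the slider values but never forwarded them, so max_tokens, temperature, and top_p now reach the model through ollama.chat's options dict (num_predict is Ollama's name for the new-token cap). A minimal sketch of the same call outside the app, assuming a local `ollama serve` is already up and the model tag has been pulled (both are assumptions; app.py normally performs that bootstrap itself):

import ollama

MODEL_ID = "hf.co/Nadhari/gemma-3n-swahili-E2B-it-gguf:Q8_0"

# Stream a short reply with the same sampling options app.py now forwards.
for chunk in ollama.chat(
    model=MODEL_ID,
    messages=[{"role": "user", "content": "Habari yako?"}],
    stream=True,
    options={"num_predict": 64, "temperature": 0.7, "top_p": 0.95, "num_ctx": 4096},
):
    # Dict-style access mirrors app.py; newer ollama-python clients also
    # expose the same data as chunk.message.content.
    print(chunk.get("message", {}).get("content", ""), end="", flush=True)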
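
The history normalization in respond can likewise be exercised directly, since it accepts both the legacy tuple pairs and the messages-format dicts that `type="messages"` now produces. A hypothetical smoke test, assuming app.py is importable as `app` (the module name is an assumption) and blocking on the first call until the Ollama bootstrap finishes:

from app import respond  # assumes app.py sits on the import path

tuple_history = [("Habari?", "Nzuri, asante!")]
messages_history = [
    {"role": "user", "content": "Habari?"},
    {"role": "assistant", "content": "Nzuri, asante!"},
]

for history in (tuple_history, messages_history):
    final = ""
    # respond is a generator yielding the growing reply; keep the last yield.
    for partial in respond(
        "Niambie kuhusu Tanzania.",
        history,
        "Wewe ni msaidizi unaejua na kuongea Kiswahili.",
        128,   # max_tokens
        0.7,   # temperature
        0.95,  # top_p
    ):
        final = partial
    print(final[:120])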