AbstractPhil committed · Commit 2e87c77 · 1 Parent(s): f10571d

claude helping instead of gpt 5 now

Files changed (1):
  1. app.py  +241 -168

app.py CHANGED
--- app.py (old side of the diff; removed lines are marked "-")
@@ -1,6 +1,7 @@
  """
  Mirel Harmony Inference – HF Space (Gradio)
  ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
  Single file: app.py
  """
  from __future__ import annotations
@@ -10,6 +11,7 @@ from typing import List, Dict, Optional, Any
  import gradio as gr
  import spaces # required for ZeroGPU
  from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

  # -----------------------
  # Config & runtime modes
@@ -21,46 +23,47 @@ ADAPTER_ID = os.getenv("ADAPTER_ID") or None
  ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
  ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
  DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
- SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant..")
- MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "256"))
  ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
  LOAD_4BIT = os.getenv("LOAD_4BIT", "0") == "1"

- # Optional: HF auth for private/private repos (Spaces Secrets friendly)
- HF_TOKEN: Optional[str] = None
-
- #def _hf_login() -> None:
- # """Login to HF Hub using common env secret names.
- # Works on Spaces with a single secret set. No CUDA touched here.
- # """
- # global HF_TOKEN
- # HF_TOKEN = (
- # os.getenv("HF_TOKEN")
- # or os.getenv("HUGGING_FACE_HUB_TOKEN")
- # or os.getenv("HUGGINGFACEHUB_API_TOKEN")
- # )
- # if HF_TOKEN:
- # try:
- # from huggingface_hub import login, whoami
- # login(token=HF_TOKEN, add_to_git_credential=True)
- # try:
- # who = whoami(token=HF_TOKEN)
- # print(f"[hf] logged in as: {who.get('name') or who.get('email') or who.get('id')}")
- # except Exception:
- # pass
- # except Exception as e:
- # print(f"[hf] login failed: {e}")
- # else:
- # print("[hf] no token found; accessing only public repos")
- #
- #_hf_login()

  os.environ["TOKENIZERS_PARALLELISM"] = "false"
- # Is HF OAuth configured for this Space? (set automatically when README has `hf_oauth: true`)
- OAUTH_READY = bool(os.getenv("OAUTH_CLIENT_ID"))

  # Tokenizer is lightweight; load once (pass token for private models)
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)

  # -----------------------
  # Lazy model loader (ZeroGPU-friendly)
@@ -82,6 +85,7 @@ def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
  attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
  trust_remote_code=True,
  low_cpu_mem_usage=True,
  )
  # Only enable 4-bit when not explicitly CPU-bound
  if LOAD_4BIT and device_map != "cpu":
@@ -96,25 +100,30 @@ def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:


  def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=HF_TOKEN, **_build_model_kwargs(device_map))
  if ADAPTER_ID:
  if not _HAS_PEFT:
  raise RuntimeError("peft is required when ADAPTER_ID is set.")
- peft_kwargs: Dict[str, Any] = {}
  if ADAPTER_SUBFOLDER:
  peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
- model = PeftModel.from_pretrained(model, ADAPTER_ID, is_trainable=False, token=HF_TOKEN, **peft_kwargs)
- model.eval(); model.config.use_cache = True
  return model

  # -----------------------
- # Harmony formatting
  # -----------------------

  def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
  """
  Strict Harmony: rely on the tokenizer's official chat template.
- If the template is missing, raise clearly so the Space uses a Harmony-enabled checkpoint.
  """
  tmpl = getattr(tokenizer, "chat_template", None)
  if not tmpl:
@@ -123,13 +132,38 @@ def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
  )
  return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

  # -----------------------
  # Optional Rose guidance (logits bias)
- # ----------------------- (logits bias)
  # -----------------------

  def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
- """Create vocab bias from {token: weight}. Unknown tokens ignored. Positive promotes, negative demotes."""
  vocab_size = len(tokenizer)
  bias = torch.zeros(vocab_size, dtype=torch.float32)
  for tok, w in mapping.items():
@@ -140,7 +174,7 @@ def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor
  for t in tid:
  if isinstance(t, int) and t >= 0:
  bias[t] += float(w) / max(1, len(tid))
- elif isinstance(tid, int) and t >= 0:
  bias[tid] += float(w)
  return bias

@@ -149,83 +183,99 @@ class RoseGuidedLogits(torch.nn.Module):
  super().__init__()
  self.bias_vec = bias_vec
  self.alpha = float(alpha)
  def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
  return scores + self.alpha * self.bias_vec.to(scores.device)

- @spaces.GPU
  def zerogpu_generate(full_prompt: str,
  gen_kwargs: Dict[str, Any],
  rose_map: Optional[Dict[str, float]],
  rose_alpha: float,
  rose_score: Optional[float],
- seed: Optional[int]) -> str:
- """Run **entire** inference on GPU (ZeroGPU-safe). No CUDA touches in main process."""
- if seed is not None:
- torch.manual_seed(int(seed))
-
- # Load base + adapter directly on GPU inside the GPU context
- model = _load_model_on("auto")
  try:
  logits_processor = None
  if rose_map:
  bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
  eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
  logits_processor = [RoseGuidedLogits(bias, eff_alpha)]

  inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
- out_ids = model.generate(
- **inputs,
- do_sample=bool(gen_kwargs.get("do_sample", True)),
- temperature=float(gen_kwargs.get("temperature", 0.7)),
- top_p=float(gen_kwargs.get("top_p", 0.9)),
- top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
- max_new_tokens=int(gen_kwargs.get("max_new_tokens", 512)),
- pad_token_id=tokenizer.eos_token_id,
- eos_token_id=tokenizer.eos_token_id,
- logits_processor=logits_processor,
- )
- # Decode only the generated tail (exclude prompt) and extract the `final` channel
- prompt_len = int(inputs["input_ids"].shape[1])
- gen_ids = out_ids[0][prompt_len:]
- decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
- fb, ret, end = "<|channel|>final<|message|>", "<|return|>", "<|end|>"
- idx = decoded.rfind(fb)
- if idx != -1:
- s = decoded[idx + len(fb):]
- stop = s.find(ret)
- if stop == -1:
- stop = s.find(end)
- if stop != -1:
- s = s[:stop]
- text = s.strip()
  else:
- text = decoded.strip()
- return text
  finally:
- # Ensure no GPU state leaks back to the main process
  try:
  del model
- except Exception:
  pass
  gc.collect()
- try:
- if torch.cuda.is_available():
- torch.cuda.empty_cache()
- except Exception:
- pass

  # -----------------------
- # Gradio handlers and UI
  # -----------------------
- @dataclass
- class GenCfg:
- temperature: float
- top_p: float
- top_k: Optional[int]
- max_new_tokens: int
- do_sample: bool
- seed: Optional[int]
-

  def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, str]]:
  msgs: List[Dict[str, str]] = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
@@ -238,28 +288,30 @@ def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, s
  if isinstance(item, (list, tuple)) and len(item) == 2:
  u, a = item
  if u is not None:
- msgs.append({"role": "user", "content": u})
  if a:
- msgs.append({"role": "assistant", "content": a})
  return msgs

-
- def generate_stream(message: Any, history: List[Any], system_prompt: str,
  temperature: float, top_p: float, top_k: int, max_new_tokens: int,
  do_sample: bool, seed: Optional[int],
- rose_enable: bool, rose_alpha: float, rose_score: Optional[float], rose_tokens: str, rose_json: str):
- """ZeroGPU generator (non-streaming): do all CUDA work inside `zerogpu_generate` and
- return a single string. This avoids h11 Content-Length issues on exceptions mid-stream.
  """
  try:
  # Normalize message and build Harmony prompt
  if isinstance(message, dict):
  message = message.get("content", "")
  msgs = chat_to_messages(history, system_prompt)
  msgs.append({"role": "user", "content": str(message)})
  prompt = to_harmony_prompt(msgs)

- # Rose map
  rose_map: Optional[Dict[str, float]] = None
  if rose_enable:
  rose_map = {}
@@ -270,7 +322,7 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
  k, v = p.split(":", 1)
  try:
  rose_map[k.strip()] = float(v)
- except Exception:
  pass
  if rose_json:
  try:
@@ -279,55 +331,40 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
  for k, v in j.items():
  try:
  rose_map[str(k)] = float(v)
- except Exception:
  pass
- except Exception:
  pass
  if not rose_map:
  rose_map = None

- # Always use the GPU entrypoint; return once
- text = zerogpu_generate(
  prompt,
  {
  "do_sample": bool(do_sample),
  "temperature": float(temperature),
  "top_p": float(top_p),
- "top_k": (int(top_k) if int(top_k) > 0 else None),
  "max_new_tokens": int(max_new_tokens),
  },
  rose_map,
  float(rose_alpha),
  float(rose_score) if rose_score is not None else None,
  int(seed) if seed is not None else None,
  )
- return text
  except Exception as e:
- # Return error as plain text (no streaming) to avoid Content-Length mismatches
- return f"[error] {type(e).__name__}: {e}"
-
- # -----------------------
- # Helper: login status banner (HF OAuth)
- # -----------------------
-
- #def _login_status(profile: gr.OAuthProfile | None) -> str:
- # """Show whether the visitor is logged in to Hugging Face.
- # This affects ZeroGPU quotas (logged-in users get their own token/quota).
- # Requires the Space to have `hf_oauth: true` in README metadata.
- # """
- # # If OAuth isn't configured on the Space, inform clearly
- # if not os.getenv("OAUTH_CLIENT_ID"):
- # return (
- # "ℹ️ OAuth is not configured on this Space. Add `hf_oauth: true` to README metadata "
- # "so users can sign in and ZeroGPU can use their account quota."
- # )
- # if profile is None:
- # return (
- # "🔒 Not signed in to Hugging Face — ZeroGPU will count as anonymous (lower quota). "
- # "Click **Sign in with HF** above."
- # )
- # name = getattr(profile, "name", None) or getattr(profile, "preferred_username", None) or getattr(profile, "id", "user")
- # return f"🔓 Signed in as **{name}** — ZeroGPU will use your account quota."

  # -----------------------
  # UI
@@ -335,53 +372,89 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
  gr.Markdown(
  """
- # Mirel – Harmony Inference (ZeroGPU-ready)
- OSS-20B + optional Rose-SFT adapter. Harmony chat template is applied automatically.
- """
- )
-
- # Sign-in note
- login_status = gr.Markdown(
- "If you're logged into huggingface.co in this browser, ZeroGPU will use *your* quota automatically."
  )

  with gr.Row():
- system_prompt = gr.Textbox(label="System", value=SYSTEM_DEF)
- with gr.Accordion("Generation settings", open=False):
  with gr.Row():
- temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="temperature")
- top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="top_p")
- top_k = gr.Slider(0, 200, value=0, step=1, label="top_k (0=off)")
- max_new = gr.Slider(16, 2048, value=MAX_DEF, step=8, label="max_new_tokens")
- do_sample = gr.Checkbox(value=True, label="do_sample")
- seed = gr.Number(value=None, label="seed (optional)")
- with gr.Accordion("Rose guidance (optional)", open=False):
  with gr.Row():
- rose_enable = gr.Checkbox(value=False, label="Enable Rose bias at decode")
- rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="rose alpha (strength)")
- rose_score = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="rose score (0–1)")
- rose_tokens = gr.Textbox(label="token:weight list (comma-separated)", value="")
- rose_json = gr.Textbox(label="JSON {token: weight}", value="")

  chat = gr.ChatInterface(
- fn=generate_stream,
  type="messages",
- additional_inputs=[system_prompt, temperature, top_p, top_k, max_new, do_sample, seed, rose_enable, rose_alpha, rose_score, rose_tokens, rose_json],
- title="Mirel",
  cache_examples=False,
  )

-
-
  gr.Markdown(
  """
- **Notes**
- - Set env `ZEROGPU=1` for just-in-time GPU allocation via @spaces.GPU.
- - Set `ADAPTER_ID=AbstractPhil/mirel-gpt-oss-20b` and `ADAPTER_SUBFOLDER=checkpoints/checkpoint-516` to use the provided adapter.
- - Use `torch==2.4.0` for ZeroGPU.
- - Rose guidance biases logits; it does not change weights.
- """
  )

  if __name__ == "__main__":
- demo.queue(max_size=8 if ZEROGPU else 32).launch(server_name="0.0.0.0", server_port=7860)
+++ app.py (new side of the diff; added lines are marked "+")
@@ -1,6 +1,7 @@
  """
  Mirel Harmony Inference – HF Space (Gradio)
  ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
+ Chain-of-thought model with proper channel extraction
  Single file: app.py
  """
  from __future__ import annotations
@@ -10,6 +11,7 @@ from typing import List, Dict, Optional, Any
  import gradio as gr
  import spaces # required for ZeroGPU
  from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+ from threading import Thread

  # -----------------------
  # Config & runtime modes
@@ -21,46 +23,47 @@ ADAPTER_ID = os.getenv("ADAPTER_ID") or None
  ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
  ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
  DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
+ SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
+ MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "512"))
  ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
  LOAD_4BIT = os.getenv("LOAD_4BIT", "0") == "1"

+ # HF Auth - properly handle multiple token env var names
+ HF_TOKEN: Optional[str] = (
+ os.getenv("HF_TOKEN")
+ or os.getenv("HUGGING_FACE_HUB_TOKEN")
+ or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+ or os.getenv("HF_ACCESS_TOKEN")
+ )
+
+ def _hf_login() -> None:
+ """Login to HF Hub using common env secret names."""
+ if HF_TOKEN:
+ try:
+ from huggingface_hub import login, whoami
+ login(token=HF_TOKEN, add_to_git_credential=True)
+ try:
+ who = whoami(token=HF_TOKEN)
+ print(f"[HF Auth] Logged in as: {who.get('name') or who.get('fullname') or who.get('id', 'unknown')}")
+ except Exception:
+ print("[HF Auth] Login successful but couldn't get user info")
+ except Exception as e:
+ print(f"[HF Auth] Login failed: {e}")
+ else:
+ print("[HF Auth] No token found in environment variables (HF_TOKEN, HUGGING_FACE_HUB_TOKEN, etc.)")
+
+ # Login before loading any models
+ _hf_login()

  os.environ["TOKENIZERS_PARALLELISM"] = "false"

  # Tokenizer is lightweight; load once (pass token for private models)
+ try:
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
+ print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
+ except Exception as e:
+ print(f"[Model] Failed to load tokenizer: {e}")
+ raise

  # -----------------------
  # Lazy model loader (ZeroGPU-friendly)
@@ -82,6 +85,7 @@ def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
  attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
  trust_remote_code=True,
  low_cpu_mem_usage=True,
+ token=HF_TOKEN, # Add token here for private model access
  )
  # Only enable 4-bit when not explicitly CPU-bound
  if LOAD_4BIT and device_map != "cpu":
@@ -96,25 +100,30 @@ def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:


  def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
+ print(f"[Model] Loading base model from {MODEL_ID}...")
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_build_model_kwargs(device_map))
+
  if ADAPTER_ID:
  if not _HAS_PEFT:
  raise RuntimeError("peft is required when ADAPTER_ID is set.")
+ print(f"[Model] Loading adapter from {ADAPTER_ID}...")
+ peft_kwargs: Dict[str, Any] = {"token": HF_TOKEN}
  if ADAPTER_SUBFOLDER:
  peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
+ model = PeftModel.from_pretrained(model, ADAPTER_ID, is_trainable=False, **peft_kwargs)
+
+ model.eval()
+ model.config.use_cache = True
+ print("[Model] Model loaded successfully")
  return model

  # -----------------------
+ # Harmony formatting & CoT extraction
  # -----------------------

  def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
  """
  Strict Harmony: rely on the tokenizer's official chat template.
  """
  tmpl = getattr(tokenizer, "chat_template", None)
  if not tmpl:
@@ -123,13 +132,38 @@ def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
  )
  return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

+ def extract_final_channel(text: str) -> str:
+ """
+ Extract the final channel from chain-of-thought output.
+ The model outputs thinking in internal channels and final response in final channel.
+ """
+ # Look for the final channel marker
+ final_marker = "<|channel|>final<|message|>"
+
+ if final_marker in text:
+ # Extract everything after the final channel marker
+ parts = text.split(final_marker)
+ if len(parts) > 1:
+ final_text = parts[-1]
+
+ # Clean up end markers
+ end_markers = ["<|return|>", "<|end|>", "<|endoftext|>"]
+ for marker in end_markers:
+ if marker in final_text:
+ final_text = final_text.split(marker)[0]
+
+ return final_text.strip()
+
+ # If no channel markers found, return the cleaned text
+ # (might be a non-CoT response or error)
+ return text.strip()
+
  # -----------------------
  # Optional Rose guidance (logits bias)
  # -----------------------

  def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
+ """Create vocab bias from {token: weight}. Unknown tokens ignored."""
  vocab_size = len(tokenizer)
  bias = torch.zeros(vocab_size, dtype=torch.float32)
  for tok, w in mapping.items():
@@ -140,7 +174,7 @@ def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor
  for t in tid:
  if isinstance(t, int) and t >= 0:
  bias[t] += float(w) / max(1, len(tid))
+ elif isinstance(tid, int) and tid >= 0:
  bias[tid] += float(w)
  return bias

@@ -149,83 +183,99 @@ class RoseGuidedLogits(torch.nn.Module):
  super().__init__()
  self.bias_vec = bias_vec
  self.alpha = float(alpha)
+
  def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
  return scores + self.alpha * self.bias_vec.to(scores.device)

+ @spaces.GPU(duration=120) # Give enough time for longer CoT generations
  def zerogpu_generate(full_prompt: str,
  gen_kwargs: Dict[str, Any],
  rose_map: Optional[Dict[str, float]],
  rose_alpha: float,
  rose_score: Optional[float],
+ seed: Optional[int],
+ stream: bool = False) -> str:
+ """Run inference on GPU (ZeroGPU-safe)."""
  try:
+ if seed is not None:
+ torch.manual_seed(int(seed))
+
+ # Load model
+ model = _load_model_on("auto")
+
+ # Setup logits processor for Rose guidance
  logits_processor = None
  if rose_map:
  bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
  eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
  logits_processor = [RoseGuidedLogits(bias, eff_alpha)]

+ # Tokenize input
  inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
+
+ if stream:
+ # Streaming generation (for future use)
+ streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
+
+ generation_kwargs = dict(
+ **inputs,
+ streamer=streamer,
+ do_sample=bool(gen_kwargs.get("do_sample", True)),
+ temperature=float(gen_kwargs.get("temperature", 0.7)),
+ top_p=float(gen_kwargs.get("top_p", 0.9)),
+ top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
+ max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
+ pad_token_id=tokenizer.eos_token_id,
+ eos_token_id=tokenizer.eos_token_id,
+ logits_processor=logits_processor,
+ )
+
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
+ thread.start()
+
+ generated_text = ""
+ for new_text in streamer:
+ generated_text += new_text
+ # Could yield here for real streaming
+
+ thread.join()
+ return generated_text
  else:
+ # Non-streaming generation
+ out_ids = model.generate(
+ **inputs,
+ do_sample=bool(gen_kwargs.get("do_sample", True)),
+ temperature=float(gen_kwargs.get("temperature", 0.7)),
+ top_p=float(gen_kwargs.get("top_p", 0.9)),
+ top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
+ max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
+ pad_token_id=tokenizer.eos_token_id,
+ eos_token_id=tokenizer.eos_token_id,
+ logits_processor=logits_processor,
+ )
+
+ # Decode the full output (including special tokens for CoT)
+ prompt_len = int(inputs["input_ids"].shape[1])
+ gen_ids = out_ids[0][prompt_len:]
+ decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+
+ return decoded
+
+ except Exception as e:
+ return f"[Error] {type(e).__name__}: {str(e)}"
  finally:
+ # Cleanup
  try:
  del model
+ except:
  pass
  gc.collect()
+ if torch.cuda.is_available():
+ torch.cuda.empty_cache()

  # -----------------------
+ # Gradio handlers
  # -----------------------

  def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, str]]:
  msgs: List[Dict[str, str]] = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
@@ -238,28 +288,30 @@ def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, s
  if isinstance(item, (list, tuple)) and len(item) == 2:
  u, a = item
  if u is not None:
+ msgs.append({"role": "user", "content": str(u)})
  if a:
+ msgs.append({"role": "assistant", "content": str(a)})
  return msgs

+ def generate_response(message: Any, history: List[Any], system_prompt: str,
  temperature: float, top_p: float, top_k: int, max_new_tokens: int,
  do_sample: bool, seed: Optional[int],
+ rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
+ rose_tokens: str, rose_json: str,
+ show_thinking: bool = False):
+ """
+ Generate response with proper CoT handling.
  """
  try:
  # Normalize message and build Harmony prompt
  if isinstance(message, dict):
  message = message.get("content", "")
+
  msgs = chat_to_messages(history, system_prompt)
  msgs.append({"role": "user", "content": str(message)})
  prompt = to_harmony_prompt(msgs)

+ # Build Rose map if enabled
  rose_map: Optional[Dict[str, float]] = None
  if rose_enable:
  rose_map = {}
@@ -270,7 +322,7 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
  k, v = p.split(":", 1)
  try:
  rose_map[k.strip()] = float(v)
+ except:
  pass
  if rose_json:
  try:
@@ -279,55 +331,40 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
  for k, v in j.items():
  try:
  rose_map[str(k)] = float(v)
+ except:
  pass
+ except:
  pass
  if not rose_map:
  rose_map = None

+ # Generate with model
+ full_output = zerogpu_generate(
  prompt,
  {
  "do_sample": bool(do_sample),
  "temperature": float(temperature),
  "top_p": float(top_p),
+ "top_k": int(top_k) if top_k > 0 else None,
  "max_new_tokens": int(max_new_tokens),
  },
  rose_map,
  float(rose_alpha),
  float(rose_score) if rose_score is not None else None,
  int(seed) if seed is not None else None,
+ stream=False
  )
+
+ # Extract final response from CoT output
+ if show_thinking:
+ # Show the full chain-of-thought process
+ return f"**Full Output (with thinking):**\n```\n{full_output}\n```\n\n**Final Response:**\n{extract_final_channel(full_output)}"
+ else:
+ # Just show the final response
+ return extract_final_channel(full_output)
+
  except Exception as e:
+ return f"[Error] {type(e).__name__}: {str(e)}"

  # -----------------------
  # UI
@@ -335,53 +372,89 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
  gr.Markdown(
  """
+ # Mirel – Harmony Inference (ZeroGPU-ready)
+
+ Chain-of-thought OSS-20B model with Harmony formatting.
+ The model thinks through problems internally before providing a final response.
+
+ **Note:** Set your HF token as `HF_TOKEN` in Space secrets for private model access.
+ """
  )

  with gr.Row():
+ system_prompt = gr.Textbox(
+ label="System Prompt",
+ value=SYSTEM_DEF,
+ lines=2
+ )
+
+ with gr.Accordion("Generation Settings", open=False):
  with gr.Row():
+ temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
+ top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
+ top_k = gr.Slider(0, 200, value=0, step=1, label="Top-k (0=disabled)")
  with gr.Row():
+ max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
+ do_sample = gr.Checkbox(value=True, label="Do sample")
+ seed = gr.Number(value=None, label="Seed (optional)", precision=0)
+ show_thinking = gr.Checkbox(
+ value=False,
+ label="Show thinking process (CoT channels)",
+ info="Display the model's internal reasoning channels"
+ )
+
+ with gr.Accordion("Rose Guidance (Optional)", open=False):
+ gr.Markdown("Fine-tune generation with token biases")
+ with gr.Row():
+ rose_enable = gr.Checkbox(value=False, label="Enable Rose bias")
+ rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
+ rose_score = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Score multiplier")
+ rose_tokens = gr.Textbox(
+ label="Token:weight pairs",
+ placeholder="example:1.5, test:-0.5",
+ value=""
+ )
+ rose_json = gr.Textbox(
+ label="JSON weights",
+ placeholder='{"token": 1.0, "another": -0.5}',
+ value=""
+ )

+ # Chat interface
  chat = gr.ChatInterface(
+ fn=generate_response,
  type="messages",
+ additional_inputs=[
+ system_prompt, temperature, top_p, top_k, max_new,
+ do_sample, seed, rose_enable, rose_alpha, rose_score,
+ rose_tokens, rose_json, show_thinking
+ ],
+ title="Chat with Mirel",
+ description="A chain-of-thought model that thinks before responding",
  cache_examples=False,
+ retry_btn="Retry",
+ undo_btn="Undo",
+ clear_btn="Clear",
  )

  gr.Markdown(
  """
+ ---
+ ### Configuration Notes:
+ - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
+ - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER` for PEFT adapters
+ - **ZeroGPU**: Set `ZEROGPU=1` for Spaces with ZeroGPU
+ - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
+ - **4-bit**: Set `LOAD_4BIT=1` to enable 4-bit quantization
+
+ The model uses internal "thinking" channels before producing a final response.
+ Enable "Show thinking process" to see the full chain-of-thought.
+ """
  )
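For local testing outside of Spaces, the settings listed in the notes above can be provided as plain environment variables before app.py starts. A minimal sketch; the adapter values are the ones mentioned in the removed notes section of this same diff, and on Hugging Face Spaces they would normally be set as Variables and Secrets instead:

```python
# Sketch only: configure the app via environment variables before it starts.
import os

os.environ.setdefault("MODEL_ID", "openai/gpt-oss-20b")                   # base checkpoint (default named above)
os.environ.setdefault("ADAPTER_ID", "AbstractPhil/mirel-gpt-oss-20b")     # adapter repo from the old notes
os.environ.setdefault("ADAPTER_SUBFOLDER", "checkpoints/checkpoint-516")  # subfolder from the old notes
os.environ.setdefault("ZEROGPU", "1")      # enable the @spaces.GPU path
os.environ.setdefault("LOAD_4BIT", "0")    # set to "1" to try 4-bit loading
# HF_TOKEN should come from a secret, never be hard-coded.
```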

  if __name__ == "__main__":
+ demo.queue(max_size=8 if ZEROGPU else 32).launch(
+ server_name="0.0.0.0",
+ server_port=7860,
+ share=False
+ )
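
As a quick check of the `extract_final_channel` helper added in this commit, here is a minimal, illustrative call on a hand-written Harmony-style string (the channel markers are the ones the helper looks for; real model output may differ):

```python
# Illustrative input only: a fabricated completion using the markers handled above.
sample = (
    "<|channel|>analysis<|message|>Think through the request here...<|end|>"
    "<|channel|>final<|message|>Hello! How can I help you today?<|return|>"
)
print(extract_final_channel(sample))
# -> Hello! How can I help you today?
```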
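
In the same spirit, a small sketch of how the Rose guidance pieces fit together at decode time. The token strings and weights are hypothetical, and the constructor call assumes the `RoseGuidedLogits(bias_vec, alpha)` argument order implied by its `__init__` body:

```python
# Hypothetical weights: positive values promote tokens, negative values demote them.
rose_map = {"rose": 1.5, "thorn": -0.5}
bias = build_bias_from_tokens(tokenizer, rose_map)   # vocab-sized float32 tensor
processor = RoseGuidedLogits(bias, 1.0)              # alpha scales the whole bias vector
# zerogpu_generate passes this as model.generate(..., logits_processor=[processor])
```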