Spaces:

AbstractPhil
/

GPT-OSS-20B-Mirel

Running on Zero

App Files Files Community

AbstractPhil committed 10 days ago

Commit

2e87c77

1 Parent(s): f10571d

claude helping instead of gpt 5 now

Browse files

Files changed (1) hide show

app.py +241 -168

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """
 Mirel Harmony Inference – HF Space (Gradio)
 ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
 Single file: app.py
 """
 from __future__ import annotations
@@ -10,6 +11,7 @@ from typing import List, Dict, Optional, Any
 import gradio as gr
 import spaces  # required for ZeroGPU
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 # -----------------------
 # Config & runtime modes
@@ -21,46 +23,47 @@ ADAPTER_ID        = os.getenv("ADAPTER_ID") or None
 ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
 ATTN_IMPL         = os.getenv("ATTN_IMPL", "eager")
 DTYPE             = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
-SYSTEM_DEF        = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant..")
-MAX_DEF           = int(os.getenv("MAX_NEW_TOKENS", "256"))
 ZEROGPU           = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
 LOAD_4BIT         = os.getenv("LOAD_4BIT", "0") == "1"
-# Optional: HF auth for private/private repos (Spaces Secrets friendly)
-HF_TOKEN: Optional[str] = None
-#def _hf_login() -> None:
-#    """Login to HF Hub using common env secret names.
-#    Works on Spaces with a single secret set. No CUDA touched here.
-#    """
-#    global HF_TOKEN
-#    HF_TOKEN = (
-#        os.getenv("HF_TOKEN")
-#        or os.getenv("HUGGING_FACE_HUB_TOKEN")
-#        or os.getenv("HUGGINGFACEHUB_API_TOKEN")
-#    )
-#    if HF_TOKEN:
-#        try:
-#            from huggingface_hub import login, whoami
-#            login(token=HF_TOKEN, add_to_git_credential=True)
-#            try:
-#                who = whoami(token=HF_TOKEN)
-#                print(f"[hf] logged in as: {who.get('name') or who.get('email') or who.get('id')}")
-#            except Exception:
-#                pass
-#        except Exception as e:
-#            print(f"[hf] login failed: {e}")
-#    else:
-#        print("[hf] no token found; accessing only public repos")
-#
-#_hf_login()
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# Is HF OAuth configured for this Space? (set automatically when README has `hf_oauth: true`)
-OAUTH_READY = bool(os.getenv("OAUTH_CLIENT_ID"))
 # Tokenizer is lightweight; load once (pass token for private models)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
 # -----------------------
 # Lazy model loader (ZeroGPU-friendly)
@@ -82,6 +85,7 @@ def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
         attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
         trust_remote_code=True,
         low_cpu_mem_usage=True,
     )
     # Only enable 4-bit when not explicitly CPU-bound
     if LOAD_4BIT and device_map != "cpu":
@@ -96,25 +100,30 @@ def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
 def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=HF_TOKEN, **_build_model_kwargs(device_map))
     if ADAPTER_ID:
         if not _HAS_PEFT:
             raise RuntimeError("peft is required when ADAPTER_ID is set.")
-        peft_kwargs: Dict[str, Any] = {}
         if ADAPTER_SUBFOLDER:
             peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
-        model = PeftModel.from_pretrained(model, ADAPTER_ID, is_trainable=False, token=HF_TOKEN, **peft_kwargs)
-    model.eval(); model.config.use_cache = True
     return model
 # -----------------------
-# Harmony formatting
 # -----------------------
 def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
     """
     Strict Harmony: rely on the tokenizer's official chat template.
-    If the template is missing, raise clearly so the Space uses a Harmony-enabled checkpoint.
     """
     tmpl = getattr(tokenizer, "chat_template", None)
     if not tmpl:
@@ -123,13 +132,38 @@ def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
         )
     return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
 # -----------------------
 # Optional Rose guidance (logits bias)
-# ----------------------- (logits bias)
 # -----------------------
 def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
-    """Create vocab bias from {token: weight}. Unknown tokens ignored. Positive promotes, negative demotes."""
     vocab_size = len(tokenizer)
     bias = torch.zeros(vocab_size, dtype=torch.float32)
     for tok, w in mapping.items():
@@ -140,7 +174,7 @@ def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor
             for t in tid:
                 if isinstance(t, int) and t >= 0:
                     bias[t] += float(w) / max(1, len(tid))
-        elif isinstance(tid, int) and t >= 0:
             bias[tid] += float(w)
     return bias
@@ -149,83 +183,99 @@ class RoseGuidedLogits(torch.nn.Module):
         super().__init__()
         self.bias_vec = bias_vec
         self.alpha = float(alpha)
     def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         return scores + self.alpha * self.bias_vec.to(scores.device)
-@spaces.GPU
 def zerogpu_generate(full_prompt: str,
                     gen_kwargs: Dict[str, Any],
                     rose_map: Optional[Dict[str, float]],
                     rose_alpha: float,
                     rose_score: Optional[float],
-                    seed: Optional[int]) -> str:
-    """Run **entire** inference on GPU (ZeroGPU-safe). No CUDA touches in main process."""
-    if seed is not None:
-        torch.manual_seed(int(seed))
-    # Load base + adapter directly on GPU inside the GPU context
-    model = _load_model_on("auto")
     try:
         logits_processor = None
         if rose_map:
             bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
             eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
             logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
         inputs = tokenizer(full_prompt, return_tensors="pt").to(next(model.parameters()).device)
-        out_ids = model.generate(
-            **inputs,
-            do_sample=bool(gen_kwargs.get("do_sample", True)),
-            temperature=float(gen_kwargs.get("temperature", 0.7)),
-            top_p=float(gen_kwargs.get("top_p", 0.9)),
-            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
-            max_new_tokens=int(gen_kwargs.get("max_new_tokens", 512)),
-            pad_token_id=tokenizer.eos_token_id,
-            eos_token_id=tokenizer.eos_token_id,
-            logits_processor=logits_processor,
-        )
-        # Decode only the generated tail (exclude prompt) and extract the `final` channel
-        prompt_len = int(inputs["input_ids"].shape[1])
-        gen_ids = out_ids[0][prompt_len:]
-        decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
-        fb, ret, end = "<|channel|>final<|message|>", "<|return|>", "<|end|>"
-        idx = decoded.rfind(fb)
-        if idx != -1:
-            s = decoded[idx + len(fb):]
-            stop = s.find(ret)
-            if stop == -1:
-                stop = s.find(end)
-            if stop != -1:
-                s = s[:stop]
-            text = s.strip()
         else:
-            text = decoded.strip()
-        return text
     finally:
-        # Ensure no GPU state leaks back to the main process
         try:
             del model
-        except Exception:
             pass
         gc.collect()
-        try:
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-        except Exception:
-            pass
 # -----------------------
-# Gradio handlers and UI
 # -----------------------
-@dataclass
-class GenCfg:
-    temperature: float
-    top_p: float
-    top_k: Optional[int]
-    max_new_tokens: int
-    do_sample: bool
-    seed: Optional[int]
 def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, str]]:
     msgs: List[Dict[str, str]] = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
@@ -238,28 +288,30 @@ def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, s
         if isinstance(item, (list, tuple)) and len(item) == 2:
             u, a = item
             if u is not None:
-                msgs.append({"role": "user", "content": u})
             if a:
-                msgs.append({"role": "assistant", "content": a})
     return msgs
-def generate_stream(message: Any, history: List[Any], system_prompt: str,
                     temperature: float, top_p: float, top_k: int, max_new_tokens: int,
                     do_sample: bool, seed: Optional[int],
-                    rose_enable: bool, rose_alpha: float, rose_score: Optional[float], rose_tokens: str, rose_json: str):
-    """ZeroGPU generator (non-streaming): do all CUDA work inside `zerogpu_generate` and
-    return a single string. This avoids h11 Content-Length issues on exceptions mid-stream.
     """
     try:
         # Normalize message and build Harmony prompt
         if isinstance(message, dict):
             message = message.get("content", "")
         msgs = chat_to_messages(history, system_prompt)
         msgs.append({"role": "user", "content": str(message)})
         prompt = to_harmony_prompt(msgs)
-        # Rose map
         rose_map: Optional[Dict[str, float]] = None
         if rose_enable:
             rose_map = {}
@@ -270,7 +322,7 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
                         k, v = p.split(":", 1)
                         try:
                             rose_map[k.strip()] = float(v)
-                        except Exception:
                             pass
             if rose_json:
                 try:
@@ -279,55 +331,40 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
                         for k, v in j.items():
                             try:
                                 rose_map[str(k)] = float(v)
-                            except Exception:
                                 pass
-                except Exception:
                     pass
             if not rose_map:
                 rose_map = None
-        # Always use the GPU entrypoint; return once
-        text = zerogpu_generate(
             prompt,
             {
                 "do_sample": bool(do_sample),
                 "temperature": float(temperature),
                 "top_p": float(top_p),
-                "top_k": (int(top_k) if int(top_k) > 0 else None),
                 "max_new_tokens": int(max_new_tokens),
             },
             rose_map,
             float(rose_alpha),
             float(rose_score) if rose_score is not None else None,
             int(seed) if seed is not None else None,
         )
-        return text
     except Exception as e:
-        # Return error as plain text (no streaming) to avoid Content-Length mismatches
-        return f"[error] {type(e).__name__}: {e}"
-# -----------------------
-# Helper: login status banner (HF OAuth)
-# -----------------------
-#def _login_status(profile: gr.OAuthProfile | None) -> str:
-#    """Show whether the visitor is logged in to Hugging Face.
-#    This affects ZeroGPU quotas (logged-in users get their own token/quota).
-#    Requires the Space to have `hf_oauth: true` in README metadata.
-#    """
-#    # If OAuth isn't configured on the Space, inform clearly
-#    if not os.getenv("OAUTH_CLIENT_ID"):
-#        return (
-#            "ℹ️ OAuth is not configured on this Space. Add `hf_oauth: true` to README metadata "
-#            "so users can sign in and ZeroGPU can use their account quota."
-#        )
-#    if profile is None:
-#        return (
-#            "🔒 Not signed in to Hugging Face — ZeroGPU will count as anonymous (lower quota). "
-#            "Click **Sign in with HF** above."
-#        )
-#    name = getattr(profile, "name", None) or getattr(profile, "preferred_username", None) or getattr(profile, "id", "user")
-#    return f"🔓 Signed in as **{name}** — ZeroGPU will use your account quota."
 # -----------------------
 # UI
@@ -335,53 +372,89 @@ def generate_stream(message: Any, history: List[Any], system_prompt: str,
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
-    # Mirel – Harmony Inference (ZeroGPU-ready)
-    OSS-20B + optional Rose-SFT adapter. Harmony chat template is applied automatically.
-    """
-    )
-    # Sign-in note
-    login_status = gr.Markdown(
-        "If you're logged into huggingface.co in this browser, ZeroGPU will use *your* quota automatically."
     )
     with gr.Row():
-        system_prompt = gr.Textbox(label="System", value=SYSTEM_DEF)
-    with gr.Accordion("Generation settings", open=False):
         with gr.Row():
-            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="temperature")
-            top_p       = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="top_p")
-            top_k       = gr.Slider(0, 200, value=0, step=1, label="top_k (0=off)")
-            max_new     = gr.Slider(16, 2048, value=MAX_DEF, step=8, label="max_new_tokens")
-            do_sample   = gr.Checkbox(value=True, label="do_sample")
-            seed        = gr.Number(value=None, label="seed (optional)")
-    with gr.Accordion("Rose guidance (optional)", open=False):
         with gr.Row():
-            rose_enable = gr.Checkbox(value=False, label="Enable Rose bias at decode")
-            rose_alpha  = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="rose alpha (strength)")
-            rose_score  = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="rose score (0–1)")
-        rose_tokens = gr.Textbox(label="token:weight list (comma-separated)", value="")
-        rose_json   = gr.Textbox(label="JSON {token: weight}", value="")
     chat = gr.ChatInterface(
-        fn=generate_stream,
         type="messages",
-        additional_inputs=[system_prompt, temperature, top_p, top_k, max_new, do_sample, seed, rose_enable, rose_alpha, rose_score, rose_tokens, rose_json],
-        title="Mirel",
         cache_examples=False,
     )
     gr.Markdown(
         """
-    **Notes**
-    - Set env `ZEROGPU=1` for just-in-time GPU allocation via @spaces.GPU.
-    - Set `ADAPTER_ID=AbstractPhil/mirel-gpt-oss-20b` and `ADAPTER_SUBFOLDER=checkpoints/checkpoint-516` to use the provided adapter.
-    - Use `torch==2.4.0` for ZeroGPU.
-    - Rose guidance biases logits; it does not change weights.
-    """
     )
 if __name__ == "__main__":
-    demo.queue(max_size=8 if ZEROGPU else 32).launch(server_name="0.0.0.0", server_port=7860)

 """
 Mirel Harmony Inference – HF Space (Gradio)
 ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
+Chain-of-thought model with proper channel extraction
 Single file: app.py
 """
 from __future__ import annotations
 import gradio as gr
 import spaces  # required for ZeroGPU
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+from threading import Thread
 # -----------------------
 # Config & runtime modes
 ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
 ATTN_IMPL         = os.getenv("ATTN_IMPL", "eager")
 DTYPE             = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
+SYSTEM_DEF        = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
+MAX_DEF           = int(os.getenv("MAX_NEW_TOKENS", "512"))
 ZEROGPU           = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
 LOAD_4BIT         = os.getenv("LOAD_4BIT", "0") == "1"
+# HF Auth - properly handle multiple token env var names
+HF_TOKEN: Optional[str] = (
+    os.getenv("HF_TOKEN")
+    or os.getenv("HUGGING_FACE_HUB_TOKEN")
+    or os.getenv("HUGGINGFACEHUB_API_TOKEN")
+    or os.getenv("HF_ACCESS_TOKEN")
+)
def _hf_login() -> None:
    """Authenticate against the Hugging Face Hub with the module-level ``HF_TOKEN``.

    Best-effort: every outcome (no token, login failure, profile lookup
    failure, success) is reported with a ``[HF Auth]`` print and nothing is
    raised to the caller.
    """
    # Guard clause: nothing to do without a token.
    if not HF_TOKEN:
        print("[HF Auth] No token found in environment variables (HF_TOKEN, HUGGING_FACE_HUB_TOKEN, etc.)")
        return

    try:
        # Imported lazily so the import itself is covered by the failure report.
        from huggingface_hub import login, whoami

        login(token=HF_TOKEN, add_to_git_credential=True)
        try:
            # The profile lookup is purely informational; a failure here does
            # not invalidate the login that already succeeded.
            profile = whoami(token=HF_TOKEN)
            display_name = profile.get('name') or profile.get('fullname') or profile.get('id', 'unknown')
            print(f"[HF Auth] Logged in as: {display_name}")
        except Exception:
            print("[HF Auth] Login successful but couldn't get user info")
    except Exception as err:
        print(f"[HF Auth] Login failed: {err}")
+# Login before loading any models
+_hf_login()
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 # Tokenizer is lightweight; load once (pass token for private models)
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
+    print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
+except Exception as e:
+    print(f"[Model] Failed to load tokenizer: {e}")
+    raise
 # -----------------------
 # Lazy model loader (ZeroGPU-friendly)
         attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
         trust_remote_code=True,
         low_cpu_mem_usage=True,
+        token=HF_TOKEN,  # Add token here for private model access
     )
     # Only enable 4-bit when not explicitly CPU-bound
     if LOAD_4BIT and device_map != "cpu":
 def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
+    print(f"[Model] Loading base model from {MODEL_ID}...")
+    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_build_model_kwargs(device_map))
     if ADAPTER_ID:
         if not _HAS_PEFT:
             raise RuntimeError("peft is required when ADAPTER_ID is set.")
+        print(f"[Model] Loading adapter from {ADAPTER_ID}...")
+        peft_kwargs: Dict[str, Any] = {"token": HF_TOKEN}
         if ADAPTER_SUBFOLDER:
             peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
+        model = PeftModel.from_pretrained(model, ADAPTER_ID, is_trainable=False, **peft_kwargs)
+    model.eval()
+    model.config.use_cache = True
+    print("[Model] Model loaded successfully")
     return model
 # -----------------------
+# Harmony formatting & CoT extraction
 # -----------------------
 def to_harmony_prompt(messages: List[Dict[str, str]]) -> str:
     """
     Strict Harmony: rely on the tokenizer's official chat template.
     """
     tmpl = getattr(tokenizer, "chat_template", None)
     if not tmpl:
         )
     return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
def extract_final_channel(text: str) -> str:
    """Return the user-facing answer from raw chain-of-thought model output.

    The raw output interleaves channels; the answer is whatever follows the
    last ``<|channel|>final<|message|>`` marker, up to the first end-of-message
    token after it.

    Args:
        text: Decoded model output, special tokens included.

    Returns:
        The stripped final-channel message. When no final marker is present
        (non-CoT response, truncated generation, or an error string), the
        stripped input with any trailing end-of-message tokens removed.
    """
    final_marker = "<|channel|>final<|message|>"
    end_markers = ("<|return|>", "<|end|>", "<|endoftext|>")

    _, found, tail = text.rpartition(final_marker)
    if found:
        # Keep only what precedes the earliest end marker after the final
        # channel starts; cutting on each marker in turn achieves that.
        for marker in end_markers:
            tail = tail.partition(marker)[0]
        return tail.strip()

    # No final channel: keep the content intact, but do not leak trailing
    # terminator tokens into the chat window.
    cleaned = text.strip()
    trimmed = True
    while trimmed:
        trimmed = False
        for marker in end_markers:
            if cleaned.endswith(marker):
                cleaned = cleaned[: -len(marker)].rstrip()
                trimmed = True
    return cleaned
 # -----------------------
 # Optional Rose guidance (logits bias)
 # -----------------------
 def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
+    """Create vocab bias from {token: weight}. Unknown tokens ignored."""
     vocab_size = len(tokenizer)
     bias = torch.zeros(vocab_size, dtype=torch.float32)
     for tok, w in mapping.items():
             for t in tid:
                 if isinstance(t, int) and t >= 0:
                     bias[t] += float(w) / max(1, len(tid))
+        elif isinstance(tid, int) and tid >= 0:
             bias[tid] += float(w)
     return bias
         super().__init__()
         self.bias_vec = bias_vec
         self.alpha = float(alpha)
     def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         return scores + self.alpha * self.bias_vec.to(scores.device)
def _build_generation_kwargs(gen_kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Translate the UI generation settings into ``model.generate`` keyword arguments."""
    temperature = float(gen_kwargs.get("temperature", 0.7))
    # A non-positive temperature cannot be sampled from; treat it as a request
    # for greedy decoding instead of letting generate() reject the value.
    do_sample = bool(gen_kwargs.get("do_sample", True)) and temperature > 0.0

    kwargs: Dict[str, Any] = {
        "do_sample": do_sample,
        "max_new_tokens": int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
        "pad_token_id": tokenizer.eos_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling knobs are only meaningful (and only passed) when sampling.
        top_k = gen_kwargs.get("top_k")
        kwargs["temperature"] = temperature
        kwargs["top_p"] = float(gen_kwargs.get("top_p", 0.9))
        # 0 / None from the UI means "top-k disabled".
        kwargs["top_k"] = int(top_k) if top_k and int(top_k) > 0 else None
    return kwargs


@spaces.GPU(duration=120)  # Give enough time for longer CoT generations
def zerogpu_generate(full_prompt: str,
                    gen_kwargs: Dict[str, Any],
                    rose_map: Optional[Dict[str, float]],
                    rose_alpha: float,
                    rose_score: Optional[float],
                    seed: Optional[int],
                    stream: bool = False) -> str:
    """Run inference on GPU (ZeroGPU-safe).

    Args:
        full_prompt: Fully formatted prompt text to tokenize.
        gen_kwargs: Generation settings (``do_sample``, ``temperature``,
            ``top_p``, ``top_k``, ``max_new_tokens``); missing keys use defaults.
        rose_map: Optional ``{token: weight}`` bias map; falsy disables guidance.
        rose_alpha: Bias strength.
        rose_score: Optional multiplier applied to ``rose_alpha`` (``None`` = 1.0).
        seed: Optional seed passed to ``torch.manual_seed``.
        stream: Use the ``TextIteratorStreamer`` path; the text is still
            accumulated and returned as one string.

    Returns:
        The decoded continuation with special tokens kept (the prompt is
        excluded), or ``"[Error] <Type>: <message>"`` if anything fails.
    """
    # Bound up front so the cleanup in ``finally`` is valid even when the
    # model load itself raises.
    model = None
    try:
        if seed is not None:
            torch.manual_seed(int(seed))

        # Load model
        model = _load_model_on("auto")
        device = next(model.parameters()).device

        # Setup logits processor for Rose guidance
        logits_processor = None
        if rose_map:
            bias = build_bias_from_tokens(tokenizer, rose_map).to(device)
            eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
            logits_processor = [RoseGuidedLogits(bias, eff_alpha)]

        # Tokenize input
        inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
        generation_kwargs = dict(
            **inputs,
            logits_processor=logits_processor,
            **_build_generation_kwargs(gen_kwargs),
        )

        if stream:
            # Streaming generation (for future use): generate() runs on a
            # worker thread while this thread drains the streamer.
            streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
            generation_kwargs["streamer"] = streamer
            failures: List[BaseException] = []

            def _worker() -> None:
                try:
                    model.generate(**generation_kwargs)
                except Exception as exc:
                    # Record the failure and close the stream; otherwise the
                    # consumer loop below has no stop signal to wait for.
                    failures.append(exc)
                    streamer.end()

            thread = Thread(target=_worker)
            thread.start()
            # Could yield here for real streaming
            chunks = list(streamer)
            thread.join()
            if failures:
                raise failures[0]
            return "".join(chunks)

        # Non-streaming generation
        out_ids = model.generate(**generation_kwargs)
        # Decode only the generated tail, keeping special tokens so the caller
        # can separate the CoT channels.
        prompt_len = int(inputs["input_ids"].shape[1])
        return tokenizer.decode(out_ids[0][prompt_len:], skip_special_tokens=False)
    except Exception as e:
        return f"[Error] {type(e).__name__}: {str(e)}"
    finally:
        # Cleanup: drop the model reference, then release cached GPU memory.
        model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
 # -----------------------
+# Gradio handlers
 # -----------------------
 def chat_to_messages(history: List[Any], system_prompt: str) -> List[Dict[str, str]]:
     msgs: List[Dict[str, str]] = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
         if isinstance(item, (list, tuple)) and len(item) == 2:
             u, a = item
             if u is not None:
+                msgs.append({"role": "user", "content": str(u)})
             if a:
+                msgs.append({"role": "assistant", "content": str(a)})
     return msgs
+def generate_response(message: Any, history: List[Any], system_prompt: str,
                     temperature: float, top_p: float, top_k: int, max_new_tokens: int,
                     do_sample: bool, seed: Optional[int],
+                    rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
+                    rose_tokens: str, rose_json: str,
+                    show_thinking: bool = False):
+    """
+    Generate response with proper CoT handling.
     """
     try:
         # Normalize message and build Harmony prompt
         if isinstance(message, dict):
             message = message.get("content", "")
         msgs = chat_to_messages(history, system_prompt)
         msgs.append({"role": "user", "content": str(message)})
         prompt = to_harmony_prompt(msgs)
+        # Build Rose map if enabled
         rose_map: Optional[Dict[str, float]] = None
         if rose_enable:
             rose_map = {}
                         k, v = p.split(":", 1)
                         try:
                             rose_map[k.strip()] = float(v)
+                        except:
                             pass
             if rose_json:
                 try:
                         for k, v in j.items():
                             try:
                                 rose_map[str(k)] = float(v)
+                            except:
                                 pass
+                except:
                     pass
             if not rose_map:
                 rose_map = None
+        # Generate with model
+        full_output = zerogpu_generate(
             prompt,
             {
                 "do_sample": bool(do_sample),
                 "temperature": float(temperature),
                 "top_p": float(top_p),
+                "top_k": int(top_k) if top_k > 0 else None,
                 "max_new_tokens": int(max_new_tokens),
             },
             rose_map,
             float(rose_alpha),
             float(rose_score) if rose_score is not None else None,
             int(seed) if seed is not None else None,
+            stream=False
         )
+        # Extract final response from CoT output
+        if show_thinking:
+            # Show the full chain-of-thought process
+            return f"**Full Output (with thinking):**\n```\n{full_output}\n```\n\n**Final Response:**\n{extract_final_channel(full_output)}"
+        else:
+            # Just show the final response
+            return extract_final_channel(full_output)
     except Exception as e:
+        return f"[Error] {type(e).__name__}: {str(e)}"
 # -----------------------
 # UI
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
+        # Mirel – Harmony Inference (ZeroGPU-ready)
+        Chain-of-thought OSS-20B model with Harmony formatting.
+        The model thinks through problems internally before providing a final response.
+        **Note:** Set your HF token as `HF_TOKEN` in Space secrets for private model access.
+        """
     )
     with gr.Row():
+        system_prompt = gr.Textbox(
+            label="System Prompt",
+            value=SYSTEM_DEF,
+            lines=2
+        )
+    with gr.Accordion("Generation Settings", open=False):
         with gr.Row():
+            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
+            top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
+            top_k = gr.Slider(0, 200, value=0, step=1, label="Top-k (0=disabled)")
         with gr.Row():
+            max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
+            do_sample = gr.Checkbox(value=True, label="Do sample")
+            seed = gr.Number(value=None, label="Seed (optional)", precision=0)
+        show_thinking = gr.Checkbox(
+            value=False,
+            label="Show thinking process (CoT channels)",
+            info="Display the model's internal reasoning channels"
+        )
+    with gr.Accordion("Rose Guidance (Optional)", open=False):
+        gr.Markdown("Fine-tune generation with token biases")
+        with gr.Row():
+            rose_enable = gr.Checkbox(value=False, label="Enable Rose bias")
+            rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
+            rose_score = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Score multiplier")
+        rose_tokens = gr.Textbox(
+            label="Token:weight pairs",
+            placeholder="example:1.5, test:-0.5",
+            value=""
+        )
+        rose_json = gr.Textbox(
+            label="JSON weights",
+            placeholder='{"token": 1.0, "another": -0.5}',
+            value=""
+        )
+    # Chat interface
     chat = gr.ChatInterface(
+        fn=generate_response,
         type="messages",
+        additional_inputs=[
+            system_prompt, temperature, top_p, top_k, max_new,
+            do_sample, seed, rose_enable, rose_alpha, rose_score,
+            rose_tokens, rose_json, show_thinking
+        ],
+        title="Chat with Mirel",
+        description="A chain-of-thought model that thinks before responding",
         cache_examples=False,
+        retry_btn="Retry",
+        undo_btn="Undo",
+        clear_btn="Clear",
     )
     gr.Markdown(
         """
+        ---
+        ### Configuration Notes:
+        - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
+        - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER` for PEFT adapters
+        - **ZeroGPU**: Set `ZEROGPU=1` for Spaces with ZeroGPU
+        - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
+        - **4-bit**: Set `LOAD_4BIT=1` to enable 4-bit quantization
+        The model uses internal "thinking" channels before producing a final response.
+        Enable "Show thinking process" to see the full chain-of-thought.
+        """
     )
 if __name__ == "__main__":
+    demo.queue(max_size=8 if ZEROGPU else 32).launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )