Spaces:

AbstractPhil
/

GPT-OSS-20B-Mirel

Running on Zero

App Files Files Community

AbstractPhil committed 8 days ago

Commit

ae231bc

1 Parent(s): 2a83e65

probably works-ish

Browse files

Files changed (1) hide show

app.py +702 -71

app.py CHANGED Viewed

@@ -1,97 +1,728 @@
 from __future__ import annotations
-import os, gc, torch
-from typing import Optional, Dict, Any, List
 import gradio as gr
-import spaces
-from transformers import AutoTokenizer, AutoModelForCausalLM
-from peft import PeftModel
-MODEL_ID = "openai/gpt-oss-20b"
-ADAPTER_ID = "AbstractPhil/mirel-gpt-oss-20b"
-ADAPTER_SUBFOLDER = "checkpoints/checkpoint-516"
 HF_TOKEN: Optional[str] = (
-    os.getenv("HF_TOKEN")
-    or os.getenv("HUGGING_FACE_HUB_TOKEN")
     or os.getenv("HUGGINGFACEHUB_API_TOKEN")
     or os.getenv("HF_ACCESS_TOKEN")
 )
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-# Load tokenizer on CPU
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
-if tokenizer.pad_token_id is None:
-    tokenizer.pad_token_id = tokenizer.eos_token_id
 @spaces.GPU(duration=120)
-def gpu_generate(prompt_str: str, max_new_tokens: int = 512) -> str:
-    torch.set_grad_enabled(False)
-    model = None
     try:
-        model_kwargs = dict(
-            attn_implementation="eager",
-            torch_dtype="auto",
-            use_cache=True,
-            device_map="auto",
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            token=HF_TOKEN,
         )
-        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)
-        # Apply Rose LoRA (minimal)
-        if ADAPTER_ID:
-            peft_kwargs: Dict[str, Any] = {"is_trainable": False, "token": HF_TOKEN}
-            if ADAPTER_SUBFOLDER:
-                peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
-            model = PeftModel.from_pretrained(model, ADAPTER_ID, **peft_kwargs)
-            model = model.merge_and_unload()
-        model.eval()
-        model.config.pad_token_id = tokenizer.pad_token_id
-        enc = tokenizer(prompt_str, return_tensors="pt")
-        input_ids = enc["input_ids"].to(model.device)
-        attention_mask = (input_ids != tokenizer.pad_token_id).long().to(model.device)
-        prompt_len = input_ids.shape[-1]
-        output_ids = model.generate(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            max_new_tokens=max_new_tokens,
-            pad_token_id=tokenizer.pad_token_id,
         )
-        new_ids = output_ids[0, prompt_len:]
-        return tokenizer.decode(new_ids, skip_special_tokens=True)
     except Exception as e:
-        return f"[Error] {type(e).__name__}: {e}"
     finally:
-        del model
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
-def ui_generate(message, history):
-    msgs: List[Dict[str, str]] = []
-    if isinstance(history, list):
-        for m in history:
-            if isinstance(m, dict) and "role" in m:
-                msgs.append(m)
-            elif isinstance(m, (list, tuple)) and len(m) >= 2:
-                if m[0]:
-                    msgs.append({"role": "user", "content": str(m[0])})
-                if m[1]:
-                    msgs.append({"role": "assistant", "content": str(m[1])})
-    if isinstance(message, dict):
-        msgs.append(message)
-    else:
-        msgs.append({"role": "user", "content": str(message)})
-    prompt = tokenizer.apply_chat_template(msgs, add_generation_prompt=True, tokenize=False)
-    return gpu_generate(prompt)
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("""# Mirel – GPT‑OSS‑20B + Rose LoRA (ZeroGPU, Minimal)""")
-    gr.ChatInterface(fn=ui_generate, type="messages", title="Mirel", cache_examples=False)
-if __name__ == "__main__":
-    demo.queue().launch(server_name="0.0.0.0", server_port=7860)

+"""
+Mirel Harmony Inference – HF Space (Gradio)
+ZeroGPU-ready, Harmony formatting, optional Rose-guided decoding
+Chain-of-thought model with proper channel extraction using openai_harmony
+Single file: app.py
+"""
 from __future__ import annotations
+import os, gc, json, threading, torch
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Any
+from datetime import datetime
 import gradio as gr
+import spaces  # required for ZeroGPU
+from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
+# Import Harmony components
+try:
+    from openai_harmony import (
+        Author,
+        Conversation,
+        HarmonyEncodingName,
+        Message,
+        Role,
+        SystemContent,
+        DeveloperContent,
+        load_harmony_encoding,
+        ReasoningEffort
+    )
+    HARMONY_AVAILABLE = True
+except ImportError:
+    print("[WARNING] openai_harmony not installed. Install with: pip install openai-harmony")
+    HARMONY_AVAILABLE = False
+# -----------------------
+# Config & runtime modes
+# -----------------------
# Supported torch dtypes, keyed by the short names accepted in the DTYPE env var.
DTYPE_MAP = {
    "bf16": torch.bfloat16,
    "fp16": torch.float16,
    "fp32": torch.float32,
}

# Model / adapter selection (empty adapter values are normalised to None).
MODEL_ID = os.environ.get("MODEL_ID", "openai/gpt-oss-20b")
ADAPTER_ID = os.environ.get("ADAPTER_ID") or None
ADAPTER_SUBFOLDER = os.environ.get("ADAPTER_SUBFOLDER") or None

# Attention backend and compute dtype; unknown dtype names fall back to bf16.
ATTN_IMPL = os.environ.get("ATTN_IMPL", "eager")
_dtype_name = os.environ.get("DTYPE", "bf16").lower()
DTYPE = DTYPE_MAP.get(_dtype_name, torch.bfloat16)

# Default developer/system instructions and generation budget.
SYSTEM_DEF = os.environ.get("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
MAX_DEF = int(os.environ.get("MAX_NEW_TOKENS", "256"))

# Runtime mode flags: only the literal string "1" enables them.
# ZEROGPU takes precedence over the ZERO_GPU spelling when both are set.
ZEROGPU = os.environ.get("ZEROGPU", os.environ.get("ZERO_GPU", "0")) == "1"
LOAD_4BIT = os.environ.get("LOAD_4BIT", "0") == "1"

# Harmony channels requested from the model for chain-of-thought output.
REQUIRED_CHANNELS = ["analysis", "final"]

# HF auth token: first non-empty value among the commonly used secret names.
HF_TOKEN: Optional[str] = (
    os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
    or os.environ.get("HF_ACCESS_TOKEN")
)
def _hf_login() -> None:
    """Authenticate against the Hugging Face Hub with the module-level HF_TOKEN.

    Best-effort: every failure is reported on stdout and never raised, so the
    app can still start when only public models are needed.
    """
    if not HF_TOKEN:
        print("[HF Auth] No token found in environment variables")
        return

    try:
        # Imported lazily so a missing huggingface_hub is reported as a login failure.
        from huggingface_hub import login, whoami

        login(token=HF_TOKEN, add_to_git_credential=True)
        try:
            # Identity lookup is purely informational; login already succeeded.
            who = whoami(token=HF_TOKEN)
            print(f"[HF Auth] Logged in as: {who.get('name') or who.get('fullname') or who.get('id', 'unknown')}")
        except Exception:
            print("[HF Auth] Login successful but couldn't get user info")
    except Exception as e:
        print(f"[HF Auth] Login failed: {e}")
+# Login before loading any models
+_hf_login()
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# Load Harmony encoding if available
+if HARMONY_AVAILABLE:
+    harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+else:
+    harmony_encoding = None
+# Stop tokens per Harmony spec: <|return|> (200002), <|call|> (200012)
+HARMONY_STOP_IDS = harmony_encoding.stop_tokens_for_assistant_actions() if HARMONY_AVAILABLE else []
+# Tokenizer is lightweight; load once
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
+    print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
+except Exception as e:
+    print(f"[Model] Failed to load tokenizer: {e}")
+    raise
+# -----------------------
+# Model loading
+# -----------------------
# peft is optional; it is only required when ADAPTER_ID is configured.
try:
    from peft import PeftModel
except Exception:
    _HAS_PEFT = False
else:
    _HAS_PEFT = True
def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
    """Assemble keyword arguments for ``AutoModelForCausalLM.from_pretrained``.

    Args:
        device_map: Placement passed through to transformers (e.g. ``"auto"``,
            ``"cpu"`` or ``None``).

    Returns:
        A kwargs dict using the module-level DTYPE, ATTN_IMPL and HF_TOKEN.
        On CPU the attention implementation is forced to ``"eager"``. When
        LOAD_4BIT is set and the target is not CPU, ``load_in_4bit=True`` is
        added if bitsandbytes can be imported.
    """
    on_cpu = device_map == "cpu"
    kw: Dict[str, Any] = dict(
        torch_dtype=DTYPE,
        device_map=device_map,
        attn_implementation="eager" if on_cpu else ATTN_IMPL,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        token=HF_TOKEN,
    )
    if LOAD_4BIT and not on_cpu:
        try:
            import bitsandbytes  # noqa: F401 -- availability probe only
        except Exception as e:
            # Still best-effort, but say why the requested quantisation was skipped
            # instead of silently loading the full-precision model.
            print(f"[Model] LOAD_4BIT requested but bitsandbytes is unavailable ({type(e).__name__}: {e}); loading without 4-bit")
        else:
            kw["load_in_4bit"] = True
            # Quantised loading needs an explicit placement; default to automatic.
            if kw["device_map"] is None:
                kw["device_map"] = "auto"
    return kw
def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
    """Load the base model (plus optional PEFT adapter) in eval mode.

    Args:
        device_map: Placement forwarded to ``_build_model_kwargs``.

    Returns:
        The loaded model with ``use_cache`` enabled and a pad token id set.

    Raises:
        RuntimeError: If ADAPTER_ID is configured but peft is not importable.
    """
    print(f"[Model] Loading base model from {MODEL_ID}...")
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_build_model_kwargs(device_map))

    if ADAPTER_ID:
        if not _HAS_PEFT:
            raise RuntimeError("peft is required when ADAPTER_ID is set.")
        print(f"[Model] Loading adapter from {ADAPTER_ID}...")
        peft_kwargs: Dict[str, Any] = {"token": HF_TOKEN}
        if ADAPTER_SUBFOLDER:
            peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
        model = PeftModel.from_pretrained(model, ADAPTER_ID, is_trainable=False, **peft_kwargs)

    model.eval()

    # Ensure a valid pad_token_id is set; some OSS checkpoints reuse eos as pad.
    # Compare against None explicitly: a pad id of 0 is valid and must not be
    # replaced by eos just because it is falsy.
    if getattr(model.config, "pad_token_id", None) is None:
        pad_id = tokenizer.pad_token_id
        if pad_id is None:
            pad_id = tokenizer.eos_token_id
        model.config.pad_token_id = pad_id

    model.config.use_cache = True
    print("[Model] Model loaded successfully")
    return model
+# -----------------------
+# Harmony formatting
+# -----------------------
def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high") -> Any:
    """Render a chat history as a model prompt.

    With `openai_harmony` available the result is a list of **token IDs**
    produced by the Harmony renderer; otherwise it is a **string** produced by
    the tokenizer's chat template.

    Args:
        messages: Chat turns as ``{"role": ..., "content": ...}`` dicts. A leading
            ``system`` turn supplies the developer instructions.
        reasoning_effort: ``"low"``, ``"medium"`` or ``"high"``; anything else
            is treated as high.
    """
    harmony_ready = HARMONY_AVAILABLE and harmony_encoding is not None
    if not harmony_ready:
        # Fallback: tokenizer chat template -> string prompt
        if not messages or messages[0].get("role") != "system":
            messages = [{"role": "system", "content": SYSTEM_DEF}] + (messages or [])
        return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

    levels = {"low": ReasoningEffort.LOW, "medium": ReasoningEffort.MEDIUM, "high": ReasoningEffort.HIGH}
    effort = levels.get(str(reasoning_effort).lower(), ReasoningEffort.HIGH)
    system_content = (
        SystemContent.new()
        .with_model_identity("You are ChatGPT, a large language model trained by OpenAI.")
        .with_reasoning_effort(effort)
        .with_conversation_start_date(datetime.now().strftime("%Y-%m-%d"))
        .with_knowledge_cutoff("2024-06")
        .with_required_channels(REQUIRED_CHANNELS)
    )

    # A leading system turn becomes the developer instructions; otherwise SYSTEM_DEF.
    turns: List[Dict[str, str]] = messages or []
    instructions = SYSTEM_DEF
    if turns and turns[0].get("role") == "system":
        instructions = turns[0].get("content") or SYSTEM_DEF
        turns = turns[1:]

    developer = DeveloperContent.new().with_instructions(instructions)
    harmony_messages = [
        Message.from_role_and_content(Role.SYSTEM, system_content),
        Message.from_role_and_content(Role.DEVELOPER, developer),
    ]
    for turn in turns:
        role = turn.get("role")
        text = turn.get("content", "")
        if role == "user":
            harmony_messages.append(Message.from_role_and_content(Role.USER, text))
        elif role == "assistant":
            # Prior assistant turns are replayed on the final channel.
            harmony_messages.append(
                Message.from_role_and_content(Role.ASSISTANT, text).with_channel("final")
            )
        # Turns with any other role are dropped.

    convo = Conversation.from_messages(harmony_messages)
    rendered = harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)

    # Ensure assistant header includes a final channel + message start to avoid
    # 'assistantassistant...' loops. If the check itself fails, the rendered
    # tokens are used unchanged.
    try:
        tail_text = tokenizer.decode(list(rendered)[-64:], skip_special_tokens=False)
        if '<|channel|>final<|message|>' not in tail_text:
            rendered = list(rendered) + tokenizer.encode('<|channel|>final<|message|>', add_special_tokens=False)
    except Exception:
        rendered = list(rendered)
    return rendered
def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
    """Split completion tokens into Harmony channels.

    Args:
        tokens: Generated token IDs (prompt excluded).

    Returns:
        A mapping of channel name to concatenated text that always contains a
        ``"final"`` key. Without `openai_harmony` the mapping is
        ``{"final": ..., "raw": ...}`` built from plain text parsing.

    Raises:
        Whatever the Harmony parser raises on malformed token streams; callers
        are expected to fall back to text parsing.
    """
    if not HARMONY_AVAILABLE:
        # Fallback: just decode and extract final channel manually
        text = tokenizer.decode(tokens, skip_special_tokens=False)
        return {"final": extract_final_channel_fallback(text), "raw": text}

    parsed_messages = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)

    channels: Dict[str, str] = {}
    for msg in parsed_messages:
        # A message may expose channel=None (e.g. when the channel header was part
        # of the prompt). Treat it as "final" so the dict never gets a None key,
        # which would break callers that format channel names as strings.
        channel = getattr(msg, "channel", None) or "final"
        content = msg.content
        parts = content if isinstance(content, list) else [content]
        text = "".join(getattr(part, "text", str(part)) for part in parts)
        channels[channel] = channels.get(channel, "") + text

    # Ensure we have a final channel
    if "final" not in channels:
        channels["final"] = " ".join(channels.values())
    return channels
def extract_final_channel_fallback(text: str) -> str:
    """Extract the ``final`` channel body from decoded Harmony text.

    Works on raw decoded output (special tokens kept), even when the model
    emits extra headers or several messages.

    Args:
        text: Decoded completion, possibly containing ``<|channel|>NAME<|message|>``
            headers and ``<|end|>`` / ``<|return|>`` / ``<|call|>`` terminators.

    Returns:
        The stripped concatenation of all ``final`` channel bodies when a final
        channel is present. Otherwise the text up to the first message
        terminator (so terminator tokens never leak into the reply), or the
        stripped input if that would be empty.
    """
    channel_tag = "<|channel|>"
    message_tag = "<|message|>"
    terminators = ("<|end|>", "<|return|>", "<|call|>")

    def _until_terminator(body: str) -> str:
        # Cut at the earliest terminator, not the first one in tuple order.
        cuts = [pos for pos in (body.find(tag) for tag in terminators) if pos != -1]
        return body[:min(cuts)] if cuts else body

    bodies: Dict[str, str] = {}
    # Splitting on the channel tag guarantees no segment contains another
    # channel header, so only message terminators need to be searched for.
    for segment in text.split(channel_tag)[1:]:
        header, found, body = segment.partition(message_tag)
        name = header.strip()
        if not found or not name:
            continue
        bodies[name] = bodies.get(name, "") + _until_terminator(body)

    if "final" in bodies:
        return bodies["final"].strip()

    # No final channel at all: return the leading text without terminators.
    return _until_terminator(text).strip() or text.strip()
+# -----------------------
+# Rose guidance
+# -----------------------
def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
    """Create a vocabulary-sized additive bias vector from ``{token: weight}``.

    Args:
        tokenizer: Object providing ``len()`` (vocabulary size) and
            ``convert_tokens_to_ids(token)`` returning an int or a list of ints.
        mapping: Token string to weight. ``None`` keys are ignored.

    Returns:
        A float32 tensor of length ``len(tokenizer)``. A token that maps to a
        single id receives its full weight; a token that maps to several ids
        has its weight split evenly across them. Ids that are not ints or fall
        outside ``[0, vocab_size)`` are skipped.
    """
    vocab_size = len(tokenizer)
    bias = torch.zeros(vocab_size, dtype=torch.float32)

    def _usable(token_id) -> bool:
        # bool is an int subclass but never a real token id.
        return (
            isinstance(token_id, int)
            and not isinstance(token_id, bool)
            and 0 <= token_id < vocab_size
        )

    for tok, weight in mapping.items():
        if tok is None:
            continue
        tid = tokenizer.convert_tokens_to_ids(tok)
        if isinstance(tid, list):
            share = float(weight) / max(1, len(tid))
            for t in tid:
                if _usable(t):
                    bias[t] += share
        elif _usable(tid):
            # The original tested the loop variable `t` here, which is undefined
            # (or stale) on this branch; the id under test is `tid`.
            bias[tid] += float(weight)
    return bias
class RoseGuidedLogits(torch.nn.Module):
    """Logits processor that adds a fixed, scaled per-token bias to the scores.

    Args:
        bias_vec: 1-D bias tensor indexed by token id.
        alpha: Multiplier applied to the bias before it is added.
    """

    def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
        super().__init__()
        self.bias_vec = bias_vec
        self.alpha = float(alpha)

    def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Return ``scores`` with ``alpha * bias`` added over the shared vocab range.

        The bias is built from the tokenizer's vocabulary size, which can differ
        from the model's (often padded) logits dimension. Only the overlapping
        prefix is biased, so a size mismatch no longer raises a broadcast error.
        The input tensor is not modified and the result keeps the scores' dtype.
        """
        bias = self.bias_vec.to(device=scores.device, dtype=scores.dtype)
        shared = min(scores.shape[-1], bias.shape[-1])
        biased = scores.clone()
        biased[..., :shared] += self.alpha * bias[:shared]
        return biased
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the newest token is one of ``stop_ids``.

    Only the first sequence in the batch is inspected.
    """

    def __init__(self, stop_ids: List[int]):
        # None or empty input yields an empty set, i.e. the criterion never fires.
        self.stop_ids = {int(s) for s in (stop_ids or [])}

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        newest_token = int(input_ids[0, -1])
        return newest_token in self.stop_ids
 @spaces.GPU(duration=120)
+def zerogpu_generate(full_prompt,
+                    gen_kwargs: Dict[str, Any],
+                    rose_map: Optional[Dict[str, float]],
+                    rose_alpha: float,
+                    rose_score: Optional[float],
+                    seed: Optional[int]) -> Dict[str, str]:
+    """Run inference on GPU and return parsed channels."""
     try:
+        if seed is not None:
+            torch.manual_seed(int(seed))
+        # Load model
+        model = _load_model_on("auto")
+        # Setup logits processor for Rose guidance
+        logits_processor = None
+        if rose_map:
+            bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
+            eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
+            logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
+        # Tokenize / prepare inputs
+        device = next(model.parameters()).device
+        if HARMONY_AVAILABLE and not isinstance(full_prompt, str):
+            # Accept list/tuple or any iterable of ints from openai_harmony
+            try:
+                token_list = list(full_prompt)
+            except TypeError:
+                token_list = list(getattr(full_prompt, "ids", getattr(full_prompt, "token_ids", [])))
+            if not token_list:
+                raise ValueError("Harmony prompt produced no tokens")
+            input_ids = torch.tensor([token_list], dtype=torch.long, device=device)
+            attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)
+            inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
+            prompt_len = input_ids.shape[1]
+        else:
+            enc = tokenizer(full_prompt, return_tensors="pt")
+            inputs = {k: v.to(device) for k, v in enc.items()}
+            prompt_len = int(inputs["input_ids"].shape[1])
+            if "attention_mask" not in inputs:
+                inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
+        # Prepare stopping
+        sc = None
+        if HARMONY_AVAILABLE and HARMONY_STOP_IDS:
+            sc = StoppingCriteriaList([StopOnTokens(HARMONY_STOP_IDS)])
+        # Generate
+        # Disallow degenerate header loops
+        bad_words_ids = None
+        try:
+            _B = []
+            for s in ("assistantassistant", "assistant", "<|assistant|>"):
+                ids = tokenizer.encode(s, add_special_tokens=False)
+                if ids:
+                    _B.append(ids)
+            bad_words_ids = _B if _B else None
+        except Exception:
+            pass
+        out_ids = model.generate(
+            **inputs,
+            do_sample=bool(gen_kwargs.get("do_sample", True)),
+            temperature=float(gen_kwargs.get("temperature", 0.7)),
+            top_p=float(gen_kwargs.get("top_p", 0.9)),
+            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
+            max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
+            pad_token_id=model.config.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            bad_words_ids=bad_words_ids,
+            logits_processor=logits_processor,
+            repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.2)),
+            no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 8)),
+            stopping_criteria=sc,
         )
+        # Extract generated tokens only
+        out_list = out_ids[0].tolist()
+        gen_ids = out_list[prompt_len:]
+        # Truncate at first Harmony stop token if present
+        if HARMONY_AVAILABLE:
+            for sid in HARMONY_STOP_IDS:
+                if sid in gen_ids:
+                    gen_ids = gen_ids[:gen_ids.index(sid)]
+                    break
+        # Parse response with Harmony
+        if HARMONY_AVAILABLE:
+            try:
+                channels = parse_harmony_response(gen_ids)
+            except Exception:
+                # Fallback to text parsing if Harmony parser fails
+                decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+                channels = {
+                    "final": extract_final_channel_fallback(decoded),
+                    "raw": decoded
+                }
+        else:
+            # Fallback decode + channels
+            decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+            channels = {
+                "final": extract_final_channel_fallback(decoded),
+                "raw": decoded
+            }
+        return channels
+    except Exception as e:
+        return {"final": f"[Error] {type(e).__name__}: {str(e)}", "raw": str(e)}
+    finally:
+        # Cleanup
+        try:
+            del model
+        except:
+            pass
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+# -----------------------
+# GPU Debug: Harmony Inspector
+# -----------------------
+@spaces.GPU(duration=120)
+def zerogpu_generate_debug(full_prompt, gen_kwargs: Dict[str, Any]) -> Dict[str, Any]:
+    """Minimal GPU path to run a single prompt and return Harmony-parsed output
+    along with short token previews for debugging. Does not use Rose for clarity."""
+    model = None
+    try:
+        model = _load_model_on("auto")
+        device = next(model.parameters()).device
+        # Prepare inputs (tokens if Harmony renderer used, else string -> encode)
+        if HARMONY_AVAILABLE and not isinstance(full_prompt, str):
+            token_list = list(full_prompt)
+            if not token_list:
+                raise ValueError("Harmony prompt produced no tokens")
+            input_ids = torch.tensor([token_list], dtype=torch.long, device=device)
+            attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)
+            inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
+            prompt_len = input_ids.shape[1]
+        else:
+            enc = tokenizer(full_prompt, return_tensors="pt")
+            inputs = {k: v.to(device) for k, v in enc.items()}
+            if "attention_mask" not in inputs:
+                inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
+            prompt_len = int(inputs["input_ids"].shape[1])
+        # Harmony stop via stopping criteria
+        sc = StoppingCriteriaList([StopOnTokens(HARMONY_STOP_IDS)]) if (HARMONY_AVAILABLE and HARMONY_STOP_IDS) else None
+        out_ids = model.generate(
+            **inputs,
+            do_sample=bool(gen_kwargs.get("do_sample", True)),
+            temperature=float(gen_kwargs.get("temperature", 0.7)),
+            top_p=float(gen_kwargs.get("top_p", 0.9)),
+            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
+            max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
+            pad_token_id=model.config.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            bad_words_ids=bad_words_ids,
+            stopping_criteria=sc,
+            repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.15)),
+            no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
         )
+        out_list = out_ids[0].tolist()
+        gen_ids = out_list[prompt_len:]
+        # Truncate at first Harmony stop token if present
+        if HARMONY_AVAILABLE and HARMONY_STOP_IDS:
+            for sid in HARMONY_STOP_IDS:
+                if sid in gen_ids:
+                    gen_ids = gen_ids[:gen_ids.index(sid)]
+                    break
+        # Parse channels
+        if HARMONY_AVAILABLE:
+            try:
+                channels = parse_harmony_response(gen_ids)
+            except Exception:
+                decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+                channels = {"final": extract_final_channel_fallback(decoded), "raw": decoded}
+        else:
+            decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
+            channels = {"final": extract_final_channel_fallback(decoded), "raw": decoded}
+        # Small previews (avoid flooding logs/UI)
+        preview = {
+            "prompt_len": int(prompt_len),
+            "stop_ids": list(HARMONY_STOP_IDS) if HARMONY_AVAILABLE else [],
+            "gen_len": int(len(gen_ids)),
+            "gen_ids_head": gen_ids[:48],
+            "decoded_head": tokenizer.decode(gen_ids[:256], skip_special_tokens=False),
+            "channels": channels,
+        }
+        return preview
     except Exception as e:
+        return {"error": f"{type(e).__name__}: {e}"}
     finally:
+        try:
+            del model
+        except Exception:
+            pass
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
+# -----------------------
+# Gradio handlers
+# -----------------------
def _history_to_messages(history) -> List[Dict[str, str]]:
    """Normalise Gradio chat history into ``{"role", "content"}`` dicts.

    Handles both the messages format (a list of role/content dicts, which is
    what ``ChatInterface(type="messages")`` supplies) and the legacy format
    (a list of ``[user, assistant]`` pairs). Empty turns are skipped.
    """
    out: List[Dict[str, str]] = []
    for turn in history or []:
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role in ("user", "assistant") and content:
                out.append({"role": role, "content": str(content)})
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            user_msg, assistant_msg = turn[0], turn[1]
            if user_msg:
                out.append({"role": "user", "content": str(user_msg)})
            if assistant_msg:
                out.append({"role": "assistant", "content": str(assistant_msg)})
    return out


def _parse_rose_map(rose_tokens: str, rose_json: str) -> Optional[Dict[str, float]]:
    """Build the Rose bias map from ``tok:weight, ...`` pairs and a JSON object.

    Malformed entries are ignored; JSON entries override pair entries with the
    same key. Returns None when no usable weight was found.
    """
    weights: Dict[str, float] = {}
    for pair in (rose_tokens or "").split(","):
        key, sep, raw = pair.strip().partition(":")
        if not sep:
            continue
        try:
            weights[key.strip()] = float(raw)
        except ValueError:
            continue
    if rose_json:
        try:
            parsed = json.loads(rose_json)
        except (TypeError, ValueError):
            parsed = None
        if isinstance(parsed, dict):
            for key, raw in parsed.items():
                try:
                    weights[str(key)] = float(raw)
                except (TypeError, ValueError, OverflowError):
                    continue
    return weights or None


def generate_response(message: str, history: List[List[str]], system_prompt: str,
                    temperature: float, top_p: float, top_k: int, max_new_tokens: int,
                    do_sample: bool, seed: Optional[int],
                    rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
                    rose_tokens: str, rose_json: str,
                    show_thinking: bool = False,
                    reasoning_effort: str = "high") -> str:
    """
    Generate response with proper CoT handling using Harmony format.

    Args:
        message: The new user message.
        history: Prior turns, either role/content dicts or ``[user, assistant]`` pairs.
        system_prompt: Developer/system instructions (SYSTEM_DEF when empty).
        temperature, top_p, top_k, max_new_tokens, do_sample, seed: Generation settings.
        rose_enable, rose_alpha, rose_score, rose_tokens, rose_json: Rose bias settings.
        show_thinking: When true, prepend all non-final channels to the reply.
        reasoning_effort: "low", "medium" or "high".

    Returns:
        The reply text. Any failure is returned as an ``[Error] ...`` string
        rather than raised.
    """
    try:
        messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
        messages.extend(_history_to_messages(history))
        messages.append({"role": "user", "content": str(message)})

        # Create Harmony-formatted prompt
        if HARMONY_AVAILABLE:
            prompt = create_harmony_prompt(messages, reasoning_effort)  # returns token IDs
        else:
            # Fallback to tokenizer template (string)
            prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

        rose_map = _parse_rose_map(rose_tokens, rose_json) if rose_enable else None

        channels = zerogpu_generate(
            prompt,
            {
                "do_sample": bool(do_sample),
                "temperature": float(temperature),
                "top_p": float(top_p),
                # Guard against None before comparing; 0 or less disables top-k.
                "top_k": int(top_k) if top_k and int(top_k) > 0 else None,
                "max_new_tokens": int(max_new_tokens),
            },
            rose_map,
            float(rose_alpha),
            float(rose_score) if rose_score is not None else None,
            int(seed) if seed is not None else None,
        )

        final_text = channels.get("final", "No final response generated")
        if not show_thinking:
            return final_text

        # Show all channels
        sections = ["## Chain of Thought:\n\n"]
        for channel, content in channels.items():
            if channel != "final" and content:
                # str() keeps formatting safe even for unexpected non-string keys.
                sections.append(f"### {str(channel).capitalize()} Channel:\n{content}\n\n")
        sections.append(f"### Final Response:\n{final_text}")
        return "".join(sections)
    except Exception as e:
        return f"[Error] {type(e).__name__}: {str(e)}"
+# -----------------------
+# Extra handler: Harmony Inspector wrapper
+# -----------------------
def harmony_inspect_handler(user_prompt: str, system_prompt: str, reasoning_effort: str):
    """Run one prompt through the debug GPU path and return its preview dict.

    Empty inputs fall back to SYSTEM_DEF and a fixed arithmetic question.
    Failures are returned as ``{"error": ...}`` instead of being raised.
    """
    try:
        system_turn = {"role": "system", "content": system_prompt or SYSTEM_DEF}
        user_turn = {"role": "user", "content": user_prompt or "What is 2+2?"}
        prompt = create_harmony_prompt([system_turn, user_turn], reasoning_effort)
        # Fixed sampling settings keep inspector runs comparable.
        debug_settings = {
            "do_sample": True,
            "temperature": 0.7,
            "top_p": 0.9,
            "top_k": 0,
            "max_new_tokens": MAX_DEF,
        }
        return zerogpu_generate_debug(prompt, debug_settings)
    except Exception as e:
        return {"error": f"{type(e).__name__}: {e}"}
+# -----------------------
+# UI
+# -----------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # Mirel – Harmony Chain-of-Thought Inference
+        OSS-20B model using Harmony format with thinking channels.
+        The model thinks through problems in internal channels before providing a final response.
+        **Note:** Install `openai-harmony` for full Harmony support: `pip install openai-harmony`
+        """
+    )
+    with gr.Row():
+        system_prompt = gr.Textbox(
+            label="System Prompt",
+            value=SYSTEM_DEF,
+            lines=2
+        )
+    with gr.Accordion("Generation Settings", open=False):
+        with gr.Row():
+            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
+            top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
+            top_k = gr.Slider(0, 200, value=0, step=1, label="Top-k (0=disabled)")
+        with gr.Row():
+            max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
+            do_sample = gr.Checkbox(value=True, label="Do sample")
+            seed = gr.Number(value=None, label="Seed (optional)", precision=0)
+        with gr.Row():
+            reasoning_effort = gr.Radio(
+                choices=["low", "medium", "high"],
+                value="high",
+                label="Reasoning Effort",
+                info="How much thinking the model should do"
+            )
+            show_thinking = gr.Checkbox(
+                value=False,
+                label="Show thinking channels",
+                info="Display all internal reasoning channels"
+            )
+    with gr.Accordion("Rose Guidance (Optional)", open=False):
+        gr.Markdown("Fine-tune generation with token biases")
+        with gr.Row():
+            rose_enable = gr.Checkbox(value=False, label="Enable Rose bias")
+            rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
+            rose_score = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Score multiplier")
+        rose_tokens = gr.Textbox(
+            label="Token:weight pairs",
+            placeholder="example:1.5, test:-0.5",
+            value=""
+        )
+        rose_json = gr.Textbox(
+            label="JSON weights",
+            placeholder='{"token": 1.0, "another": -0.5}',
+            value=""
+        )
+    # --- Harmony Inspector UI ---
+    with gr.Accordion("Harmony Inspector", open=False):
+        debug_prompt = gr.Textbox(label="Debug prompt", value="What is 2+2? Reply with just the number.")
+        run_debug = gr.Button("Run Harmony Inspect")
+        debug_out = gr.JSON(label="Parsed Harmony output", value={})
+        run_debug.click(harmony_inspect_handler, inputs=[debug_prompt, system_prompt, reasoning_effort], outputs=[debug_out])
+    # Chat interface - using only valid parameters
+    chat = gr.ChatInterface(
+        fn=generate_response,
+        type="messages",
+        additional_inputs=[
+            system_prompt, temperature, top_p, top_k, max_new,
+            do_sample, seed, rose_enable, rose_alpha, rose_score,
+            rose_tokens, rose_json, show_thinking, reasoning_effort
+        ],
+        title="Chat with Mirel",
+        description="A chain-of-thought model using Harmony format",
+        examples=[
+            ["Hello! Can you introduce yourself?"],
+            ["What is the capital of France?"],
+            ["Explain quantum computing in simple terms"],
+            ["Solve: If a train travels 120 miles in 2 hours, what is its average speed?"],
+        ],
+        cache_examples=False,
+    )
+    gr.Markdown(
+        """
+        ---
+        ### Configuration:
+        - **Model**: Set `MODEL_ID` env var (default: openai/gpt-oss-20b)
+        - **Adapter**: Set `ADAPTER_ID` and optionally `ADAPTER_SUBFOLDER`
+        - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
+        - **Harmony**: Install with `pip install openai-harmony` for proper channel support
+        The model uses Harmony format with thinking channels (`thinking`, `analysis`, `final`).
+        """
+    )
if __name__ == "__main__":
    # ZeroGPU mode uses a shorter request queue than the regular mode.
    queue_limit = 8 if ZEROGPU else 32
    queued_app = demo.queue(max_size=queue_limit)
    queued_app.launch(server_name="0.0.0.0", server_port=7860, share=False)