Spaces: Running on Zero
AbstractPhil committed
Commit 5c9afc5 · 1 Parent(s): a2f6c58
claude helps again
app.py CHANGED
@@ -1,17 +1,22 @@
 """
 Mirel Harmony Inference – HF Space (Gradio)
-ZeroGPU-ready, Harmony formatting,
-
+ZeroGPU-ready, Harmony formatting, MX format support for GPT-OSS-20B
+Proper LoRA adapter loading and conversion for MX compatibility
 Single file: app.py
 """
 from __future__ import annotations
-import os, gc, json, threading, torch
+import os, gc, json, threading, torch, warnings
 from dataclasses import dataclass
-from typing import List, Dict, Optional, Any
+from typing import List, Dict, Optional, Any, Union
 from datetime import datetime
 import gradio as gr
 import spaces  # required for ZeroGPU
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
+import numpy as np
+
+# Suppress warnings about MX format
+warnings.filterwarnings("ignore", message=".*microscaling.*")
+warnings.filterwarnings("ignore", message=".*mx.*")
 
 # Import Harmony components
 try:
@@ -34,22 +39,23 @@ except ImportError:
 # -----------------------
 # Config & runtime modes
 # -----------------------
-
-
+# MX format uses special dtypes - we need to handle this properly
 MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
 ADAPTER_ID = os.getenv("ADAPTER_ID") or None
 ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER") or None
 ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
-DTYPE = DTYPE_MAP.get(os.getenv("DTYPE", "bf16").lower(), torch.bfloat16)
 SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
 MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "256"))
 ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
-
+
+# For GPT-OSS models, we need specific handling
+IS_GPT_OSS = "gpt-oss" in MODEL_ID.lower()
+USE_MX_FORMAT = os.getenv("USE_MX_FORMAT", "1" if IS_GPT_OSS else "0") == "1"
 
 # Harmony channels for CoT
 REQUIRED_CHANNELS = ["analysis", "commentary", "final"]
 
-# HF Auth
+# HF Auth
 HF_TOKEN: Optional[str] = (
     os.getenv("HF_TOKEN")
     or os.getenv("HUGGING_FACE_HUB_TOKEN")
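For reference, a minimal sketch of how these environment variables could be overridden for a Space run; every value below is an illustrative placeholder, not something this commit sets:

# Hypothetical Space configuration; values are examples only.
import os
os.environ["MODEL_ID"] = "openai/gpt-oss-20b"   # base model (same as the default above)
os.environ["ADAPTER_ID"] = "user/mirel-lora"    # hypothetical LoRA adapter repo id
os.environ["USE_MX_FORMAT"] = "1"               # defaults to "1" when MODEL_ID contains "gpt-oss"
os.environ["ATTN_IMPL"] = "eager"
os.environ["MAX_NEW_TOKENS"] = "256"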
@@ -96,65 +102,231 @@ except Exception as e:
     raise
 
 # -----------------------
-#
+# PEFT and MX Format Support
 # -----------------------
 try:
-    from peft import PeftModel
+    from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
     _HAS_PEFT = True
 except Exception:
     _HAS_PEFT = False
+    print("[Warning] PEFT not available. Install with: pip install peft")
+
+# Try to import microscaling support if available
+try:
+    import msamp
+    _HAS_MSAMP = True
+    print("[Info] Microsoft AMP (msamp) available for MX format support")
+except ImportError:
+    _HAS_MSAMP = False
+    print("[Info] msamp not available - using fallback MX handling")
+
+# -----------------------
+# MX Format Conversion
+# -----------------------
+def convert_fp32_lora_to_mx_compatible(lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    """
+    Convert fp32 LoRA weights to be compatible with MX format base model.
+    MX models expect specific dtype handling.
+    """
+    converted = {}
+
+    for key, tensor in lora_state_dict.items():
+        if tensor is None:
+            converted[key] = tensor
+            continue
+
+        # LoRA weights (lora_A, lora_B) need special handling
+        if 'lora_' in key:
+            # For MX compatibility, we keep weights in fp32 but ensure proper scaling
+            # MX format internally handles quantization, we just need clean fp32 inputs
+            if tensor.dtype != torch.float32:
+                tensor = tensor.to(torch.float32)
+
+            # Ensure weights are in reasonable range for MX quantization
+            # MX format works best with weights in [-1, 1] range
+            if 'lora_A' in key:
+                # Input projection - initialize with small values
+                std = 1.0 / torch.sqrt(torch.tensor(tensor.shape[1], dtype=torch.float32))
+                if tensor.std() > std * 10:  # If weights are too large
+                    print(f"[MX Convert] Scaling down {key} from std={tensor.std():.4f} to {std:.4f}")
+                    tensor = tensor * (std / tensor.std())
+            elif 'lora_B' in key:
+                # Output projection - should be near zero initially
+                if tensor.abs().max() > 0.1:
+                    print(f"[MX Convert] Scaling down {key} max={tensor.abs().max():.4f}")
+                    tensor = tensor * 0.01
+
+            converted[key] = tensor
+        else:
+            # Non-LoRA weights (like embeddings) stay as-is
+            converted[key] = tensor
+
+    return converted
 
+def prepare_model_for_mx_lora(model, adapter_path: str):
+    """
+    Prepare and attach LoRA adapter to MX format model.
+    Handles the special requirements of GPT-OSS MX models.
+    """
+    if not _HAS_PEFT:
+        raise RuntimeError("PEFT is required for LoRA adapters. Install with: pip install peft")
+
+    print(f"[LoRA] Loading adapter from {adapter_path}")
+
+    # Load the LoRA config
+    peft_config = PeftConfig.from_pretrained(adapter_path, token=HF_TOKEN)
+
+    # Load the LoRA weights
+    from safetensors.torch import load_file
+    import os.path as osp
+
+    adapter_weights_path = osp.join(adapter_path, "adapter_model.safetensors")
+    if not osp.exists(adapter_weights_path):
+        adapter_weights_path = osp.join(adapter_path, "adapter_model.bin")
+        if osp.exists(adapter_weights_path):
+            adapter_weights = torch.load(adapter_weights_path, map_location="cpu")
+        else:
+            raise FileNotFoundError(f"No adapter weights found at {adapter_path}")
+    else:
+        adapter_weights = load_file(adapter_weights_path)
+
+    # Convert weights for MX compatibility
+    print("[LoRA] Converting fp32 weights for MX format compatibility...")
+    adapter_weights = convert_fp32_lora_to_mx_compatible(adapter_weights)
+
+    # Create PEFT model with special handling for MX
+    print("[LoRA] Attaching LoRA to base model...")
+
+    # For MX models, we need to be careful about dtype
+    # The base model uses MX format internally, but the interface should be fp32
+    model = PeftModel.from_pretrained(
+        model,
+        adapter_path,
+        is_trainable=False,
+        token=HF_TOKEN,
+        # Don't specify torch_dtype here - let it match the base model
+    )
+
+    # Manually update the adapter weights with our converted versions
+    model.load_state_dict(adapter_weights, strict=False)
+
+    print("[LoRA] Successfully attached LoRA adapter with MX compatibility")
+    return model
 
+# -----------------------
+# Model loading with MX support
+# -----------------------
 def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
+    """Build kwargs for model loading with MX format support."""
     kw: Dict[str, Any] = dict(
-        torch_dtype=DTYPE,
         device_map=device_map,
-        attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
         trust_remote_code=True,
         low_cpu_mem_usage=True,
         token=HF_TOKEN,
     )
-
-
-
-
-
-
-
-
+
+    if IS_GPT_OSS and USE_MX_FORMAT:
+        # GPT-OSS models use MX format
+        # Don't specify torch_dtype - let the model use its native MX format
+        print("[Model] Using MX format for GPT-OSS model")
+        kw.update({
+            "attn_implementation": ATTN_IMPL if device_map != "cpu" else "eager",
+            # MX models handle their own dtype internally
+            # Don't force a dtype here
+        })
+    else:
+        # Non-MX models
+        kw.update({
+            "torch_dtype": torch.float16,  # Use fp16 for non-MX models
+            "attn_implementation": ATTN_IMPL if device_map != "cpu" else "eager",
+        })
+
     return kw
 
-
 def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
+    """Load model with proper MX format handling."""
     print(f"[Model] Loading base model from {MODEL_ID}...")
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_build_model_kwargs(device_map))
 
-    #
-
-
-    #
-
-
-
-
+    # Load config first to check for MX format
+    config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
+
+    # Check if this is an MX model
+    is_mx_model = (
+        IS_GPT_OSS or
+        hasattr(config, 'quantization_config') and 'mx' in str(config.quantization_config).lower() or
+        hasattr(config, 'torch_dtype') and 'mx' in str(config.torch_dtype).lower()
+    )
+
+    if is_mx_model:
+        print("[Model] Detected MX format model - using special loading")
+
+        # For MX models, we need special handling
+        # The model internally uses MX quantization
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            config=config,
+            trust_remote_code=True,
+            device_map=device_map,
+            low_cpu_mem_usage=True,
+            token=HF_TOKEN,
+            # Let the model handle its own dtype
+            attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
+        )
+
+        # Verify the model loaded correctly
+        print(f"[Model] Model dtype: {next(model.parameters()).dtype}")
+        print(f"[Model] Model device: {next(model.parameters()).device}")
+
+    else:
+        # Standard model loading
+        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_build_model_kwargs(device_map))
+
+    # Load and attach LoRA adapter if specified
+    if ADAPTER_ID:
+        try:
+            if is_mx_model:
+                # Use special MX-compatible LoRA loading
+                model = prepare_model_for_mx_lora(model, ADAPTER_ID)
+            else:
+                # Standard PEFT loading for non-MX models
+                if not _HAS_PEFT:
+                    raise RuntimeError("PEFT is required when ADAPTER_ID is set.")
+                print(f"[Model] Loading adapter from {ADAPTER_ID} (standard mode)...")
+                model = PeftModel.from_pretrained(
+                    model,
+                    ADAPTER_ID,
+                    is_trainable=False,
+                    token=HF_TOKEN
+                )
+
+            print("[Model] Successfully loaded with LoRA adapter")
+
+            # Optionally merge adapter for better performance
+            merge_adapter = os.getenv("MERGE_ADAPTER", "0") == "1"
+            if merge_adapter and hasattr(model, 'merge_and_unload'):
+                print("[Model] Merging adapter into base model...")
+                model = model.merge_and_unload()
+                print("[Model] Adapter merged successfully")
+
+        except Exception as e:
+            print(f"[Error] Failed to load adapter: {e}")
+            print("[Warning] Continuing with base model only")
 
     model.eval()
-
+
+    # Ensure proper config
     if getattr(model.config, "pad_token_id", None) is None:
         model.config.pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id
     model.config.use_cache = True
-
+
+    print(f"[Model] Model loaded successfully - Type: {'MX Format' if is_mx_model else 'Standard'}")
     return model
 
 # -----------------------
 # Harmony formatting
 # -----------------------
-
 def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high") -> Any:
-    """Build a Harmony-formatted prompt.
-    rendered by `openai_harmony` (authoritative). Otherwise fall back to the
-    tokenizer's chat template and return a string.
-    """
+    """Build a Harmony-formatted prompt."""
     if HARMONY_AVAILABLE and harmony_encoding is not None:
         effort_map = {"low": ReasoningEffort.LOW, "medium": ReasoningEffort.MEDIUM, "high": ReasoningEffort.HIGH}
         effort = effort_map.get(str(reasoning_effort).lower(), ReasoningEffort.HIGH)
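A quick way to sanity-check the conversion helper added above is to run it on a dummy LoRA state dict. The key names and shapes below are made up for illustration, and the snippet assumes convert_fp32_lora_to_mx_compatible from this file is in scope:

# Illustrative only: oversized dummy tensors so both rescaling branches trigger.
import torch
dummy = {
    "layers.0.attn.q_proj.lora_A.weight": torch.randn(16, 2880) * 0.5,
    "layers.0.attn.q_proj.lora_B.weight": torch.randn(2880, 16) * 0.5,
}
converted = convert_fp32_lora_to_mx_compatible(dummy)
for key, tensor in converted.items():
    print(key, tensor.dtype, round(float(tensor.std()), 4))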
@@ -168,7 +340,6 @@ def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str
             .with_required_channels(REQUIRED_CHANNELS)
         )
 
-        # Use first system message as developer instructions if present, else SYSTEM_DEF
         sys_text = SYSTEM_DEF
         rest: List[Dict[str, str]] = messages or []
         if rest and rest[0].get("role") == "system":
@@ -191,7 +362,7 @@ def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str
         convo = Conversation.from_messages(harmony_messages)
         return harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
 
-    # Fallback: tokenizer chat template
+    # Fallback: tokenizer chat template
     if not messages or messages[0].get("role") != "system":
         messages = [{"role": "system", "content": SYSTEM_DEF}] + (messages or [])
     return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
@@ -199,14 +370,11 @@ def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str
 def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
     """Parse response tokens using Harmony format to extract channels."""
     if not HARMONY_AVAILABLE:
-        # Fallback: just decode and extract final channel manually
         text = tokenizer.decode(tokens, skip_special_tokens=False)
         return {"final": extract_final_channel_fallback(text), "raw": text}
 
-    # Parse messages from completion tokens
     parsed_messages = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)
 
-    # Extract content by channel
     channels = {}
     for msg in parsed_messages:
         channel = msg.channel if hasattr(msg, 'channel') else "final"
@@ -214,16 +382,13 @@ def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
             channels[channel] = ""
         channels[channel] += "".join([getattr(part, "text", str(part)) for part in (msg.content if isinstance(msg.content, list) else [msg.content])])
 
-    # Ensure we have a final channel
     if "final" not in channels:
         channels["final"] = " ".join(channels.values())
 
     return channels
 
 def extract_final_channel_fallback(text: str) -> str:
-    """
-    Works even if parsing fails or the model emits extra headers.
-    """
+    """Extract the <final> channel from decoded Harmony text."""
     try:
         chunks: Dict[str, str] = {}
         pieces = text.split("<|channel|>")
@@ -233,7 +398,6 @@ def extract_final_channel_fallback(text: str) -> str:
                 continue
             ch = seg[:name_end].strip()
             body_start = name_end + len("<|message|>")
-            # end at next channel/end/return marker
             next_pos = len(seg)
             for delim in ("<|channel|>", "<|end|>", "<|return|>"):
                 p = seg.find(delim, body_start)
@@ -244,7 +408,6 @@ def extract_final_channel_fallback(text: str) -> str:
         final_txt = (chunks.get("final", "").strip())
         if final_txt:
             return final_txt
-        # Fallback: everything after last final marker up to a terminator
         if "<|channel|>final<|message|>" in text:
             tail = text.split("<|channel|>final<|message|>")[-1]
             for delim in ("<|return|>", "<|end|>", "<|channel|>"):
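For context, the fallback parser above targets raw Harmony-style text such as the hypothetical completion below (assuming extract_final_channel_fallback from this file is in scope; the sample string is illustrative):

sample = (
    "<|channel|>analysis<|message|>Consider the question step by step.<|end|>"
    "<|channel|>final<|message|>Paris is the capital of France.<|return|>"
)
print(extract_final_channel_fallback(sample))
# Should print the final-channel text: "Paris is the capital of France."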
@@ -260,7 +423,6 @@ def extract_final_channel_fallback(text: str) -> str:
 # -----------------------
 # Rose guidance
 # -----------------------
-
 def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
     """Create vocab bias from {token: weight}."""
     vocab_size = len(tokenizer)
@@ -286,6 +448,9 @@ class RoseGuidedLogits(torch.nn.Module):
     def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         return scores + self.alpha * self.bias_vec.to(scores.device)
 
+# -----------------------
+# Generation
+# -----------------------
 @spaces.GPU(duration=120)
 def zerogpu_generate(full_prompt,
                      gen_kwargs: Dict[str, Any],
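RoseGuidedLogits simply adds a scaled vocabulary bias to the raw scores at every decoding step. A small self-contained re-sketch of that idea (illustrative names, not the app's exact classes):

import torch

class VocabBias:
    """Minimal stand-in for RoseGuidedLogits: scores + alpha * bias."""
    def __init__(self, bias_vec: torch.Tensor, alpha: float):
        self.bias_vec = bias_vec   # shape (vocab_size,)
        self.alpha = alpha         # overall guidance strength
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        return scores + self.alpha * self.bias_vec.to(scores.device)

# The app builds its processor the same way and, presumably, hands it to
# model.generate(..., logits_processor=[RoseGuidedLogits(bias, eff_alpha)]).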
@@ -293,12 +458,12 @@ def zerogpu_generate(full_prompt,
                      rose_alpha: float,
                      rose_score: Optional[float],
                      seed: Optional[int]) -> Dict[str, str]:
-    """Run inference on GPU
+    """Run inference on GPU with MX format support."""
     try:
         if seed is not None:
             torch.manual_seed(int(seed))
 
-        # Load model
+        # Load model with MX support
         model = _load_model_on("auto")
 
         # Setup logits processor for Rose guidance
@@ -308,7 +473,7 @@ def zerogpu_generate(full_prompt,
             eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
             logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
 
-        #
+        # Prepare inputs
         device = next(model.parameters()).device
         if HARMONY_AVAILABLE and isinstance(full_prompt, list):
             input_ids = torch.tensor([full_prompt], dtype=torch.long, device=device)
@@ -319,11 +484,10 @@ def zerogpu_generate(full_prompt,
             enc = tokenizer(full_prompt, return_tensors="pt")
             inputs = enc.to(device)
             prompt_len = int(inputs["input_ids"].shape[1])
-        # Guarantee attention_mask exists; avoids pad==eos ambiguity warnings
         if "attention_mask" not in inputs:
             inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
+
         # Generate
-        # Build EOS list: use ONLY Harmony assistant-action stops (per OpenAI docs)
         eos_ids = HARMONY_STOP_IDS if HARMONY_AVAILABLE else tokenizer.eos_token_id
 
         out_ids = model.generate(
@@ -341,29 +505,28 @@ def zerogpu_generate(full_prompt,
             min_new_tokens=1,
         )
 
-        # Extract generated tokens
+        # Extract generated tokens
         out_list = out_ids[0].tolist()
         gen_ids = out_list[prompt_len:]
-
+
+        # Truncate at stop tokens
         if HARMONY_AVAILABLE:
             for sid in HARMONY_STOP_IDS:
                 if sid in gen_ids:
                     gen_ids = gen_ids[:gen_ids.index(sid)]
                     break
 
-        # Parse response
+        # Parse response
         if HARMONY_AVAILABLE:
             try:
                 channels = parse_harmony_response(gen_ids)
             except Exception:
-                # Fallback to text parsing if Harmony parser fails
                 decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
                 channels = {
                     "final": extract_final_channel_fallback(decoded),
                     "raw": decoded
                 }
         else:
-            # Fallback decode + channels
             decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
             channels = {
                 "final": extract_final_channel_fallback(decoded),
@@ -373,7 +536,10 @@ def zerogpu_generate(full_prompt,
         return channels
 
     except Exception as e:
-
+        import traceback
+        error_trace = traceback.format_exc()
+        print(f"[Error] Generation failed:\n{error_trace}")
+        return {"final": f"[Error] {type(e).__name__}: {str(e)}", "raw": error_trace}
     finally:
         # Cleanup
         try:
@@ -387,7 +553,6 @@ def zerogpu_generate(full_prompt,
 # -----------------------
 # Gradio handlers
 # -----------------------
-
 def generate_response(message: str, history: List[List[str]], system_prompt: str,
                       temperature: float, top_p: float, top_k: int, max_new_tokens: int,
                       do_sample: bool, seed: Optional[int],
@@ -395,14 +560,11 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
                       rose_tokens: str, rose_json: str,
                       show_thinking: bool = False,
                       reasoning_effort: str = "high") -> str:
-    """
-    Generate response with proper CoT handling using Harmony format.
-    """
+    """Generate response with CoT handling."""
     try:
-        # Build
+        # Build messages
        messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
 
-        # Add history
         if history:
             for turn in history:
                 if isinstance(turn, (list, tuple)) and len(turn) >= 2:
@@ -412,17 +574,15 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
                     if assistant_msg:
                         messages.append({"role": "assistant", "content": str(assistant_msg)})
 
-        # Add current message
         messages.append({"role": "user", "content": str(message)})
 
-        # Create
+        # Create prompt
         if HARMONY_AVAILABLE:
-            prompt = create_harmony_prompt(messages, reasoning_effort)
+            prompt = create_harmony_prompt(messages, reasoning_effort)
         else:
-            # Fallback to tokenizer template (string)
             prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
 
-        # Build Rose map
+        # Build Rose map
         rose_map: Optional[Dict[str, float]] = None
         if rose_enable:
             rose_map = {}
@@ -449,7 +609,7 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
             if not rose_map:
                 rose_map = None
 
-        # Generate
+        # Generate
         channels = zerogpu_generate(
             prompt,
             {
@@ -458,6 +618,8 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
                 "top_p": float(top_p),
                 "top_k": int(top_k) if top_k > 0 else None,
                 "max_new_tokens": int(max_new_tokens),
+                "repetition_penalty": 1.1,
+                "no_repeat_ngram_size": 6,
             },
             rose_map,
             float(rose_alpha),
@@ -467,7 +629,6 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
 
         # Format response
         if show_thinking:
-            # Show all channels
             response = "## Chain of Thought:\n\n"
             for channel, content in channels.items():
                 if channel != "final" and content:
@@ -475,24 +636,25 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
             response += f"### Final Response:\n{channels.get('final', 'No final response generated')}"
             return response
         else:
-            # Just show the final response
             return channels.get("final", "No final response generated")
 
     except Exception as e:
-
+        import traceback
+        return f"[Error] {type(e).__name__}: {str(e)}\n{traceback.format_exc()}"
 
 # -----------------------
 # UI
 # -----------------------
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
-        """
+        f"""
         # Mirel – Harmony Chain-of-Thought Inference
 
-
-
+        **Model**: {MODEL_ID} {'(MX Format)' if USE_MX_FORMAT else ''}
+        **Adapter**: {ADAPTER_ID or 'None'}
+        **Status**: {'✅ Harmony Available' if HARMONY_AVAILABLE else '⚠️ Harmony Not Installed'}
 
-
-        """
+        The model uses internal thinking channels before providing final responses.
+        """
     )
 
@@ -542,7 +704,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         value=""
     )
 
-    # Chat interface
+    # Chat interface
     chat = gr.ChatInterface(
         fn=generate_response,
         type="messages",
@@ -552,7 +714,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             rose_tokens, rose_json, show_thinking, reasoning_effort
         ],
         title="Chat with Mirel",
-        description="
+        description="Chain-of-thought model with MX format support",
         examples=[
             ["Hello! Can you introduce yourself?"],
             ["What is the capital of France?"],
@@ -566,12 +728,10 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         """
         ---
         ### Configuration:
-        - **
-        - **
+        - **MX Format**: Automatically detected for GPT-OSS models
+        - **LoRA Support**: fp32 LoRA adapters are converted for MX compatibility
+        - **Merge Adapter**: Set `MERGE_ADAPTER=1` to merge LoRA into base model
         - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
-        - **Harmony**: Install with `pip install openai-harmony` for proper channel support
-
-        The model uses Harmony format with thinking channels (`thinking`, `analysis`, `final`).
         """
     )
 
@@ -580,4 +740,4 @@ if __name__ == "__main__":
         server_name="0.0.0.0",
         server_port=7860,
         share=False
-    )
+    )