AbstractPhil committed on
Commit 6228595 · 1 Parent(s): 4ab6146
Files changed (4)
  1. app.py +478 -637
  2. install.sh +97 -0
  3. requirements.txt +19 -4
  4. setup.py +31 -0
app.py CHANGED
@@ -3,9 +3,54 @@ Mirel Harmony Inference – HF Space (Gradio)
3
  ZeroGPU-ready, Harmony formatting, MX format support for GPT-OSS-20B
4
  Proper LoRA adapter loading and conversion for MX compatibility
5
  Single file: app.py
6
  """
7
  from __future__ import annotations
8
- import os, gc, json, threading, torch, warnings
9
  from dataclasses import dataclass
10
  from typing import List, Dict, Optional, Any, Union
11
  from datetime import datetime
@@ -14,7 +59,7 @@ import spaces # required for ZeroGPU
14
  from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
15
  import numpy as np
16
 
17
- # Suppress warnings about MX format
18
  warnings.filterwarnings("ignore", message=".*microscaling.*")
19
  warnings.filterwarnings("ignore", message=".*mx.*")
20
 
@@ -32,770 +77,566 @@ try:
32
  ReasoningEffort
33
  )
34
  HARMONY_AVAILABLE = True
 
35
  except ImportError:
36
- print("[WARNING] openai_harmony not installed. Install with: pip install openai-harmony")
37
  HARMONY_AVAILABLE = False
38
 
39
- # -----------------------
40
- # Config & runtime modes
41
- # -----------------------
42
- # MX format uses special dtypes - we need to handle this properly
43
  MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
44
- ADAPTER_ID = os.getenv("ADAPTER_ID", "AbstractPhil/mirel-gpt-oss-20b") # Default to your adapter
45
- ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER", "checkpoints/checkpoint-516") # Default to the subfolder
46
  ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
47
- SYSTEM_DEF = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
48
- MAX_DEF = int(os.getenv("MAX_NEW_TOKENS", "256"))
49
- ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "0")) == "1"
 
50
 
51
- # For GPT-OSS models, we need specific handling
52
  IS_GPT_OSS = "gpt-oss" in MODEL_ID.lower()
53
- USE_MX_FORMAT = os.getenv("USE_MX_FORMAT", "1" if IS_GPT_OSS else "0") == "1"
54
 
55
- # Harmony channels for CoT
56
  REQUIRED_CHANNELS = ["analysis", "commentary", "final"]
57
 
58
- # HF Auth
59
- HF_TOKEN: Optional[str] = (
60
  os.getenv("HF_TOKEN")
61
  or os.getenv("HUGGING_FACE_HUB_TOKEN")
62
  or os.getenv("HUGGINGFACEHUB_API_TOKEN")
63
  or os.getenv("HF_ACCESS_TOKEN")
64
  )
65
 
66
- def _hf_login() -> None:
67
- """Login to HF Hub using common env secret names."""
68
  if HF_TOKEN:
69
  try:
70
  from huggingface_hub import login, whoami
71
  login(token=HF_TOKEN, add_to_git_credential=True)
72
  try:
73
- who = whoami(token=HF_TOKEN)
74
- print(f"[HF Auth] Logged in as: {who.get('name') or who.get('fullname') or who.get('id', 'unknown')}")
75
- except Exception:
76
- print("[HF Auth] Login successful but couldn't get user info")
77
  except Exception as e:
78
- print(f"[HF Auth] Login failed: {e}")
79
  else:
80
- print("[HF Auth] No token found in environment variables")
81
 
82
- # Login before loading any models
83
  _hf_login()
84
 
 
85
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
86
 
87
- # Load Harmony encoding if available
88
- if HARMONY_AVAILABLE:
89
- harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
90
- else:
91
- harmony_encoding = None
92
-
93
- # Stop tokens per Harmony spec: <|return|> (200002), <|call|> (200012)
94
- HARMONY_STOP_IDS = harmony_encoding.stop_tokens_for_assistant_actions() if HARMONY_AVAILABLE else []
95
-
96
- # Tokenizer is lightweight; load once
97
  try:
98
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
99
- print(f"[Model] Successfully loaded tokenizer from {MODEL_ID}")
100
  except Exception as e:
101
- print(f"[Model] Failed to load tokenizer: {e}")
102
  raise
103
 
104
- # -----------------------
105
- # PEFT and MX Format Support
106
- # -----------------------
107
- try:
108
- from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
109
- _HAS_PEFT = True
110
- except Exception:
111
- _HAS_PEFT = False
112
- print("[Warning] PEFT not available. Install with: pip install peft")
113
 
114
- # Try to import microscaling support if available
115
- try:
116
- import msamp
117
- _HAS_MSAMP = True
118
- print("[Info] Microsoft AMP (msamp) available for MX format support")
119
- except ImportError:
120
- _HAS_MSAMP = False
121
- print("[Info] msamp not available - using fallback MX handling")
122
-
123
- # -----------------------
124
- # MX Format Conversion
125
- # -----------------------
126
- def convert_fp32_lora_to_mx_compatible(lora_state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
127
- """
128
- Convert fp32 LoRA weights to be compatible with MX format base model.
129
- MX models expect specific dtype handling.
130
- """
131
- converted = {}
132
-
133
- for key, tensor in lora_state_dict.items():
134
- if tensor is None:
135
- converted[key] = tensor
136
- continue
137
-
138
- # LoRA weights (lora_A, lora_B) need special handling
139
- if 'lora_' in key:
140
- # For MX compatibility, we keep weights in fp32 but ensure proper scaling
141
- # MX format internally handles quantization, we just need clean fp32 inputs
142
- if tensor.dtype != torch.float32:
143
- tensor = tensor.to(torch.float32)
144
-
145
- # Ensure weights are in reasonable range for MX quantization
146
- # MX format works best with weights in [-1, 1] range
147
- if 'lora_A' in key:
148
- # Input projection - initialize with small values
149
- std = 1.0 / torch.sqrt(torch.tensor(tensor.shape[1], dtype=torch.float32))
150
- if tensor.std() > std * 10: # If weights are too large
151
- print(f"[MX Convert] Scaling down {key} from std={tensor.std():.4f} to {std:.4f}")
152
- tensor = tensor * (std / tensor.std())
153
- elif 'lora_B' in key:
154
- # Output projection - should be near zero initially
155
- if tensor.abs().max() > 0.1:
156
- print(f"[MX Convert] Scaling down {key} max={tensor.abs().max():.4f}")
157
- tensor = tensor * 0.01
158
-
159
- converted[key] = tensor
160
- else:
161
- # Non-LoRA weights (like embeddings) stay as-is
162
- converted[key] = tensor
163
-
164
- return converted
165
 
166
- def prepare_model_for_mx_lora(model, adapter_path: str, subfolder: Optional[str] = None):
167
- """
168
- Prepare and attach LoRA adapter to MX format model.
169
- Handles the special requirements of GPT-OSS MX models.
170
- """
171
- if not _HAS_PEFT:
172
- raise RuntimeError("PEFT is required for LoRA adapters. Install with: pip install peft")
173
 
174
- # Build the full path including subfolder
175
- full_adapter_path = adapter_path
176
- if subfolder:
177
- print(f"[LoRA] Loading adapter from {adapter_path} (subfolder: {subfolder})")
178
- else:
179
- print(f"[LoRA] Loading adapter from {adapter_path}")
180
-
181
- # Load the LoRA config with subfolder support
182
- peft_kwargs = {"token": HF_TOKEN}
183
- if subfolder:
184
- peft_kwargs["subfolder"] = subfolder
 
185
 
186
- peft_config = PeftConfig.from_pretrained(adapter_path, **peft_kwargs)
 
187
 
188
- # Load the LoRA weights - need to check in the right location
189
- from safetensors.torch import load_file
190
- import os.path as osp
191
- from huggingface_hub import hf_hub_download
 
 
 
 
192
 
193
- try:
194
- # Try to download from HF Hub with subfolder
195
- if subfolder:
196
- # Download the adapter weights file
197
- try:
198
- adapter_weights_path = hf_hub_download(
199
- repo_id=adapter_path,
200
- filename="adapter_model.safetensors",
201
- subfolder=subfolder,
202
- token=HF_TOKEN
203
- )
204
- adapter_weights = load_file(adapter_weights_path)
205
- print(f"[LoRA] Loaded safetensors weights from {subfolder}")
206
- except Exception:
207
- # Try .bin format
208
- adapter_weights_path = hf_hub_download(
209
- repo_id=adapter_path,
210
- filename="adapter_model.bin",
211
- subfolder=subfolder,
212
- token=HF_TOKEN
213
- )
214
- adapter_weights = torch.load(adapter_weights_path, map_location="cpu")
215
- print(f"[LoRA] Loaded bin weights from {subfolder}")
216
  else:
217
- # No subfolder - try local path first, then HF Hub
218
- local_safetensors = osp.join(adapter_path, "adapter_model.safetensors")
219
- local_bin = osp.join(adapter_path, "adapter_model.bin")
220
-
221
- if osp.exists(local_safetensors):
222
- adapter_weights = load_file(local_safetensors)
223
- print("[LoRA] Loaded local safetensors weights")
224
- elif osp.exists(local_bin):
225
- adapter_weights = torch.load(local_bin, map_location="cpu")
226
- print("[LoRA] Loaded local bin weights")
227
- else:
228
- # Try downloading from HF Hub
229
- try:
230
- adapter_weights_path = hf_hub_download(
231
- repo_id=adapter_path,
232
- filename="adapter_model.safetensors",
233
- token=HF_TOKEN
234
- )
235
- adapter_weights = load_file(adapter_weights_path)
236
- print("[LoRA] Downloaded safetensors weights from Hub")
237
- except Exception:
238
- adapter_weights_path = hf_hub_download(
239
- repo_id=adapter_path,
240
- filename="adapter_model.bin",
241
- token=HF_TOKEN
242
- )
243
- adapter_weights = torch.load(adapter_weights_path, map_location="cpu")
244
- print("[LoRA] Downloaded bin weights from Hub")
245
-
246
- except Exception as e:
247
- raise FileNotFoundError(f"Could not load adapter weights: {e}")
248
-
249
- # Convert weights for MX compatibility
250
- print("[LoRA] Converting fp32 weights for MX format compatibility...")
251
- adapter_weights = convert_fp32_lora_to_mx_compatible(adapter_weights)
252
 
253
- # Create PEFT model with special handling for MX
254
- print("[LoRA] Attaching LoRA to base model...")
255
 
256
- # For MX models, we need to be careful about dtype
257
- # The base model uses MX format internally, but the interface should be fp32
258
- model = PeftModel.from_pretrained(
259
- model,
260
- adapter_path,
261
- is_trainable=False,
262
- **peft_kwargs # This includes token and subfolder
263
- )
264
 
265
- # Manually update the adapter weights with our converted versions
266
- model.load_state_dict(adapter_weights, strict=False)
 
 
267
 
268
- print("[LoRA] Successfully attached LoRA adapter with MX compatibility")
269
  return model
270
 
271
- # -----------------------
272
- # Model loading with MX support
273
- # -----------------------
274
- def _build_model_kwargs(device_map: Optional[str]) -> Dict[str, Any]:
275
- """Build kwargs for model loading with MX format support."""
276
- kw: Dict[str, Any] = dict(
277
- device_map=device_map,
278
- trust_remote_code=True,
279
- low_cpu_mem_usage=True,
280
- token=HF_TOKEN,
281
- )
282
-
283
- if IS_GPT_OSS and USE_MX_FORMAT:
284
- # GPT-OSS models use MX format
285
- # Don't specify torch_dtype - let the model use its native MX format
286
- print("[Model] Using MX format for GPT-OSS model")
287
- kw.update({
288
- "attn_implementation": ATTN_IMPL if device_map != "cpu" else "eager",
289
- # MX models handle their own dtype internally
290
- # Don't force a dtype here
291
- })
292
- else:
293
- # Non-MX models
294
- kw.update({
295
- "torch_dtype": torch.float16, # Use fp16 for non-MX models
296
- "attn_implementation": ATTN_IMPL if device_map != "cpu" else "eager",
297
- })
298
 
299
- return kw
300
-
301
- def _load_model_on(device_map: Optional[str]) -> AutoModelForCausalLM:
302
- """Load model with proper MX format handling."""
303
- print(f"[Model] Loading base model from {MODEL_ID}...")
304
 
305
- # Load config first to check for MX format
306
- config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
307
 
308
- # Check if this is an MX model
309
- is_mx_model = (
310
- IS_GPT_OSS or
311
- hasattr(config, 'quantization_config') and 'mx' in str(config.quantization_config).lower() or
312
- hasattr(config, 'torch_dtype') and 'mx' in str(config.torch_dtype).lower()
313
- )
314
 
315
- if is_mx_model:
316
- print("[Model] Detected MX format model - using special loading")
 
 
317
 
318
- # For MX models, we need special handling
319
- # The model internally uses MX quantization
320
- model = AutoModelForCausalLM.from_pretrained(
321
- MODEL_ID,
322
- config=config,
323
- trust_remote_code=True,
324
- device_map=device_map,
325
- low_cpu_mem_usage=True,
326
- token=HF_TOKEN,
327
- # Let the model handle its own dtype
328
- attn_implementation=ATTN_IMPL if device_map != "cpu" else "eager",
329
- )
330
 
331
- # Verify the model loaded correctly
332
- print(f"[Model] Model dtype: {next(model.parameters()).dtype}")
333
- print(f"[Model] Model device: {next(model.parameters()).device}")
 
 
 
 
 
334
 
335
- else:
336
- # Standard model loading
337
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_build_model_kwargs(device_map))
338
 
339
- # Load and attach LoRA adapter if specified
340
- if ADAPTER_ID:
341
- try:
342
- if is_mx_model:
343
- # Use special MX-compatible LoRA loading with subfolder support
344
- model = prepare_model_for_mx_lora(model, ADAPTER_ID, ADAPTER_SUBFOLDER)
345
- else:
346
- # Standard PEFT loading for non-MX models
347
- if not _HAS_PEFT:
348
- raise RuntimeError("PEFT is required when ADAPTER_ID is set.")
349
- print(f"[Model] Loading adapter from {ADAPTER_ID} (standard mode)...")
350
- peft_kwargs = {"token": HF_TOKEN, "is_trainable": False}
351
- if ADAPTER_SUBFOLDER:
352
- peft_kwargs["subfolder"] = ADAPTER_SUBFOLDER
353
- print(f"[Model] Using subfolder: {ADAPTER_SUBFOLDER}")
354
- model = PeftModel.from_pretrained(
355
- model,
356
- ADAPTER_ID,
357
- **peft_kwargs
358
- )
359
-
360
- print("[Model] Successfully loaded with LoRA adapter")
361
-
362
- # Optionally merge adapter for better performance
363
- merge_adapter = os.getenv("MERGE_ADAPTER", "0") == "1"
364
- if merge_adapter and hasattr(model, 'merge_and_unload'):
365
- print("[Model] Merging adapter into base model...")
366
- model = model.merge_and_unload()
367
- print("[Model] Adapter merged successfully")
368
-
369
- except Exception as e:
370
- print(f"[Error] Failed to load adapter: {e}")
371
- print("[Warning] Continuing with base model only")
372
 
373
- model.eval()
 
374
 
375
- # Ensure proper config
376
- if getattr(model.config, "pad_token_id", None) is None:
377
- model.config.pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id
378
- model.config.use_cache = True
 
 
379
 
380
- print(f"[Model] Model loaded successfully - Type: {'MX Format' if is_mx_model else 'Standard'}")
381
- return model
382
-
383
- # -----------------------
384
- # Harmony formatting
385
- # -----------------------
386
- def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high") -> Any:
387
- """Build a Harmony-formatted prompt."""
388
- if HARMONY_AVAILABLE and harmony_encoding is not None:
389
- effort_map = {"low": ReasoningEffort.LOW, "medium": ReasoningEffort.MEDIUM, "high": ReasoningEffort.HIGH}
390
- effort = effort_map.get(str(reasoning_effort).lower(), ReasoningEffort.HIGH)
391
-
392
- system_content = (
393
- SystemContent.new()
394
- .with_model_identity("You are ChatGPT, a large language model trained by OpenAI.")
395
- .with_reasoning_effort(effort)
396
- .with_conversation_start_date(datetime.now().strftime("%Y-%m-%d"))
397
- .with_knowledge_cutoff("2024-06")
398
- .with_required_channels(REQUIRED_CHANNELS)
399
  )
400
-
401
- sys_text = SYSTEM_DEF
402
- rest: List[Dict[str, str]] = messages or []
403
- if rest and rest[0].get("role") == "system":
404
- sys_text = rest[0].get("content") or SYSTEM_DEF
405
- rest = rest[1:]
406
-
407
- harmony_messages = [Message.from_role_and_content(Role.SYSTEM, system_content)]
408
- dev = DeveloperContent.new().with_instructions(sys_text)
409
- harmony_messages.append(Message.from_role_and_content(Role.DEVELOPER, dev))
410
-
411
- for m in rest:
412
- role = m.get("role"); content = m.get("content", "")
413
- if role == "user":
414
- harmony_messages.append(Message.from_role_and_content(Role.USER, content))
415
- elif role == "assistant":
416
- harmony_messages.append(
417
- Message.from_role_and_content(Role.ASSISTANT, content).with_channel("final")
418
- )
419
-
420
- convo = Conversation.from_messages(harmony_messages)
421
- return harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
422
-
423
- # Fallback: tokenizer chat template
424
- if not messages or messages[0].get("role") != "system":
425
- messages = [{"role": "system", "content": SYSTEM_DEF}] + (messages or [])
426
- return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
427
 
428
  def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
429
- """Parse response tokens using Harmony format to extract channels."""
430
- if not HARMONY_AVAILABLE:
431
  text = tokenizer.decode(tokens, skip_special_tokens=False)
432
- return {"final": extract_final_channel_fallback(text), "raw": text}
433
-
434
- parsed_messages = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)
435
-
436
- channels = {}
437
- for msg in parsed_messages:
438
- channel = msg.channel if hasattr(msg, 'channel') else "final"
439
- if channel not in channels:
440
- channels[channel] = ""
441
- channels[channel] += "".join([getattr(part, "text", str(part)) for part in (msg.content if isinstance(msg.content, list) else [msg.content])])
442
-
443
- if "final" not in channels:
444
- channels["final"] = " ".join(channels.values())
445
 
446
- return channels
447
-
448
- def extract_final_channel_fallback(text: str) -> str:
449
- """Extract the <final> channel from decoded Harmony text."""
450
  try:
451
- chunks: Dict[str, str] = {}
452
- pieces = text.split("<|channel|>")
453
- for seg in pieces[1:]:
454
- name_end = seg.find("<|message|>")
455
- if name_end <= 0:
456
- continue
457
- ch = seg[:name_end].strip()
458
- body_start = name_end + len("<|message|>")
459
- next_pos = len(seg)
460
- for delim in ("<|channel|>", "<|end|>", "<|return|>"):
461
- p = seg.find(delim, body_start)
462
- if p != -1:
463
- next_pos = min(next_pos, p)
464
- body = seg[body_start:next_pos]
465
- chunks[ch] = chunks.get(ch, "") + body
466
- final_txt = (chunks.get("final", "").strip())
467
- if final_txt:
468
- return final_txt
469
- if "<|channel|>final<|message|>" in text:
470
- tail = text.split("<|channel|>final<|message|>")[-1]
471
- for delim in ("<|return|>", "<|end|>", "<|channel|>"):
472
- idx = tail.find(delim)
473
- if idx != -1:
474
- tail = tail[:idx]
475
- break
476
- return tail.strip()
477
- except Exception:
478
- pass
479
  return text.strip()
480
 
481
- # -----------------------
482
- # Rose guidance
483
- # -----------------------
484
- def build_bias_from_tokens(tokenizer, mapping: Dict[str, float]) -> torch.Tensor:
485
- """Create vocab bias from {token: weight}."""
486
- vocab_size = len(tokenizer)
487
- bias = torch.zeros(vocab_size, dtype=torch.float32)
488
- for tok, w in mapping.items():
489
- if tok is None:
490
- continue
491
- tid = tokenizer.convert_tokens_to_ids(tok)
492
- if isinstance(tid, list):
493
- for t in tid:
494
- if isinstance(t, int) and t >= 0:
495
- bias[t] += float(w) / max(1, len(tid))
496
- elif isinstance(tid, int) and tid >= 0:
497
- bias[tid] += float(w)
498
- return bias
499
-
500
- class RoseGuidedLogits(torch.nn.Module):
501
- def __init__(self, bias_vec: torch.Tensor, alpha: float = 1.0):
502
- super().__init__()
503
- self.bias_vec = bias_vec
504
- self.alpha = float(alpha)
505
-
506
- def forward(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
507
- return scores + self.alpha * self.bias_vec.to(scores.device)
508
 
509
- # -----------------------
510
- # Generation
511
- # -----------------------
512
  @spaces.GPU(duration=120)
513
- def zerogpu_generate(full_prompt,
514
- gen_kwargs: Dict[str, Any],
515
- rose_map: Optional[Dict[str, float]],
516
- rose_alpha: float,
517
- rose_score: Optional[float],
518
- seed: Optional[int]) -> Dict[str, str]:
519
- """Run inference on GPU with MX format support."""
 
 
 
 
520
  try:
 
521
  if seed is not None:
522
  torch.manual_seed(int(seed))
523
-
524
- # Load model with MX support
525
- model = _load_model_on("auto")
526
 
527
- # Setup logits processor for Rose guidance
528
- logits_processor = None
529
- if rose_map:
530
- bias = build_bias_from_tokens(tokenizer, rose_map).to(next(model.parameters()).device)
531
- eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
532
- logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
533
-
 
 
 
534
  # Prepare inputs
535
  device = next(model.parameters()).device
536
- if HARMONY_AVAILABLE and isinstance(full_prompt, list):
537
- input_ids = torch.tensor([full_prompt], dtype=torch.long, device=device)
538
- attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)
539
- inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
540
- prompt_len = input_ids.shape[1]
541
  else:
542
- enc = tokenizer(full_prompt, return_tensors="pt")
543
- inputs = enc.to(device)
544
- prompt_len = int(inputs["input_ids"].shape[1])
545
- if "attention_mask" not in inputs:
546
- inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
 
547
 
548
  # Generate
549
- eos_ids = HARMONY_STOP_IDS if HARMONY_AVAILABLE else tokenizer.eos_token_id
550
-
551
- out_ids = model.generate(
552
- **inputs,
553
- do_sample=bool(gen_kwargs.get("do_sample", True)),
554
- temperature=float(gen_kwargs.get("temperature", 0.7)),
555
- top_p=float(gen_kwargs.get("top_p", 0.9)),
556
- top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
557
- max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
558
- pad_token_id=model.config.pad_token_id,
559
- eos_token_id=eos_ids,
560
- logits_processor=logits_processor,
561
- repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.1)),
562
- no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
563
- min_new_tokens=1,
564
- )
565
 
566
  # Extract generated tokens
567
- out_list = out_ids[0].tolist()
568
- gen_ids = out_list[prompt_len:]
569
 
570
  # Truncate at stop tokens
571
- if HARMONY_AVAILABLE:
572
- for sid in HARMONY_STOP_IDS:
573
- if sid in gen_ids:
574
- gen_ids = gen_ids[:gen_ids.index(sid)]
575
- break
576
 
577
  # Parse response
578
- if HARMONY_AVAILABLE:
579
- try:
580
- channels = parse_harmony_response(gen_ids)
581
- except Exception:
582
- decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
583
- channels = {
584
- "final": extract_final_channel_fallback(decoded),
585
- "raw": decoded
586
- }
587
- else:
588
- decoded = tokenizer.decode(gen_ids, skip_special_tokens=False)
589
- channels = {
590
- "final": extract_final_channel_fallback(decoded),
591
- "raw": decoded
592
- }
593
 
594
  return channels
595
-
596
  except Exception as e:
597
- import traceback
598
- error_trace = traceback.format_exc()
599
- print(f"[Error] Generation failed:\n{error_trace}")
600
- return {"final": f"[Error] {type(e).__name__}: {str(e)}", "raw": error_trace}
601
  finally:
602
  # Cleanup
603
- try:
604
  del model
605
- except:
606
- pass
607
  gc.collect()
608
  if torch.cuda.is_available():
609
  torch.cuda.empty_cache()
610
 
611
- # -----------------------
612
- # Gradio handlers
613
- # -----------------------
614
- def generate_response(message: str, history: List[List[str]], system_prompt: str,
615
- temperature: float, top_p: float, top_k: int, max_new_tokens: int,
616
- do_sample: bool, seed: Optional[int],
617
- rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
618
- rose_tokens: str, rose_json: str,
619
- show_thinking: bool = False,
620
- reasoning_effort: str = "high") -> str:
621
- """Generate response with CoT handling."""
622
  try:
623
- # Build messages
624
- messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
625
 
626
- if history:
627
- for turn in history:
628
- if isinstance(turn, (list, tuple)) and len(turn) >= 2:
629
- user_msg, assistant_msg = turn[0], turn[1]
630
- if user_msg:
631
- messages.append({"role": "user", "content": str(user_msg)})
632
- if assistant_msg:
633
- messages.append({"role": "assistant", "content": str(assistant_msg)})
634
 
635
- messages.append({"role": "user", "content": str(message)})
 
636
 
637
  # Create prompt
638
- if HARMONY_AVAILABLE:
639
- prompt = create_harmony_prompt(messages, reasoning_effort)
640
- else:
641
- prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
642
-
643
- # Build Rose map
644
- rose_map: Optional[Dict[str, float]] = None
645
- if rose_enable:
646
- rose_map = {}
647
- tok_str = (rose_tokens or "").strip()
648
- if tok_str:
649
- for p in [p.strip() for p in tok_str.split(",") if p.strip()]:
650
- if ":" in p:
651
- k, v = p.split(":", 1)
652
- try:
653
- rose_map[k.strip()] = float(v)
654
- except:
655
- pass
656
- if rose_json:
657
- try:
658
- j = json.loads(rose_json)
659
- if isinstance(j, dict):
660
- for k, v in j.items():
661
- try:
662
- rose_map[str(k)] = float(v)
663
- except:
664
- pass
665
- except:
666
- pass
667
- if not rose_map:
668
- rose_map = None
669
-
670
  # Generate
671
- channels = zerogpu_generate(
672
  prompt,
673
- {
674
- "do_sample": bool(do_sample),
675
- "temperature": float(temperature),
676
- "top_p": float(top_p),
677
- "top_k": int(top_k) if top_k > 0 else None,
678
- "max_new_tokens": int(max_new_tokens),
679
- "repetition_penalty": 1.1,
680
- "no_repeat_ngram_size": 6,
681
- },
682
- rose_map,
683
- float(rose_alpha),
684
- float(rose_score) if rose_score is not None else None,
685
- int(seed) if seed is not None else None,
686
  )
687
 
688
  # Format response
689
- if show_thinking:
690
  response = "## Chain of Thought:\n\n"
691
  for channel, content in channels.items():
692
  if channel != "final" and content:
693
- response += f"### {channel.capitalize()} Channel:\n{content}\n\n"
694
- response += f"### Final Response:\n{channels.get('final', 'No final response generated')}"
695
- return response
696
  else:
697
- return channels.get("final", "No final response generated")
698
-
699
- except Exception as e:
700
- import traceback
701
- return f"[Error] {type(e).__name__}: {str(e)}\n{traceback.format_exc()}"
702
-
703
- # -----------------------
704
- # UI
705
- # -----------------------
706
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
707
- gr.Markdown(
708
- f"""
709
- # Mirel – Harmony Chain-of-Thought Inference
710
 
711
- **Model**: {MODEL_ID} {'(MX Format)' if USE_MX_FORMAT else ''}
712
- **Adapter**: {ADAPTER_ID or 'None'}
713
- **Status**: {'βœ… Harmony Available' if HARMONY_AVAILABLE else '⚠️ Harmony Not Installed'}
714
 
715
- The model uses internal thinking channels before providing final responses.
716
- """
717
- )
718
 
719
- with gr.Row():
720
- system_prompt = gr.Textbox(
721
- label="System Prompt",
722
- value=SYSTEM_DEF,
723
- lines=2
724
- )
 
725
 
726
- with gr.Accordion("Generation Settings", open=False):
727
  with gr.Row():
728
  temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
729
- top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
730
- top_k = gr.Slider(0, 200, value=0, step=1, label="Top-k (0=disabled)")
 
731
  with gr.Row():
732
- max_new = gr.Slider(16, 4096, value=MAX_DEF, step=16, label="Max new tokens")
733
- do_sample = gr.Checkbox(value=True, label="Do sample")
734
  seed = gr.Number(value=None, label="Seed (optional)", precision=0)
 
735
  with gr.Row():
 
 
736
  reasoning_effort = gr.Radio(
737
- choices=["low", "medium", "high"],
738
  value="high",
739
- label="Reasoning Effort",
740
- info="How much thinking the model should do"
741
- )
742
- show_thinking = gr.Checkbox(
743
- value=False,
744
- label="Show thinking channels",
745
- info="Display all internal reasoning channels"
746
  )
747
 
748
- with gr.Accordion("Rose Guidance (Optional)", open=False):
749
- gr.Markdown("Fine-tune generation with token biases")
750
- with gr.Row():
751
- rose_enable = gr.Checkbox(value=False, label="Enable Rose bias")
752
- rose_alpha = gr.Slider(0.0, 5.0, value=1.0, step=0.05, label="Alpha (strength)")
753
- rose_score = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Score multiplier")
754
- rose_tokens = gr.Textbox(
755
- label="Token:weight pairs",
756
- placeholder="example:1.5, test:-0.5",
757
- value=""
758
- )
759
- rose_json = gr.Textbox(
760
- label="JSON weights",
761
- placeholder='{"token": 1.0, "another": -0.5}',
762
- value=""
763
- )
764
-
765
  # Chat interface
766
  chat = gr.ChatInterface(
767
- fn=generate_response,
768
- type="messages",
769
  additional_inputs=[
770
- system_prompt, temperature, top_p, top_k, max_new,
771
- do_sample, seed, rose_enable, rose_alpha, rose_score,
772
- rose_tokens, rose_json, show_thinking, reasoning_effort
 
773
  ],
774
- title="Chat with Mirel",
775
- description="Chain-of-thought model with MX format support",
776
  examples=[
777
  ["Hello! Can you introduce yourself?"],
778
- ["What is the capital of France?"],
779
- ["Explain quantum computing in simple terms"],
780
- ["Solve: If a train travels 120 miles in 2 hours, what is its average speed?"],
781
  ],
782
  cache_examples=False,
783
  )
784
-
785
- gr.Markdown(
786
- """
787
- ---
788
- ### Configuration:
789
- - **MX Format**: Automatically detected for GPT-OSS models
790
- - **LoRA Support**: fp32 LoRA adapters are converted for MX compatibility
791
- - **Merge Adapter**: Set `MERGE_ADAPTER=1` to merge LoRA into base model
792
- - **Auth**: Set `HF_TOKEN` in Space secrets for private model access
793
- """
794
- )
795
-
796
  if __name__ == "__main__":
797
- demo.queue(max_size=8 if ZEROGPU else 32).launch(
798
- server_name="0.0.0.0",
799
  server_port=7860,
800
  share=False
801
  )
 
3
  ZeroGPU-ready, Harmony formatting, MX format support for GPT-OSS-20B
4
  Proper LoRA adapter loading and conversion for MX compatibility
5
  Single file: app.py
6
+
7
+ Requirements:
8
+ huggingface_hub>=0.34.0
9
+ transformers>=4.55.0
10
+ accelerate>=0.33.0
11
+ peft>=0.11.0
12
+ torch>=2.4.0
13
+ bitsandbytes>=0.43.1
14
+ openai-harmony
15
+ gradio>=5.42.0
16
+ triton>=3.4.0
17
+ git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
18
  """
19
+
20
+ # ===== SETUP: Ensure triton_kernels is installed for MX format =====
21
+ import subprocess
22
+ import sys
23
+
24
+ def ensure_triton_kernels():
25
+ """Ensure triton_kernels is installed for MX format support on H200."""
26
+ try:
27
+ import triton_kernels
28
+ print("✓ triton_kernels already installed - MX format supported")
29
+ return True
30
+ except ImportError:
31
+ print("Installing triton_kernels for MX format support...")
32
+ try:
33
+ subprocess.check_call([
34
+ sys.executable, "-m", "pip", "install",
35
+ "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"
36
+ ])
37
+ print("✓ triton_kernels installed successfully")
38
+ # Force reimport
39
+ import importlib
40
+ import site
41
+ importlib.reload(site)
42
+ return True
43
+ except subprocess.CalledProcessError as e:
44
+ print(f"✗ Failed to install triton_kernels: {e}")
45
+ print("ERROR: MX format will NOT work properly without triton_kernels!")
46
+ return False
47
+
48
+ # Install triton_kernels before other imports
49
+ _TRITON_INSTALL_SUCCESS = ensure_triton_kernels()
50
+
51
+ # ===== MAIN IMPORTS =====
52
  from __future__ import annotations
53
+ import os, gc, json, torch, warnings, traceback
54
  from dataclasses import dataclass
55
  from typing import List, Dict, Optional, Any, Union
56
  from datetime import datetime
 
59
  from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
60
  import numpy as np
61
 
62
+ # Suppress warnings
63
  warnings.filterwarnings("ignore", message=".*microscaling.*")
64
  warnings.filterwarnings("ignore", message=".*mx.*")
65
 
 
77
  ReasoningEffort
78
  )
79
  HARMONY_AVAILABLE = True
80
+ print("✓ OpenAI Harmony loaded successfully")
81
  except ImportError:
82
+ print("⚠ openai_harmony not installed. Install with: pip install openai-harmony")
83
  HARMONY_AVAILABLE = False
84
 
85
+ # Import PEFT for LoRA support
86
+ try:
87
+ from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model
88
+ _HAS_PEFT = True
89
+ print("✓ PEFT loaded successfully")
90
+ except Exception:
91
+ _HAS_PEFT = False
92
+ print("⚠ PEFT not available. Install with: pip install peft")
93
+
94
+ # Check for triton_kernels (required for MX format)
95
+ try:
96
+ import triton_kernels
97
+ _HAS_TRITON_KERNELS = True
98
+ print("✓ triton_kernels loaded - MX format enabled")
99
+ except ImportError:
100
+ _HAS_TRITON_KERNELS = False
101
+ print("✗ triton_kernels not available - MX format disabled!")
102
+
103
+ # ===== CONFIGURATION =====
104
  MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
105
+ ADAPTER_ID = os.getenv("ADAPTER_ID", "AbstractPhil/mirel-gpt-oss-20b")
106
+ ADAPTER_SUBFOLDER = os.getenv("ADAPTER_SUBFOLDER", "checkpoints/checkpoint-516")
107
  ATTN_IMPL = os.getenv("ATTN_IMPL", "eager")
108
+ SYSTEM_PROMPT = os.getenv("SYSTEM_PROMPT", "You are Mirel, a memory-stable symbolic assistant.")
109
+ MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", "512"))
110
+ ZEROGPU = os.getenv("ZEROGPU", os.getenv("ZERO_GPU", "1")) == "1"
111
+ MERGE_ADAPTER = os.getenv("MERGE_ADAPTER", "0") == "1"
112
 
113
+ # Detect if using GPT-OSS model
114
  IS_GPT_OSS = "gpt-oss" in MODEL_ID.lower()
115
+ USE_MX_FORMAT = IS_GPT_OSS and _HAS_TRITON_KERNELS
116
 
117
+ # Harmony channels for chain-of-thought
118
  REQUIRED_CHANNELS = ["analysis", "commentary", "final"]
119
 
120
+ # HF Authentication
121
+ HF_TOKEN = (
122
  os.getenv("HF_TOKEN")
123
  or os.getenv("HUGGING_FACE_HUB_TOKEN")
124
  or os.getenv("HUGGINGFACEHUB_API_TOKEN")
125
  or os.getenv("HF_ACCESS_TOKEN")
126
  )
127
 
128
+ def _hf_login():
129
+ """Login to HuggingFace Hub."""
130
  if HF_TOKEN:
131
  try:
132
  from huggingface_hub import login, whoami
133
  login(token=HF_TOKEN, add_to_git_credential=True)
134
  try:
135
+ user = whoami(token=HF_TOKEN)
136
+ print(f"✓ Logged in as: {user.get('name', user.get('id', 'unknown'))}")
137
+ except Exception:
138
+ print("✓ HF login successful")
139
  except Exception as e:
140
+ print(f"⚠ HF login failed: {e}")
141
  else:
142
+ print("⚠ No HF_TOKEN found in environment")
143
 
144
+ # Login before loading models
145
  _hf_login()
146
 
147
+ # Disable tokenizer parallelism warning
148
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
149
 
150
+ # ===== LOAD TOKENIZER =====
151
  try:
152
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
153
+ print(f"✓ Tokenizer loaded from {MODEL_ID}")
154
  except Exception as e:
155
+ print(f"✗ Failed to load tokenizer: {e}")
156
  raise
157
 
158
+ # ===== HARMONY SETUP =====
159
+ if HARMONY_AVAILABLE:
160
+ harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
161
+ HARMONY_STOP_IDS = harmony_encoding.stop_tokens_for_assistant_actions()
162
+ else:
163
+ harmony_encoding = None
164
+ HARMONY_STOP_IDS = []
 
 
165
 
166
+ # ===== MODEL LOADING WITH MX FORMAT SUPPORT =====
167
 
168
+ def detect_mx_format(model) -> bool:
169
+ """Check if model is using native MX format."""
170
+ if not hasattr(model, 'model') or not hasattr(model.model, 'layers'):
171
+ return False
 
 
 
172
 
173
+ try:
174
+ first_layer = model.model.layers[0]
175
+ if hasattr(first_layer, 'block_sparse_moe'):
176
+ expert = first_layer.block_sparse_moe.experts[0]
177
+ if hasattr(expert, 'w1'):
178
+ # Check for MX format scale tensors
179
+ return hasattr(expert.w1, 'scales')
180
+ except Exception:
181
+ pass
182
+ return False
183
+
184
+ def load_base_model(device_map: Optional[str] = "auto") -> AutoModelForCausalLM:
185
+ """Load the base model with proper MX format handling."""
186
+ print(f"\n{'='*50}")
187
+ print(f"Loading model: {MODEL_ID}")
188
+ print(f"MX Format Available: {_HAS_TRITON_KERNELS}")
189
+ print(f"{'='*50}\n")
190
 
191
+ # Load config to check model type
192
+ config = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True, token=HF_TOKEN)
193
 
194
+ # Build loading kwargs
195
+ load_kwargs = {
196
+ "trust_remote_code": True,
197
+ "device_map": device_map,
198
+ "low_cpu_mem_usage": True,
199
+ "token": HF_TOKEN,
200
+ "attn_implementation": ATTN_IMPL if device_map != "cpu" else "eager",
201
+ }
202
 
203
+ if IS_GPT_OSS:
204
+ if _HAS_TRITON_KERNELS:
205
+ print("→ Loading with native MX format support")
206
+ load_kwargs["torch_dtype"] = "auto" # Let model use native MX
207
  else:
208
+ print("⚠ No triton_kernels - falling back to bf16 (dequantized)")
209
+ print(" This will likely cause LoRA compatibility issues!")
210
+ load_kwargs["torch_dtype"] = torch.bfloat16
211
+ else:
212
+ # Non-GPT-OSS models
213
+ load_kwargs["torch_dtype"] = torch.bfloat16
214
 
215
+ # Load the model
216
+ model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
217
 
218
+ # Verify format
219
+ print(f"Model loaded - dtype: {next(model.parameters()).dtype}")
220
+ if IS_GPT_OSS:
221
+ is_mx = detect_mx_format(model)
222
+ if is_mx:
223
+ print("✓ Confirmed: Using native MX format")
224
+ else:
225
+ print("⚠ Model dequantized to bf16 - LoRA may fail")
226
 
227
+ # Set model config
228
+ if getattr(model.config, "pad_token_id", None) is None:
229
+ model.config.pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id
230
+ model.config.use_cache = True
231
 
 
232
  return model
233
 
234
+ def load_lora_adapter(model, adapter_id: str, subfolder: Optional[str] = None):
235
+ """Load and attach LoRA adapter with MX format handling."""
236
+ if not _HAS_PEFT:
237
+ raise RuntimeError("PEFT is required for LoRA adapters")
238
 
239
+ print(f"\n{'='*50}")
240
+ print(f"Loading LoRA: {adapter_id}")
241
+ if subfolder:
242
+ print(f"Subfolder: {subfolder}")
243
+ print(f"{'='*50}\n")
244
 
245
+ # Check if model is using MX format
246
+ is_mx = detect_mx_format(model) if IS_GPT_OSS else False
247
 
248
+ # Prepare kwargs for PEFT
249
+ peft_kwargs = {"token": HF_TOKEN, "is_trainable": False}
250
+ if subfolder:
251
+ peft_kwargs["subfolder"] = subfolder
 
 
252
 
253
+ try:
254
+ # Load adapter configuration
255
+ peft_config = PeftConfig.from_pretrained(adapter_id, **peft_kwargs)
256
+ print(f"LoRA config: r={peft_config.r}, alpha={peft_config.lora_alpha}")
257
 
258
+ # Load the adapter
259
+ model = PeftModel.from_pretrained(model, adapter_id, **peft_kwargs)
260
 
261
+ if not is_mx and IS_GPT_OSS:
262
+ print("⚠ WARNING: Model is bf16 but LoRA was likely trained on MX format")
263
+ print(" Reducing LoRA influence to 10% to prevent corruption")
264
+
265
+ # Scale down LoRA weights
266
+ for name, param in model.named_parameters():
267
+ if 'lora_' in name:
268
+ param.data *= 0.1
269
 
270
+ print("✓ LoRA adapter loaded successfully")
271
+
272
+ # Optionally merge adapter
273
+ if MERGE_ADAPTER and hasattr(model, 'merge_and_unload'):
274
+ print("Merging adapter into base model...")
275
+ model = model.merge_and_unload()
276
+ print("✓ Adapter merged")
277
+
278
+ return model
279
+
280
+ except Exception as e:
281
+ print(f"✗ Failed to load LoRA: {e}")
282
+ print("Continuing with base model only")
283
+ return model
284
+
285
+ # ===== HARMONY FORMATTING =====
286
+
287
+ def create_harmony_prompt(messages: List[Dict[str, str]], reasoning_effort: str = "high"):
288
+ """Create Harmony-formatted prompt."""
289
+ if not HARMONY_AVAILABLE or not harmony_encoding:
290
+ # Fallback to chat template
291
+ if messages and messages[0].get("role") != "system":
292
+ messages = [{"role": "system", "content": SYSTEM_PROMPT}] + messages
293
+ return tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
294
 
295
+ # Map reasoning effort
296
+ effort_map = {
297
+ "low": ReasoningEffort.LOW,
298
+ "medium": ReasoningEffort.MEDIUM,
299
+ "high": ReasoningEffort.HIGH
300
+ }
301
+ effort = effort_map.get(reasoning_effort.lower(), ReasoningEffort.HIGH)
302
 
303
+ # Build Harmony conversation
304
+ system_content = (
305
+ SystemContent.new()
306
+ .with_model_identity("You are ChatGPT, a large language model trained by OpenAI.")
307
+ .with_reasoning_effort(effort)
308
+ .with_conversation_start_date(datetime.now().strftime("%Y-%m-%d"))
309
+ .with_knowledge_cutoff("2024-06")
310
+ .with_required_channels(REQUIRED_CHANNELS)
311
+ )
312
 
313
+ # Extract system prompt
314
+ sys_text = SYSTEM_PROMPT
315
+ rest = messages or []
316
+ if rest and rest[0].get("role") == "system":
317
+ sys_text = rest[0].get("content", SYSTEM_PROMPT)
318
+ rest = rest[1:]
319
 
320
+ # Build messages
321
+ harmony_messages = [
322
+ Message.from_role_and_content(Role.SYSTEM, system_content),
323
+ Message.from_role_and_content(
324
+ Role.DEVELOPER,
325
+ DeveloperContent.new().with_instructions(sys_text)
326
  )
327
+ ]
328
+
329
+ for msg in rest:
330
+ role = msg.get("role")
331
+ content = msg.get("content", "")
332
+ if role == "user":
333
+ harmony_messages.append(Message.from_role_and_content(Role.USER, content))
334
+ elif role == "assistant":
335
+ harmony_messages.append(
336
+ Message.from_role_and_content(Role.ASSISTANT, content).with_channel("final")
337
+ )
338
+
339
+ # Render to token IDs
340
+ convo = Conversation.from_messages(harmony_messages)
341
+ return harmony_encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
342
 
343
  def parse_harmony_response(tokens: List[int]) -> Dict[str, str]:
344
+ """Parse Harmony response tokens into channels."""
345
+ if not HARMONY_AVAILABLE or not harmony_encoding:
346
  text = tokenizer.decode(tokens, skip_special_tokens=False)
347
+ return {"final": extract_final_channel(text), "raw": text}
348
 
 
 
 
 
349
  try:
350
+ # Parse using Harmony
351
+ parsed = harmony_encoding.parse_messages_from_completion_tokens(tokens, Role.ASSISTANT)
352
+
353
+ channels = {}
354
+ for msg in parsed:
355
+ channel = getattr(msg, 'channel', 'final')
356
+ if channel not in channels:
357
+ channels[channel] = ""
358
+
359
+ # Extract text content
360
+ content = msg.content
361
+ if isinstance(content, list):
362
+ text = "".join([getattr(part, "text", str(part)) for part in content])
363
+ else:
364
+ text = getattr(content, "text", str(content))
365
+
366
+ channels[channel] += text
367
+
368
+ # Ensure final channel exists
369
+ if "final" not in channels:
370
+ channels["final"] = " ".join(channels.values())
371
+
372
+ return channels
373
+
374
+ except Exception as e:
375
+ print(f"Harmony parsing failed: {e}")
376
+ text = tokenizer.decode(tokens, skip_special_tokens=False)
377
+ return {"final": extract_final_channel(text), "raw": text}
378
+
379
+ def extract_final_channel(text: str) -> str:
380
+ """Extract final channel from raw text."""
381
+ # Look for <|channel|>final<|message|>
382
+ if "<|channel|>final<|message|>" in text:
383
+ parts = text.split("<|channel|>final<|message|>")
384
+ if len(parts) > 1:
385
+ final = parts[-1]
386
+ # Truncate at next marker
387
+ for marker in ["<|channel|>", "<|end|>", "<|return|>"]:
388
+ if marker in final:
389
+ final = final.split(marker)[0]
390
+ return final.strip()
391
+
392
+ # Fallback: return cleaned text
393
+ for marker in ["<|channel|>", "<|message|>", "<|end|>", "<|return|>"]:
394
+ text = text.replace(marker, " ")
395
  return text.strip()
396
 
397
+ # ===== GENERATION =====
398
 
 
 
 
399
  @spaces.GPU(duration=120)
400
+ def generate_on_gpu(
401
+ prompt,
402
+ temperature: float,
403
+ top_p: float,
404
+ top_k: int,
405
+ max_new_tokens: int,
406
+ do_sample: bool,
407
+ repetition_penalty: float,
408
+ seed: Optional[int]
409
+ ) -> Dict[str, str]:
410
+ """Run generation on GPU."""
411
  try:
412
+ # Set seed if provided
413
  if seed is not None:
414
  torch.manual_seed(int(seed))
 
 
 
415
 
416
+ # Load model
417
+ print("\nLoading model for generation...")
418
+ model = load_base_model("auto")
419
+
420
+ # Load LoRA if specified
421
+ if ADAPTER_ID:
422
+ model = load_lora_adapter(model, ADAPTER_ID, ADAPTER_SUBFOLDER)
423
+
424
+ model.eval()
425
+
426
  # Prepare inputs
427
  device = next(model.parameters()).device
428
+
429
+ if HARMONY_AVAILABLE and isinstance(prompt, list):
430
+ # Harmony returns token IDs
431
+ input_ids = torch.tensor([prompt], dtype=torch.long, device=device)
 
432
  else:
433
+ # String prompt
434
+ inputs = tokenizer(prompt, return_tensors="pt")
435
+ input_ids = inputs["input_ids"].to(device)
436
+
437
+ attention_mask = torch.ones_like(input_ids)
438
+ prompt_len = input_ids.shape[1]
439
 
440
  # Generate
441
+ print("Generating response...")
442
+ with torch.no_grad():
443
+ outputs = model.generate(
444
+ input_ids=input_ids,
445
+ attention_mask=attention_mask,
446
+ max_new_tokens=max_new_tokens,
447
+ temperature=temperature,
448
+ top_p=top_p,
449
+ top_k=top_k if top_k > 0 else None,
450
+ do_sample=do_sample,
451
+ repetition_penalty=repetition_penalty,
452
+ pad_token_id=model.config.pad_token_id,
453
+ eos_token_id=HARMONY_STOP_IDS if HARMONY_STOP_IDS else tokenizer.eos_token_id,
454
+ no_repeat_ngram_size=3,
455
+ )
 
456
 
457
  # Extract generated tokens
458
+ gen_tokens = outputs[0][prompt_len:].tolist()
 
459
 
460
  # Truncate at stop tokens
461
+ for stop_id in HARMONY_STOP_IDS:
462
+ if stop_id in gen_tokens:
463
+ gen_tokens = gen_tokens[:gen_tokens.index(stop_id)]
464
+ break
 
465
 
466
  # Parse response
467
+ channels = parse_harmony_response(gen_tokens)
468
 
469
  return channels
470
+
471
  except Exception as e:
472
+ error_msg = f"Generation failed: {str(e)}\n{traceback.format_exc()}"
473
+ print(error_msg)
474
+ return {"final": f"Error: {str(e)}", "raw": error_msg}
475
+
476
  finally:
477
  # Cleanup
478
+ if 'model' in locals():
479
  del model
 
 
480
  gc.collect()
481
  if torch.cuda.is_available():
482
  torch.cuda.empty_cache()
483
 
484
+ # ===== GRADIO INTERFACE =====
485
+
486
+ def chat_response(
487
+ message: str,
488
+ history: List[List[str]],
489
+ system_prompt: str,
490
+ temperature: float,
491
+ top_p: float,
492
+ top_k: int,
493
+ max_new_tokens: int,
494
+ do_sample: bool,
495
+ repetition_penalty: float,
496
+ seed: Optional[int],
497
+ reasoning_effort: str,
498
+ show_thinking: bool
499
+ ) -> str:
500
+ """Handle chat interaction."""
501
  try:
502
+ # Build conversation
503
+ messages = [{"role": "system", "content": system_prompt or SYSTEM_PROMPT}]
504
 
505
+ # Add history
506
+ for turn in history or []:
507
+ if isinstance(turn, (list, tuple)) and len(turn) >= 2:
508
+ user_msg, assistant_msg = turn[0], turn[1]
509
+ if user_msg:
510
+ messages.append({"role": "user", "content": str(user_msg)})
511
+ if assistant_msg:
512
+ messages.append({"role": "assistant", "content": str(assistant_msg)})
513
 
514
+ # Add current message
515
+ messages.append({"role": "user", "content": message})
516
 
517
  # Create prompt
518
+ prompt = create_harmony_prompt(messages, reasoning_effort)
519
+
520
  # Generate
521
+ channels = generate_on_gpu(
522
  prompt,
523
+ temperature,
524
+ top_p,
525
+ top_k,
526
+ max_new_tokens,
527
+ do_sample,
528
+ repetition_penalty,
529
+ seed
530
  )
531
 
532
  # Format response
533
+ if show_thinking and len(channels) > 1:
534
  response = "## Chain of Thought:\n\n"
535
  for channel, content in channels.items():
536
  if channel != "final" and content:
537
+ response += f"### {channel.capitalize()}:\n{content}\n\n"
538
+ response += f"### Final Response:\n{channels.get('final', 'No response generated')}"
 
539
  else:
540
+ response = channels.get("final", "No response generated")
 
541
 
542
+ return response
 
 
543
 
544
+ except Exception as e:
545
+ return f"Error: {str(e)}"
 
546
 
547
+ # ===== BUILD UI =====
548
+
549
+ with gr.Blocks(theme=gr.themes.Soft(), title="Mirel") as demo:
550
+ # Header with status
551
+ status_mx = "✅ MX Format" if _HAS_TRITON_KERNELS else "❌ No MX Support"
552
+ status_harmony = "✅ Harmony" if HARMONY_AVAILABLE else "❌ No Harmony"
553
+
554
+ gr.Markdown(f"""
555
+ # 🤖 Mirel – Chain-of-Thought Assistant
556
+
557
+ **Model:** `{MODEL_ID}` | **Adapter:** `{ADAPTER_ID or 'None'}`
558
+ **Status:** {status_mx} | {status_harmony} | {"✅ ZeroGPU" if ZEROGPU else "CPU Mode"}
559
 
560
+ {'''
561
+ ⚠️ **WARNING: MX Format Support Missing!**
562
+ Install with: `pip install git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels`
563
+ ''' if IS_GPT_OSS and not _HAS_TRITON_KERNELS else ''}
564
+ """)
565
+
566
+ # System prompt
567
+ system_prompt = gr.Textbox(
568
+ label="System Prompt",
569
+ value=SYSTEM_PROMPT,
570
+ lines=2
571
+ )
572
+
573
+ # Settings
574
+ with gr.Accordion("⚙️ Generation Settings", open=False):
575
  with gr.Row():
576
  temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
577
+ top_p = gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Top-p")
578
+ top_k = gr.Slider(0, 200, value=50, step=1, label="Top-k")
579
+
580
  with gr.Row():
581
+ max_new_tokens = gr.Slider(16, 2048, value=MAX_NEW_TOKENS, step=16, label="Max tokens")
582
+ repetition_penalty = gr.Slider(1.0, 1.5, value=1.1, step=0.01, label="Repetition penalty")
583
  seed = gr.Number(value=None, label="Seed (optional)", precision=0)
584
+
585
  with gr.Row():
586
+ do_sample = gr.Checkbox(value=True, label="Sample")
587
+ show_thinking = gr.Checkbox(value=False, label="Show thinking channels")
588
  reasoning_effort = gr.Radio(
589
+ ["low", "medium", "high"],
590
  value="high",
591
+ label="Reasoning effort"
592
  )
593
594
  # Chat interface
595
  chat = gr.ChatInterface(
596
+ fn=chat_response,
 
597
  additional_inputs=[
598
+ system_prompt,
599
+ temperature,
600
+ top_p,
601
+ top_k,
602
+ max_new_tokens,
603
+ do_sample,
604
+ repetition_penalty,
605
+ seed,
606
+ reasoning_effort,
607
+ show_thinking
608
  ],
609
+ title=None,
 
610
  examples=[
611
  ["Hello! Can you introduce yourself?"],
612
+ ["What's the capital of France?"],
613
+ ["Explain quantum computing simply"],
614
+ ["Write a haiku about coding"],
615
  ],
616
  cache_examples=False,
617
  )
618
+
619
+ # Footer
620
+ gr.Markdown("""
621
+ ---
622
+ 💡 **Tips:**
623
+ - Enable "Show thinking channels" to see the model's reasoning process
624
+ - Adjust "Reasoning effort" for faster responses (low) or better quality (high)
625
+ - The model uses MX format on H200 GPUs for optimal performance
626
+ """)
627
+
628
+ # ===== LAUNCH =====
 
629
  if __name__ == "__main__":
630
+ print("\n" + "="*60)
631
+ print("MIREL READY TO LAUNCH")
632
+ print(f"Model: {MODEL_ID}")
633
+ print(f"Adapter: {ADAPTER_ID or 'None'}")
634
+ print(f"MX Format: {'ENABLED' if _HAS_TRITON_KERNELS else 'DISABLED'}")
635
+ print(f"Harmony: {'ENABLED' if HARMONY_AVAILABLE else 'DISABLED'}")
636
+ print("="*60 + "\n")
637
+
638
+ demo.queue(max_size=10).launch(
639
+ server_name="0.0.0.0",
640
  server_port=7860,
641
  share=False
642
  )
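For reference, a minimal standalone sketch of the channel splitting that the new extract_final_channel() performs. The sample string is hypothetical text built from the Harmony control markers the function splits on, not real model output.

# Illustrative sketch only: mirrors the marker logic of extract_final_channel() above.
# The sample below is hypothetical text, not actual GPT-OSS output.
def extract_final(text: str) -> str:
    marker = "<|channel|>final<|message|>"
    if marker in text:
        final = text.split(marker)[-1]
        # cut at the next control marker, as the app does
        for stop in ("<|channel|>", "<|end|>", "<|return|>"):
            if stop in final:
                final = final.split(stop)[0]
        return final.strip()
    return text.strip()

sample = ("<|channel|>analysis<|message|>Reason about the question step by step...<|end|>"
          "<|channel|>final<|message|>Paris is the capital of France.<|return|>")
print(extract_final(sample))  # -> Paris is the capital of France.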
install.sh ADDED
@@ -0,0 +1,97 @@
1
+ #!/bin/bash
2
+ # Complete installation script for Mirel with MX format support on H200
3
+
4
+ echo "Installing Mirel dependencies for GPT-OSS with MX format support..."
5
+
6
+ # Upgrade pip first
7
+ pip install --upgrade pip
8
+
9
+ # Install main requirements
10
+ pip install "huggingface_hub>=0.34.0"
11
+ pip install "transformers>=4.55.0"
12
+ pip install "accelerate>=0.33.0"
13
+ pip install "torch>=2.4.0"
14
+ pip install "gradio>=5.42.0"
15
+ pip install spaces
16
+
17
+ # Install LoRA/PEFT support
18
+ pip install "peft>=0.11.0"
19
+ pip install "bitsandbytes>=0.43.1"
20
+
21
+ # Install Harmony format
22
+ pip install openai-harmony
23
+
24
+ # Install Triton and MX format support
25
+ pip install "triton>=3.4.0"
26
+
27
+ # CRITICAL: Install triton_kernels from git subdirectory
28
+ # This is REQUIRED for MX format on H200 GPUs
29
+ echo "Installing triton_kernels (REQUIRED for MX format)..."
30
+ pip install git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
31
+
32
+ # Optional but recommended
33
+ pip install "safetensors>=0.4.0"
34
+ pip install "sentencepiece>=0.2.0"
35
+ pip install "protobuf>=3.20.0"
36
+ pip install "numpy<2.0.0"
37
+
38
+ # Verify critical imports
39
+ echo "Verifying installation..."
40
+ python -c "
41
+ import sys
42
+ errors = []
43
+
44
+ try:
45
+ import torch
46
+ print(f'✓ PyTorch {torch.__version__}')
47
+ except ImportError as e:
48
+ errors.append(f'✗ PyTorch: {e}')
49
+
50
+ try:
51
+ import transformers
52
+ print(f'✓ Transformers {transformers.__version__}')
53
+ except ImportError as e:
54
+ errors.append(f'✗ Transformers: {e}')
55
+
56
+ try:
57
+ import peft
58
+ print(f'✓ PEFT {peft.__version__}')
59
+ except ImportError as e:
60
+ errors.append(f'✗ PEFT: {e}')
61
+
62
+ try:
63
+ import triton
64
+ print(f'✓ Triton {triton.__version__}')
65
+ except ImportError as e:
66
+ errors.append(f'✗ Triton: {e}')
67
+
68
+ try:
69
+ import triton_kernels
70
+ print('✓ Triton Kernels (MX format support)')
71
+ except ImportError as e:
72
+ errors.append(f'✗ Triton Kernels (CRITICAL): {e}')
73
+ print('⚠️ WARNING: MX format will NOT work without triton_kernels!')
74
+
75
+ try:
76
+ import openai_harmony
77
+ print('✓ OpenAI Harmony')
78
+ except ImportError as e:
79
+ errors.append(f'✗ OpenAI Harmony: {e}')
80
+
81
+ try:
82
+ import gradio
83
+ print(f'✓ Gradio {gradio.__version__}')
84
+ except ImportError as e:
85
+ errors.append(f'✗ Gradio: {e}')
86
+
87
+ if errors:
88
+ print('\n❌ Installation issues found:')
89
+ for error in errors:
90
+ print(f' {error}')
91
+ sys.exit(1)
92
+ else:
93
+ print('\n✅ All dependencies installed successfully!')
94
+ print('You can now run the Mirel app with MX format support on H200 GPUs')
95
+ "
96
+
97
+ echo "Installation complete!"
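The import checks above can be complemented by a hardware check; a hedged sketch follows. The compute-capability threshold (9.0, i.e. Hopper-class GPUs such as the H200 this script targets) is an illustrative assumption, not something the script itself asserts.

# Hedged sketch: check GPU + kernel prerequisites for the MX path at runtime.
# ASSUMPTION: compute capability 9.0+ (Hopper, e.g. H100/H200) is treated as the
# MX-capable generation; the threshold is illustrative, not taken from install.sh.
import torch

def mx_prereqs_ok() -> bool:
    try:
        import triton_kernels  # the package install.sh pulls from the Triton repo
    except ImportError:
        return False
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability(0)
    return major >= 9

print("MX prerequisites:", "OK" if mx_prereqs_ok() else "missing")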
requirements.txt CHANGED
@@ -1,10 +1,25 @@
 
1
  huggingface_hub>=0.34.0
2
  transformers>=4.55.0
3
  accelerate>=0.33.0
4
  peft>=0.11.0
5
- torch>=2.4.0 # ZeroGPU-supported (2.3.x is NOT supported)
6
  bitsandbytes>=0.43.1
7
- openai_harmony
8
- gradio>=5.42.0
 
 
 
9
  triton>=3.4.0
10
- msamp
1
+ # Core dependencies
2
  huggingface_hub>=0.34.0
3
  transformers>=4.55.0
4
  accelerate>=0.33.0
5
+ torch>=2.4.0
6
+ gradio>=5.42.0
7
+ spaces
8
+
9
+ # LoRA/PEFT support
10
  peft>=0.11.0
 
11
  bitsandbytes>=0.43.1
12
+
13
+ # Harmony format for OpenAI GPT-OSS models
14
+ openai-harmony
15
+
16
+ # MX format support (REQUIRED for GPT-OSS-20B on H200)
17
  triton>=3.4.0
18
+ # Note: triton_kernels must be installed separately from git:
19
+ # pip install git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
20
+
21
+ # Optional but recommended
22
+ safetensors>=0.4.0
23
+ sentencepiece>=0.2.0
24
+ protobuf>=3.20.0
25
+ numpy<2.0.0 # Some dependencies may not support numpy 2.x yet
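A small optional sketch for checking the pins above at runtime: it prints the installed version next to each minimum using importlib.metadata, with distribution names as they appear in this file.

# Minimal sketch: print installed versions next to the minimums pinned in requirements.txt.
from importlib.metadata import version, PackageNotFoundError

pins = {
    "transformers": "4.55.0",
    "accelerate": "0.33.0",
    "peft": "0.11.0",
    "gradio": "5.42.0",
    "triton": "3.4.0",
}
for name, minimum in pins.items():
    try:
        print(f"{name}: installed {version(name)} (requirement >= {minimum})")
    except PackageNotFoundError:
        print(f"{name}: NOT INSTALLED (requirement >= {minimum})")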
setup.py ADDED
@@ -0,0 +1,31 @@
1
+ """
2
+ setup.py - Run this at the start of app.py to ensure triton_kernels is installed
3
+ Add this to the top of your app.py file in HF Spaces
4
+ """
5
+
6
+ import subprocess
7
+ import sys
8
+
9
+ def ensure_triton_kernels():
10
+ """Ensure triton_kernels is installed for MX format support."""
11
+ try:
12
+ import triton_kernels
13
+ print("✓ triton_kernels already installed")
14
+ return True
15
+ except ImportError:
16
+ print("Installing triton_kernels for MX format support...")
17
+ try:
18
+ subprocess.check_call([
19
+ sys.executable, "-m", "pip", "install",
20
+ "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"
21
+ ])
22
+ print("✓ triton_kernels installed successfully")
23
+ return True
24
+ except subprocess.CalledProcessError as e:
25
+ print(f"✗ Failed to install triton_kernels: {e}")
26
+ print("WARNING: MX format will fall back to bf16, LoRA may not work!")
27
+ return False
28
+
29
+ # Run at import time
30
+ if __name__ != "__main__": # When imported
31
+ ensure_triton_kernels()
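A hedged variant of the install-then-import pattern used here: after a same-process pip install, importlib.invalidate_caches() is the documented way to make sure the import system sees the freshly installed package. The helper name below is illustrative, not part of this commit.

# Variant sketch (assumption, not part of the commit): install a package and import
# it in the same interpreter, refreshing the import system's finder caches first.
import importlib
import subprocess
import sys

def install_and_import(pip_spec: str, module_name: str):
    subprocess.check_call([sys.executable, "-m", "pip", "install", pip_spec])
    importlib.invalidate_caches()  # pick up files installed after interpreter start
    return importlib.import_module(module_name)

# Example with the same git spec setup.py uses:
# tk = install_and_import(
#     "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels",
#     "triton_kernels",
# )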