AbstractPhil committed
Commit f7e1fb5 · 1 Parent(s): 1189725
Files changed (3):
  1. app.py +67 -49
  2. install.sh +14 -5
  3. setup.py +51 -20
app.py CHANGED
@@ -16,40 +16,24 @@ gradio>=5.42.0
 triton>=3.4.0
 git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
 """
+from __future__ import annotations
 
-# ===== SETUP: Ensure triton_kernels is installed for MX format =====
-import subprocess
-import sys
-
-def ensure_triton_kernels():
-    """Ensure triton_kernels is installed for MX format support on H200."""
+# Import setup to fix Triton if needed
+try:
+    import setup  # This will run install.sh if Triton needs fixing
+except ImportError:
+    print("No setup.py found - checking Triton manually")
+    # Fallback check
     try:
         import triton_kernels
-        print("✓ triton_kernels already installed - MX format supported")
-        return True
+        from triton.tools.ragged_tma import load_ragged
+        print("✓ Triton configured correctly")
     except ImportError:
-        print("Installing triton_kernels for MX format support...")
-        try:
-            subprocess.check_call([
-                sys.executable, "-m", "pip", "install",
-                "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"
-            ])
-            print("✓ triton_kernels installed successfully")
-            # Force reimport
-            import importlib
-            import site
-            importlib.reload(site)
-            return True
-        except subprocess.CalledProcessError as e:
-            print(f"✗ Failed to install triton_kernels: {e}")
-            print("ERROR: MX format will NOT work properly without triton_kernels!")
-            return False
-
-# Install triton_kernels before other imports
-_TRITON_INSTALL_SUCCESS = ensure_triton_kernels()
+        print("⚠ Triton not configured for MX - run install.sh")
 
 # ===== MAIN IMPORTS =====
 import os, gc, json, torch, warnings, traceback
+import subprocess, sys
 from dataclasses import dataclass
 from typing import List, Dict, Optional, Any, Union
 from datetime import datetime
@@ -90,14 +74,17 @@ except Exception:
     _HAS_PEFT = False
     print("⚠ PEFT not available. Install with: pip install peft")
 
-# Check for triton_kernels (required for MX format)
+# Check for triton_kernels after setup
 try:
     import triton_kernels
+    # Also check for the specific module that was missing
+    from triton.tools.ragged_tma import load_ragged, store_ragged
     _HAS_TRITON_KERNELS = True
-    print("✓ triton_kernels loaded - MX format enabled")
-except ImportError:
+    print("✓ triton_kernels loaded with ragged_tma support - MX format enabled")
+except ImportError as e:
     _HAS_TRITON_KERNELS = False
-    print("✗ triton_kernels not available - MX format disabled!")
+    print(f"✗ triton_kernels not fully functional: {e}")
+    print("MX format will fall back to bf16 - LoRA may not work correctly")
 
 # ===== CONFIGURATION =====
 MODEL_ID = os.getenv("MODEL_ID", "openai/gpt-oss-20b")
@@ -202,33 +189,64 @@ def load_base_model(device_map: Optional[str] = "auto") -> AutoModelForCausalLM:
     if IS_GPT_OSS:
         if _HAS_TRITON_KERNELS:
             print("→ Loading with native MX format support")
-            load_kwargs["torch_dtype"] = "auto"  # Let model use native MX
+            # For MX format, let the model handle its own dtype
+            load_kwargs["torch_dtype"] = "auto"
+
+            # Set environment variable to ensure MX is used
+            import os
+            os.environ["FORCE_MX_QUANTIZATION"] = "1"
         else:
             print("⚠ No triton_kernels - falling back to bf16 (dequantized)")
            print("  This will likely cause LoRA compatibility issues!")
             load_kwargs["torch_dtype"] = torch.bfloat16
+
+            # Explicitly disable MX
+            import os
+            os.environ["FORCE_MX_QUANTIZATION"] = "0"
     else:
         # Non-GPT-OSS models
         load_kwargs["torch_dtype"] = torch.bfloat16
 
-    # Load the model
-    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
-
-    # Verify format
-    print(f"Model loaded - dtype: {next(model.parameters()).dtype}")
-    if IS_GPT_OSS:
-        is_mx = detect_mx_format(model)
-        if is_mx:
-            print("✓ Confirmed: Using native MX format")
-        else:
-            print("⚠ Model dequantized to bf16 - LoRA may fail")
-
-    # Set model config
-    if getattr(model.config, "pad_token_id", None) is None:
-        model.config.pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id
-    model.config.use_cache = True
-
-    return model
+    try:
+        # Load the model
+        model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
+
+        # Verify format
+        print(f"Model loaded - dtype: {next(model.parameters()).dtype}")
+        if IS_GPT_OSS:
+            is_mx = detect_mx_format(model)
+            if is_mx:
+                print("✓ Confirmed: Using native MX format")
+            else:
+                print("⚠ Model dequantized to bf16 - LoRA may fail")
+
+        # Set model config
+        if getattr(model.config, "pad_token_id", None) is None:
+            model.config.pad_token_id = tokenizer.pad_token_id or tokenizer.eos_token_id
+        model.config.use_cache = True
+
+        return model
+
+    except Exception as e:
+        if "ragged_tma" in str(e):
+            print("\n" + "="*60)
+            print("ERROR: Triton version incompatibility detected!")
+            print("The model requires a specific Triton version with ragged_tma support.")
+            print("\nTo fix this, run:")
+            print("pip uninstall -y triton triton_kernels")
+            print("pip install --index-url https://download.pytorch.org/whl/nightly/cu121 triton")
+            print("pip install git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels")
+            print("="*60 + "\n")
+
+            # Try to load without MX as fallback
+            print("Attempting to load model without MX format...")
+            load_kwargs["torch_dtype"] = torch.bfloat16
+            os.environ["FORCE_MX_QUANTIZATION"] = "0"
+            model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_kwargs)
+            print("✓ Model loaded in bf16 mode (degraded performance)")
+            return model
+        else:
+            raise
 
 def load_lora_adapter(model, adapter_id: str, subfolder: Optional[str] = None):
     """Load and attach LoRA adapter with MX format handling."""
 
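Note on the verification step above: `detect_mx_format` is called in `load_base_model` but is not part of this diff. As a rough illustration of what that check can do, here is a minimal sketch, assuming MX-quantized checkpoints keep weights in packed integer tensors while a dequantized model is all floating point; the heuristic is hypothetical, not the repo's actual implementation:

```python
def detect_mx_format(model) -> bool:
    """Sketch: report True if any weight tensor is stored in a packed
    integer dtype (e.g. uint8), as MX-quantized weights would be, rather
    than in a floating-point dtype like the bf16 of a dequantized model."""
    tensors = list(model.named_parameters()) + list(model.named_buffers())
    return any(
        "weight" in name and not tensor.is_floating_point()
        for name, tensor in tensors
    )
```

With the MX path active this should return True immediately after `from_pretrained`; False corresponds to the "⚠ Model dequantized to bf16" branch above.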
install.sh CHANGED
@@ -10,7 +10,6 @@ pip install --upgrade pip
 pip install huggingface_hub>=0.34.0
 pip install transformers>=4.55.0
 pip install accelerate>=0.33.0
-pip install torch>=2.4.0
 pip install gradio>=5.42.0
 pip install spaces
 
@@ -21,11 +20,21 @@ pip install bitsandbytes>=0.43.1
 # Install Harmony format
 pip install openai-harmony
 
-# Install Triton and MX format support
-pip install triton>=3.4.0
-
-# CRITICAL: Install triton_kernels from git subdirectory
-# This is REQUIRED for MX format on H200 GPUs
+# FIX TRITON FOR MX FORMAT
+# The standard triton doesn't have the ragged_tma module needed for MX
+echo "Fixing Triton installation for MX format..."
+
+# Clean existing triton installations
+pip uninstall -y triton triton_kernels 2>/dev/null || true
+
+# Install PyTorch nightly (includes compatible Triton)
+echo "Installing PyTorch nightly with compatible Triton..."
+pip install --upgrade --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu121
+
+# Install Triton from PyTorch nightly
+pip install --upgrade --index-url https://download.pytorch.org/whl/nightly/cu121 triton
+
+# Install triton_kernels from source
 echo "Installing triton_kernels (REQUIRED for MX format)..."
 pip install git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels
 
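Two small notes on this script. First, unquoted version specifiers such as `pip install transformers>=4.55.0` are parsed by the shell as output redirection (stdout goes to a file named `=4.55.0`), so the quoted form `pip install "transformers>=4.55.0"` is the safe one. Second, a quick post-install sanity check can confirm the reinstall actually produced a Triton with ragged_tma support before the Space boots. A minimal sketch (the filename `verify_triton.py` is just a suggestion), probing the same modules app.py and setup.py import:

```python
# verify_triton.py - run after install.sh to confirm the MX-capable stack
import importlib

for mod in ("triton", "triton_kernels", "triton.tools.ragged_tma"):
    try:
        importlib.import_module(mod)
        print(f"✓ {mod}")
    except ImportError as exc:
        print(f"✗ {mod}: {exc}")
        raise SystemExit(1)
print("Triton stack ready for MX format")
```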
setup.py CHANGED
@@ -1,31 +1,62 @@
 """
-setup.py - Run this at the start of app.py to ensure triton_kernels is installed
-Add this to the top of your app.py file in HF Spaces
+setup.py - Run this at the start of app.py to ensure proper Triton installation
+Add: import setup  # at the top of app.py after the docstring
 """
 
 import subprocess
 import sys
+import os
 
-def ensure_triton_kernels():
-    """Ensure triton_kernels is installed for MX format support."""
+def fix_triton_installation():
+    """Fix Triton for MX format by running install.sh if needed."""
     try:
+        # Check if we have the right triton
         import triton_kernels
-        print("✓ triton_kernels already installed")
+        from triton.tools.ragged_tma import load_ragged, store_ragged
+        print("✓ Triton already properly configured for MX format")
         return True
     except ImportError:
-        print("Installing triton_kernels for MX format support...")
-        try:
-            subprocess.check_call([
-                sys.executable, "-m", "pip", "install",
-                "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"
-            ])
-            print("✓ triton_kernels installed successfully")
-            return True
-        except subprocess.CalledProcessError as e:
-            print(f"✗ Failed to install triton_kernels: {e}")
-            print("WARNING: MX format will fall back to bf16, LoRA may not work!")
-            return False
+        print("Triton not properly configured for MX format")
+        print("Running install.sh to fix dependencies...")
+
+        # Check if install.sh exists
+        if os.path.exists("install.sh"):
+            try:
+                # Make it executable and run it
+                subprocess.check_call(["chmod", "+x", "install.sh"])
+                subprocess.check_call(["./install.sh"])
+                print("✓ Dependencies installed via install.sh")
+                return True
+            except subprocess.CalledProcessError as e:
+                print(f"Error running install.sh: {e}")
+        else:
+            print("install.sh not found - trying direct pip fix...")
+
+        # Fallback: run key commands directly
+        try:
+            # Clean and reinstall triton
+            subprocess.run(
+                [sys.executable, "-m", "pip", "uninstall", "-y", "triton", "triton_kernels"],
+                capture_output=True,
+            )
+
+            # Install nightly triton
+            subprocess.check_call([
+                sys.executable, "-m", "pip", "install", "--upgrade",
+                "--index-url", "https://download.pytorch.org/whl/nightly/cu121",
+                "triton"
+            ])
+
+            # Install triton_kernels
+            subprocess.check_call([
+                sys.executable, "-m", "pip", "install",
+                "git+https://github.com/triton-lang/triton.git@main#subdirectory=python/triton_kernels"
+            ])
+
+            print("✓ Triton fixed via direct installation")
+            return True
+        except Exception as e:
+            print(f"Failed to fix Triton: {e}")
+            return False
 
-# Run at import time
-if __name__ != "__main__":  # When imported
-    ensure_triton_kernels()
+# Auto-run on import
+if __name__ != "__main__":
+    fix_triton_installation()
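Because setup.py installs packages into a Python process that has already started, a module it installs is not automatically importable afterwards; the caller has to refresh the import system (a full restart is the robust fix). A minimal usage sketch for the top of app.py under that assumption (`MX_READY` is an illustrative name, not from this commit):

```python
import importlib

import setup  # runs fix_triton_installation() at import time

# Packages installed mid-process only become importable after the
# import caches are invalidated (or the interpreter is restarted).
importlib.invalidate_caches()

try:
    import triton_kernels  # noqa: F401
    from triton.tools.ragged_tma import load_ragged  # noqa: F401
    MX_READY = True
except ImportError:
    MX_READY = False

print(f"MX format available: {MX_READY}")
```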