import html
import os
import tempfile

import gradio as gr
import torch
from bitsandbytes.nn import Linear4bit
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfApi
from huggingface_hub import list_models
from packaging import version
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, BitsAndBytesConfig
def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
    """Return the greeting displayed next to the login button.

    Args:
        profile: OAuth profile of the signed-in user, or ``None`` when nobody
            is signed in.
        oauth_token: OAuth token of the signed-in user; accepted but not read
            by this function.

    Returns:
        A personalised welcome message when a profile is available, otherwise
        a prompt asking the visitor to sign in.
    """
    signed_in = profile is not None
    if signed_in:
        return f"👋 Hello {profile.name}! Welcome to the BitsAndBytes Quantizer."
    return "👋 Hello! Sign in to get started with the BitsAndBytes Quantizer."
def check_model_exists(oauth_token: gr.OAuthToken | None, username, model_name, quantized_model_name):
    """Report whether the target repository is already present under ``username``.

    The target repository id is ``{username}/{quantized_model_name}`` when a
    custom name is given, otherwise ``{username}/{last path segment of
    model_name}-BNB-INT4``.

    Returns:
        ``None`` when no repository with that id is listed for the user;
        otherwise a human-readable message. Any exception raised while
        listing or building the name is also reported as a message string
        (prefixed with "Error checking model existence:") rather than raised.
    """
    try:
        # Listing happens first so that lookup failures surface before any
        # name handling, exactly as a caller would expect from the message.
        existing_ids = {entry.id for entry in list_models(author=username, token=oauth_token.token)}
        target_name = quantized_model_name or f"{model_name.split('/')[-1]}-BNB-INT4"
        repo_name = f"{username}/{target_name}"
        if repo_name not in existing_ids:
            return None  # Model does not exist
        return f"Model '{repo_name}' already exists in your repository."
    except Exception as err:
        return f"Error checking model existence: {err}"
def create_model_card(model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4):
    """Render the README.md (model card) text for a 4-bit quantized model.

    Args:
        model_name: Identifier interpolated into the ``base_model`` front
            matter, the title, the description and the usage snippet.
        quant_type_4: Value reported as ``bnb_4bit_quant_type``.
        double_quant_4: Value reported as ``bnb_4bit_use_double_quant``.
        compute_type_4: Value reported as ``bnb_4bit_compute_dtype``.
        quant_storage_4: Value reported as ``bnb_4bit_quant_storage``.

    Returns:
        The model card as one string: YAML front matter followed by Markdown,
        terminated by a newline.
    """
    # The usage snippet's code fence is explicitly closed; leaving it open
    # makes Markdown renderers treat everything after it as code.
    model_card = f"""---
base_model:
- {model_name}
---
# {model_name} (Quantized)
## Description
This model is a quantized version of the original model `{model_name}`. It has been quantized using int4 quantization with bitsandbytes.
## Quantization Details
- **Quantization Type**: int4
- **bnb_4bit_quant_type**: {quant_type_4}
- **bnb_4bit_use_double_quant**: {double_quant_4}
- **bnb_4bit_compute_dtype**: {compute_type_4}
- **bnb_4bit_quant_storage**: {quant_storage_4}
## Usage
You can use this model in your applications by loading it directly from the Hugging Face Hub:
```python
from transformers import AutoModel
model = AutoModel.from_pretrained("{model_name}")
```
"""
    return model_card
def load_model(model_name, quantization_config, auth_token):
    """Load ``model_name`` with ``AutoModel`` on CPU using a quantization config.

    Args:
        model_name: Model identifier passed to ``AutoModel.from_pretrained``.
        quantization_config: Quantization configuration forwarded unchanged.
        auth_token: Object exposing the access token as ``.token``, or
            ``None`` to load without an explicit token.

    Returns:
        The model returned by ``AutoModel.from_pretrained``.
    """
    # `token` is the current keyword; `use_auth_token` is deprecated in
    # transformers. A missing auth object no longer raises AttributeError.
    token = auth_token.token if auth_token is not None else None
    return AutoModel.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="cpu",
        token=token,
    )
# Lookup from dtype name (as a plain string) to the torch dtype object of the
# same name; used to turn the selected storage/compute type into a real dtype.
DTYPE_MAPPING = {
    dtype_name: getattr(torch, dtype_name)
    for dtype_name in ("int8", "uint8", "float16", "float32", "bfloat16")
}
def quantize_model(model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, auth_token=None):
    """Load ``model_name`` with a 4-bit bitsandbytes config and return it on CPU.

    Args:
        model_name: Model identifier passed to ``AutoModel.from_pretrained``.
        quant_type_4: Value for ``bnb_4bit_quant_type``.
        double_quant_4: The string ``"True"`` enables double quantization;
            any other value disables it.
        compute_type_4: Key of ``DTYPE_MAPPING`` selecting the compute dtype.
        quant_storage_4: Key of ``DTYPE_MAPPING`` selecting the storage dtype.
        auth_token: Object exposing the access token as ``.token``, or
            ``None`` to load without an explicit token.

    Returns:
        The loaded model, with every ``Linear4bit`` module moved to CUDA and
        back to CPU.

    Raises:
        KeyError: If ``compute_type_4`` or ``quant_storage_4`` is not a key of
            ``DTYPE_MAPPING``.
    """
    print(f"Quantizing model: {model_name} ({quant_type_4})")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type=quant_type_4,
        bnb_4bit_use_double_quant=double_quant_4 == "True",
        bnb_4bit_quant_storage=DTYPE_MAPPING[quant_storage_4],
        bnb_4bit_compute_dtype=DTYPE_MAPPING[compute_type_4],
    )
    # The signature defaults auth_token to None, so it must not be
    # dereferenced unconditionally. `token` replaces the deprecated
    # `use_auth_token` keyword.
    token = auth_token.token if auth_token is not None else None
    model = AutoModel.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="cpu",
        token=token,
    )
    for module in model.modules():
        if isinstance(module, Linear4bit):
            # NOTE(review): per the bitsandbytes docs the 4-bit packing is
            # performed when the layer is moved to a CUDA device, which is
            # presumably why each layer makes this round trip. Requires CUDA.
            module.to("cuda")
            module.to("cpu")
    return model
def save_model(model, model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, username=None, auth_token=None, quantized_model_name=None, public=False):
    """Serialize ``model`` to a temporary directory and upload it to the Hub.

    The destination repository is ``{username}/{quantized_model_name}`` when a
    custom name is given, otherwise ``{username}/{last path segment of
    model_name}-BNB-INT4``. A README.md produced by ``create_model_card`` is
    uploaded together with the weights.

    Args:
        model: Object providing ``save_pretrained``.
        model_name: Identifier of the source model; only used to derive the
            default repository name.
        quant_type_4, double_quant_4, compute_type_4, quant_storage_4:
            Quantization settings recorded in the model card.
        username: Namespace the repository is created under.
        auth_token: Object exposing the access token as ``.token``, or
            ``None`` to let ``HfApi`` be constructed with ``token=None``.
        quantized_model_name: Optional custom repository name.
        public: When false (the default) the repository is created private.

    Returns:
        An HTML snippet announcing success, with a link to the repository.
    """
    print("Saving quantized model")
    # The signature defaults auth_token to None; do not dereference blindly.
    token = auth_token.token if auth_token is not None else None
    if quantized_model_name:
        repo_name = f"{username}/{quantized_model_name}"
    else:
        repo_name = f"{username}/{model_name.split('/')[-1]}-BNB-INT4"

    with tempfile.TemporaryDirectory() as tmpdirname:
        # Saving locally needs no credentials, so the deprecated
        # `use_auth_token` keyword is not forwarded.
        model.save_pretrained(tmpdirname, safe_serialization=True)
        # NOTE(review): the card is rendered with the destination repo id, so
        # its `base_model` entry and description name the quantized repo, not
        # the source model. Confirm whether that is intended.
        model_card = create_model_card(repo_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4)
        # Explicit encoding keeps the README independent of the platform locale.
        with open(os.path.join(tmpdirname, "README.md"), "w", encoding="utf-8") as f:
            f.write(model_card)
        # Push to Hub
        api = HfApi(token=token)
        api.create_repo(repo_name, exist_ok=True, private=not public)
        api.upload_folder(
            folder_path=tmpdirname,
            repo_id=repo_name,
            repo_type="model",
        )

    # The result is rendered as HTML, so the repo id (partly user-typed) is
    # escaped. Previously the success path returned only a newline, leaving
    # the results panel blank.
    safe_repo = html.escape(repo_name, quote=True)
    return f"""
<div class="success-box">
<h3>✅ Quantization Completed</h3>
<p>Your quantized model has been uploaded to the Hugging Face Hub.</p>
<a href="https://huggingface.co/{safe_repo}" target="_blank" rel="noopener" class="model-link">{safe_repo}</a>
</div>
"""
def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, quantized_model_name, public):
    """Handle a click on the quantize button: validate, quantize, upload.

    Args:
        profile: OAuth profile of the signed-in user, or ``None``.
        oauth_token: OAuth token of the signed-in user, or ``None``.
        model_name: Identifier of the model to quantize.
        quant_type_4, double_quant_4, compute_type_4, quant_storage_4:
            Quantization settings forwarded to ``quantize_model`` and
            ``save_model``.
        quantized_model_name: Optional custom name for the destination repo.
        public: Whether the destination repo should be public.

    Returns:
        A message string for the results panel: an authentication error, a
        notice from ``check_model_exists``, the value returned by
        ``save_model``, or the text of any exception raised while quantizing
        or saving. Exceptions are reported, never propagated.
    """
    # Both the token and the profile are required; either one missing means
    # the user is not signed in.
    if oauth_token is None or not profile:
        return """
❌ Authentication Error
Please sign in to your HuggingFace account to use the quantizer.
"""
    # Without a selected model there is nothing to quantize; fail early with
    # a clear message instead of an opaque downstream error.
    if not model_name:
        return """
❌ Error Occurred
Please select a model to quantize.
"""
    exists_message = check_model_exists(oauth_token, profile.username, model_name, quantized_model_name)
    if exists_message:
        # The message embeds the user-typed repo name and is rendered as
        # HTML, so it is escaped before interpolation.
        return f"""
⚠️ Model Already Exists
{html.escape(exists_message)}
"""
    try:
        quantized_model = quantize_model(model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, oauth_token)
        return save_model(quantized_model, model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, profile.username, oauth_token, quantized_model_name, public)
    except Exception as e:
        # Top-level UI boundary: log for the container logs and show the
        # (escaped) error text to the user rather than crashing the handler.
        print(e)
        return f"""
❌ Error Occurred
{html.escape(str(e))}
"""
# Custom stylesheet handed to gr.Blocks(css=...) when the UI is built.
# It declares the colour palette, radius, shadow and transition as CSS
# variables on :root, then styles headings, buttons, inputs, the result
# boxes and the layout helper classes.
# The ids #login-button, #toggle-button and #quantize-button match the
# elem_id values given to the corresponding components in the UI definition.
# NOTE(review): .success-box, .warning-box, .error-box, .model-link and
# .instructions-container are not used as elem_classes anywhere in this file;
# presumably they target HTML emitted into the results panel - verify.
css = """
:root {
--primary: #6366f1;
--primary-light: #818cf8;
--primary-dark: #4f46e5;
--secondary: #10b981;
--accent: #f97316;
--background: #f8fafc;
--text: #1e293b;
--card-bg: #ffffff;
--input-bg: #f1f5f9;
--error: #ef4444;
--warning: #f59e0b;
--success: #10b981;
--border-radius: 12px;
--shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
--transition: all 0.3s ease;
}
body, .gradio-container {
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', sans-serif;
color: var(--text);
background-color: var(--background);
}
h1 {
font-size: 2.5rem !important;
font-weight: 800 !important;
text-align: center;
background: linear-gradient(45deg, var(--primary), var(--accent));
-webkit-background-clip: text;
background-clip: text;
color: transparent !important;
margin-bottom: 1rem !important;
padding: 1rem 0 !important;
}
h2 {
font-size: 1.75rem !important;
font-weight: 700 !important;
color: var(--primary-dark) !important;
margin-top: 1.5rem !important;
margin-bottom: 1rem !important;
}
h3 {
font-size: 1.25rem !important;
font-weight: 600 !important;
color: var(--primary) !important;
margin-top: 1rem !important;
margin-bottom: 0.5rem !important;
border-bottom: 2px solid var(--primary-light);
padding-bottom: 0.5rem;
width: fit-content;
}
/* Main container styling */
.main-container {
max-width: 1200px;
margin: 0 auto;
padding: 2rem;
background-color: var(--card-bg);
border-radius: var(--border-radius);
box-shadow: var(--shadow);
}
/* Button styling */
button {
border-radius: var(--border-radius) !important;
font-weight: 600 !important;
transition: var(--transition) !important;
text-transform: uppercase;
letter-spacing: 0.5px;
}
button.primary {
background: linear-gradient(135deg, var(--primary), var(--primary-dark)) !important;
border: none !important;
color: white !important;
padding: 12px 24px !important;
box-shadow: 0 4px 6px -1px rgba(99, 102, 241, 0.4) !important;
}
button.primary:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 15px -3px rgba(99, 102, 241, 0.5) !important;
}
/* Login button styling */
#login-button {
margin: 1.5rem auto !important;
min-width: 200px !important;
background: linear-gradient(135deg, var(--primary), var(--primary-dark)) !important;
color: white !important;
font-weight: 600 !important;
padding: 12px 24px !important;
border-radius: var(--border-radius) !important;
border: none !important;
box-shadow: 0 4px 6px -1px rgba(99, 102, 241, 0.4) !important;
transition: var(--transition) !important;
}
#login-button:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 15px -3px rgba(99, 102, 241, 0.5) !important;
}
/* Toggle button styling */
#toggle-button {
background: transparent !important;
color: var(--primary) !important;
border: 2px solid var(--primary-light) !important;
padding: 8px 16px !important;
margin: 1rem 0 !important;
border-radius: var(--border-radius) !important;
transition: var(--transition) !important;
font-weight: 600 !important;
}
#toggle-button:hover {
background-color: var(--primary-light) !important;
color: white !important;
}
/* Input fields styling */
input, select, textarea {
border-radius: var(--border-radius) !important;
border: 2px solid var(--input-bg) !important;
padding: 10px 16px !important;
background-color: var(--input-bg) !important;
transition: var(--transition) !important;
}
input:focus, select:focus, textarea:focus {
border-color: var(--primary-light) !important;
box-shadow: 0 0 0 2px rgba(99, 102, 241, 0.2) !important;
}
/* Dropdown styling with nice hover effects */
.gradio-dropdown > div {
border-radius: var(--border-radius) !important;
border: 2px solid var(--input-bg) !important;
overflow: hidden !important;
transition: var(--transition) !important;
}
.gradio-dropdown > div:hover {
border-color: var(--primary-light) !important;
}
/* Radio and checkbox styling */
.gradio-radio, .gradio-checkbox {
background-color: var(--card-bg) !important;
border-radius: var(--border-radius) !important;
padding: 12px !important;
margin-bottom: 16px !important;
transition: var(--transition) !important;
border: 2px solid var(--input-bg) !important;
}
.gradio-radio:hover, .gradio-checkbox:hover {
border-color: var(--primary-light) !important;
}
.gradio-radio input[type="radio"] + label {
padding: 8px 12px !important;
border-radius: 20px !important;
margin-right: 8px !important;
background-color: var(--input-bg) !important;
transition: var(--transition) !important;
}
.gradio-radio input[type="radio"]:checked + label {
background-color: var(--primary) !important;
color: white !important;
}
/* Custom spacing and layout */
.gradio-row {
margin-bottom: 24px !important;
}
.option-row {
display: flex !important;
gap: 16px !important;
margin-bottom: 16px !important;
}
/* Card-like sections */
.card-section {
background-color: var(--card-bg) !important;
border-radius: var(--border-radius) !important;
padding: 20px !important;
margin-bottom: 24px !important;
box-shadow: var(--shadow) !important;
border: 1px solid rgba(0, 0, 0, 0.05) !important;
}
/* Search box styling */
.search-box input {
border-radius: var(--border-radius) !important;
border: 2px solid var(--input-bg) !important;
padding: 12px 20px !important;
box-shadow: var(--shadow) !important;
transition: var(--transition) !important;
}
.search-box input:focus {
border-color: var(--primary) !important;
box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.3) !important;
}
/* Model name textbox specific styling */
.model-name-textbox {
border: 2px solid var(--input-bg) !important;
border-radius: var(--border-radius) !important;
transition: var(--transition) !important;
}
.model-name-textbox:focus-within {
border-color: var(--primary) !important;
box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.3) !important;
}
/* Success, warning and error boxes */
.success-box, .warning-box, .error-box {
border-radius: var(--border-radius) !important;
padding: 20px !important;
margin: 20px 0 !important;
box-shadow: var(--shadow) !important;
animation: fadeIn 0.5s ease-in-out;
}
.success-box {
background-color: rgba(16, 185, 129, 0.1) !important;
border: 2px solid var(--success) !important;
}
.warning-box {
background-color: rgba(245, 158, 11, 0.1) !important;
border: 2px solid var(--warning) !important;
}
.error-box {
background-color: rgba(239, 68, 68, 0.1) !important;
border: 2px solid var(--error) !important;
}
/* Model link styling */
.model-link {
display: inline-block !important;
background: linear-gradient(135deg, var(--primary), var(--primary-dark)) !important;
color: white !important;
text-decoration: none !important;
padding: 12px 24px !important;
border-radius: var(--border-radius) !important;
font-weight: 600 !important;
margin-top: 16px !important;
box-shadow: 0 4px 6px -1px rgba(99, 102, 241, 0.4) !important;
transition: var(--transition) !important;
}
.model-link:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 15px -3px rgba(99, 102, 241, 0.5) !important;
}
/* Instructions section */
.instructions-container {
background-color: rgba(99, 102, 241, 0.05) !important;
border-left: 4px solid var(--primary) !important;
padding: 16px !important;
margin: 24px 0 !important;
border-radius: 0 var(--border-radius) var(--border-radius) 0 !important;
}
/* Animations */
@keyframes fadeIn {
from { opacity: 0; transform: translateY(10px); }
to { opacity: 1; transform: translateY(0); }
}
/* Responsive adjustments */
@media (max-width: 768px) {
.option-row {
flex-direction: column !important;
}
}
/* Add a nice gradient splash to the app */
.gradio-container::before {
content: "";
position: absolute;
top: 0;
left: 0;
right: 0;
height: 10px;
background: linear-gradient(90deg, var(--primary), var(--accent));
z-index: 100;
}
/* Stylish header */
.app-header {
display: flex;
flex-direction: column;
align-items: center;
margin-bottom: 2rem;
position: relative;
}
.app-header::after {
content: "";
position: absolute;
bottom: -10px;
left: 50%;
transform: translateX(-50%);
width: 80px;
height: 4px;
background: linear-gradient(90deg, var(--primary), var(--accent));
border-radius: 2px;
}
/* Section headers */
.section-header {
display: flex;
align-items: center;
margin-bottom: 1rem;
}
.section-header::before {
content: "⚙️";
margin-right: 8px;
font-size: 1.25rem;
}
/* Quantize button special styling */
#quantize-button {
background: linear-gradient(135deg, var(--primary), var(--accent)) !important;
color: white !important;
padding: 16px 32px !important;
font-size: 1.1rem !important;
font-weight: 700 !important;
border: none !important;
border-radius: var(--border-radius) !important;
box-shadow: 0 4px 15px -3px rgba(99, 102, 241, 0.5) !important;
transition: all 0.3s cubic-bezier(0.25, 0.8, 0.25, 1) !important;
position: relative;
overflow: hidden;
}
#quantize-button:hover {
transform: translateY(-3px) !important;
box-shadow: 0 7px 20px -2px rgba(99, 102, 241, 0.6) !important;
}
#quantize-button::after {
content: "";
position: absolute;
top: 0;
left: 0;
width: 100%;
height: 100%;
background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 0));
transform: translateY(-100%);
transition: transform 0.6s cubic-bezier(0.25, 0.8, 0.25, 1);
}
#quantize-button:hover::after {
transform: translateY(0);
}
"""
# Gradio UI definition: builds the `demo` Blocks app, wires the event
# handlers, and launches it when the file is run as a script.
# NOTE(review): leading indentation is missing in this copy of the file, so
# the nesting of the `with` containers below cannot be read off the text and
# must be restored before this section can run.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald"), css=css) as demo:
with gr.Column(elem_classes="main-container"):
with gr.Row(elem_classes="app-header"):
gr.Markdown(
"""
🤗 BitsAndBytes Model Quantizer
Welcome to the BitsAndBytes Model Quantizer!
"""
)
# Sign-in button plus a Markdown slot that hello() fills on page load.
gr.LoginButton(elem_id="login-button", elem_classes="login-button")
welcome_msg = gr.Markdown(elem_classes="welcome-message")
demo.load(hello, inputs=None, outputs=welcome_msg)
# Instructions panel: created hidden and shown/hidden by toggle_button.
instructions = gr.Markdown(
"""
📋 Instructions
- Login to your HuggingFace account
- Enter the name of the Hugging Face LLM model you want to quantize
- Configure quantization settings based on your needs
- Optionally, specify a custom name for the quantized model
- Click "Quantize Model" to start the process
Note: Processing time depends on model size and your hardware. Check container logs for progress!
""",
visible=False
)
# Holds the current visibility flag of the instructions panel.
instructions_visible = gr.State(False)
toggle_button = gr.Button("▼ Show Instructions", elem_id="toggle-button", elem_classes="toggle-button")
# Inverts the stored flag and returns, in the order of the outputs list
# passed to toggle_button.click: the panel visibility update, the new flag,
# and the new button label.
def toggle_instructions(instructions_visible):
new_visibility = not instructions_visible
new_label = "▲ Hide Instructions" if new_visibility else "▼ Show Instructions"
return gr.update(visible=new_visibility), new_visibility, gr.update(value=new_label)
toggle_button.click(toggle_instructions, instructions_visible, [instructions, instructions_visible, toggle_button])
with gr.Row(elem_classes="app-content"):
with gr.Column(scale=1, elem_classes="card-section"):
with gr.Row(elem_classes="search-section"):
# Hub search box whose value is passed to the handler as model_name.
model_name = HuggingfaceHubSearch(
label="🔍 Select Model",
placeholder=" Search for model on Huggingface Hub...",
search_type="model",
elem_classes="search-box"
)
with gr.Row(elem_classes="section-header"):
gr.Markdown("### Quantization Settings")
with gr.Column(elem_classes="settings-group"):
gr.Markdown("**Quantization Type**", elem_classes="setting-label")
quant_type_4 = gr.Dropdown(
choices=["fp4", "nf4"],
value="fp4",
label="Format",
info="The quantization data type in bnb.nn.Linear4Bit layers",
show_label=False
)
gr.Markdown("**Compute Settings**", elem_classes="setting-label")
# The compute and storage choices are all keys of DTYPE_MAPPING.
compute_type_4 = gr.Dropdown(
choices=["float16", "bfloat16", "float32"],
value="float32",
label="Compute Type",
info="The compute dtype for matrix multiplication"
)
quant_storage_4 = gr.Dropdown(
choices=["float16", "float32", "int8", "uint8", "bfloat16"],
value="uint8",
label="Storage Type",
info="The storage type for quantized weights"
)
gr.Markdown("**Double Quantization**", elem_classes="setting-label")
# The radio yields the strings "False"/"True"; quantize_model compares the
# value against the string "True".
double_quant_4 = gr.Radio(
["False", "True"],
label="Use Double Quantization",
info="Further compress model size with nested quantization",
value="False",
)
with gr.Row(elem_classes="section-header"):
gr.Markdown("### Output Settings")
with gr.Column(elem_classes="settings-group"):
# Optional repo name; when left blank the "<model>-BNB-INT4" default is used.
quantized_model_name = gr.Textbox(
label="Custom Model Name (Optional)",
info="Leave blank to use default naming convention",
placeholder="my-quantized-model",
elem_classes="model-name-textbox"
)
# Unchecked by default; save_model creates the repo with private=not public.
public = gr.Checkbox(
label="Make model public",
info="If checked, your model will be publicly accessible on Hugging Face Hub",
value=False,
)
with gr.Column(scale=1, elem_classes="card-section"):
with gr.Row():
gr.Markdown("""
### 📊 Quantization Benefits
⚡ Lower Memory Usage: Reduce model size by up to 75%
🚀 Faster Inference: Achieve better performance on resource-constrained hardware
💻 Wider Compatibility: Run models on devices with limited VRAM
### 🔧 Configuration Guide
Quantization Type:
fp4
- 4-bit floating point (better for most cases)
nf4
- normalized float format (better for specific models)
Double Quantization: Enable for additional compression with minimal quality loss
""")
with gr.Row():
quantize_button = gr.Button("🚀 Quantize Model", variant="primary", elem_id="quantize-button")
# HTML panel that receives the string returned by quantize_and_save.
output_link = gr.HTML(label="Results", elem_classes="results-container")
# Add interactive footer with links
gr.Markdown("""
""")
# Only the seven UI components are listed as inputs. The leading `profile`
# and `oauth_token` parameters of quantize_and_save are not listed -
# presumably Gradio supplies them from their OAuth type annotations; verify.
quantize_button.click(
fn=quantize_and_save,
inputs=[model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, quantized_model_name, public],
outputs=[output_link]
)
# Launch only when executed as a script; share=True is passed to launch().
if __name__ == "__main__":
demo.launch(share=True)