"""Gradio app that quantizes a Hugging Face model to 4-bit with bitsandbytes.

The user signs in with a Hugging Face account, picks a model and the
quantization settings, and the quantized weights plus a generated model card
are uploaded to a repository under the user's namespace.
"""

import html
import os
import tempfile

import gradio as gr
import torch
from bitsandbytes.nn import Linear4bit
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from huggingface_hub import HfApi
from huggingface_hub import list_models
from packaging import version
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel, BitsAndBytesConfig


def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
    """Return the greeting shown at the top of the page.

    Args:
        profile: OAuth profile of the signed-in user, or None when signed out.
        oauth_token: OAuth token of the signed-in user; not used here.

    Returns:
        A sign-in prompt when ``profile`` is None, otherwise a greeting that
        contains ``profile.name``.
    """
    if profile is None:
        return "👋 Hello! Sign in to get started with the BitsAndBytes Quantizer."
    return f"👋 Hello {profile.name}! Welcome to the BitsAndBytes Quantizer."


def _token_of(auth_token):
    """Return ``auth_token.token``, or None when no token object was given."""
    return auth_token.token if auth_token is not None else None


def _resolve_repo_name(username, model_name, quantized_model_name):
    """Build the target repo id: the custom name if given, else ``<model>-BNB-INT4``."""
    if quantized_model_name:
        return f"{username}/{quantized_model_name}"
    return f"{username}/{model_name.split('/')[-1]}-BNB-INT4"


def check_model_exists(oauth_token: gr.OAuthToken | None, username, model_name, quantized_model_name):
    """Check if a model exists in the user's Hugging Face repository.

    Args:
        oauth_token: OAuth token used to list the user's models.
        username: Namespace the quantized model would be uploaded to.
        model_name: Source model id; its last path component is used for the
            default repository name.
        quantized_model_name: Optional custom repository name.

    Returns:
        A message string when the target repository already exists or when
        the lookup failed, otherwise None.
    """
    try:
        models = list_models(author=username, token=oauth_token.token)
        existing_ids = {model.id for model in models}
        repo_name = _resolve_repo_name(username, model_name, quantized_model_name)
        if repo_name in existing_ids:
            return f"Model '{repo_name}' already exists in your repository."
        return None  # Model does not exist
    except Exception as e:
        # Deliberately broad: any failure (including a missing token) is
        # reported to the caller as a message instead of raising.
        return f"Error checking model existence: {str(e)}"


def create_model_card(model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, repo_id=None):
    """Render the README.md model card for a quantized model.

    Args:
        model_name: Id of the original (base) model; written to the
            ``base_model`` metadata, the title and the description.
        quant_type_4: Value reported as ``bnb_4bit_quant_type``.
        double_quant_4: Value reported as ``bnb_4bit_use_double_quant``.
        compute_type_4: Value reported as ``bnb_4bit_compute_dtype``.
        quant_storage_4: Value reported as ``bnb_4bit_quant_storage``.
        repo_id: Id of the repository holding the quantized weights, used in
            the usage snippet. Defaults to ``model_name`` when omitted.

    Returns:
        The model card as a Markdown string with YAML front matter.
    """
    loadable_id = repo_id or model_name
    model_card = f"""---
base_model:
- {model_name}
---

# {model_name} (Quantized)

## Description
This model is a quantized version of the original model `{model_name}`.

It has been quantized using int4 quantization with bitsandbytes.

## Quantization Details
- **Quantization Type**: int4
- **bnb_4bit_quant_type**: {quant_type_4}
- **bnb_4bit_use_double_quant**: {double_quant_4}
- **bnb_4bit_compute_dtype**: {compute_type_4}
- **bnb_4bit_quant_storage**: {quant_storage_4}

## Usage
You can use this model in your applications by loading it directly from the Hugging Face Hub:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("{loadable_id}")
```
"""
    return model_card


def load_model(model_name, quantization_config, auth_token):
    """Load ``model_name`` on CPU with the given quantization config.

    Args:
        model_name: Hub id of the model to load.
        quantization_config: ``BitsAndBytesConfig`` passed to ``from_pretrained``.
        auth_token: Object exposing a ``.token`` attribute, or None.

    Returns:
        The model returned by ``AutoModel.from_pretrained``.
    """
    return AutoModel.from_pretrained(
        model_name,
        quantization_config=quantization_config,
        device_map="cpu",
        # `token` replaces the deprecated `use_auth_token` keyword.
        token=_token_of(auth_token),
    )


# Maps the dtype names offered in the UI dropdowns to torch dtypes.
DTYPE_MAPPING = {
    "int8": torch.int8,
    "uint8": torch.uint8,
    "float16": torch.float16,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}


def quantize_model(model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, auth_token=None):
    """Load a model with a 4-bit bitsandbytes configuration.

    Args:
        model_name: Hub id of the model to quantize.
        quant_type_4: 4-bit quantization type (the UI offers "fp4" and "nf4").
        double_quant_4: The string "True" enables double quantization; any
            other value disables it.
        compute_type_4: Key of ``DTYPE_MAPPING`` for the compute dtype.
        quant_storage_4: Key of ``DTYPE_MAPPING`` for the storage dtype.
        auth_token: Object exposing a ``.token`` attribute, or None.

    Returns:
        The loaded model, with every ``Linear4bit`` module back on the CPU.

    Raises:
        KeyError: If a dtype name is not in ``DTYPE_MAPPING``.
    """
    print(f"Quantizing model: {quant_type_4}")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type=quant_type_4,
        bnb_4bit_use_double_quant=double_quant_4 == "True",
        bnb_4bit_quant_storage=DTYPE_MAPPING[quant_storage_4],
        bnb_4bit_compute_dtype=DTYPE_MAPPING[compute_type_4],
    )

    model = load_model(model_name, quantization_config, auth_token)

    # NOTE(review): presumably the round trip through CUDA is what makes
    # bitsandbytes pack the 4-bit weights -- confirm. This step fails on a
    # machine without a CUDA device.
    for _, module in model.named_modules():
        if isinstance(module, Linear4bit):
            module.to("cuda")
            module.to("cpu")
    return model


def save_model(model, model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, username=None, auth_token=None, quantized_model_name=None, public=False):
    """Serialize the model, write its model card and upload both to the Hub.

    Args:
        model: Quantized model to save.
        model_name: Hub id of the original model.
        quant_type_4: Quantization type, recorded in the model card.
        double_quant_4: Double-quantization setting, recorded in the card.
        compute_type_4: Compute dtype name, recorded in the card.
        quant_storage_4: Storage dtype name, recorded in the card.
        username: Namespace of the target repository.
        auth_token: Object exposing a ``.token`` attribute, or None.
        quantized_model_name: Optional custom repository name.
        public: When False the repository is created private.

    Returns:
        A success message containing the repository location.
    """
    print("Saving quantized model")
    token = _token_of(auth_token)
    repo_name = _resolve_repo_name(username, model_name, quantized_model_name)

    with tempfile.TemporaryDirectory() as tmpdirname:
        model.save_pretrained(tmpdirname, safe_serialization=True, token=token)

        # The card names the original model as the base and the new
        # repository as the id to load.
        model_card = create_model_card(
            model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, repo_id=repo_name
        )
        with open(os.path.join(tmpdirname, "README.md"), "w", encoding="utf-8") as f:
            f.write(model_card)

        # Push to Hub
        api = HfApi(token=token)
        api.create_repo(repo_name, exist_ok=True, private=not public)
        api.upload_folder(
            folder_path=tmpdirname,
            repo_id=repo_name,
            repo_type="model",
        )

    return f"""

🎉 Quantization Complete!

Your quantized model is now available at:

huggingface.co/{repo_name}
"""


def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, quantized_model_name, public):
    """Click handler: validate the session, quantize the model and upload it.

    Args:
        profile: OAuth profile of the signed-in user, or None.
        oauth_token: OAuth token of the signed-in user, or None.
        model_name: Hub id of the model selected in the search box.
        quant_type_4: Selected quantization type.
        double_quant_4: Selected double-quantization option ("True"/"False").
        compute_type_4: Selected compute dtype name.
        quant_storage_4: Selected storage dtype name.
        quantized_model_name: Optional custom repository name.
        public: Whether the uploaded repository should be public.

    Returns:
        A status message for the results panel; failures are reported in the
        message rather than raised.
    """
    if oauth_token is None or not profile:
        return """

❌ Authentication Error

Please sign in to your HuggingFace account to use the quantizer.

"""

    if not model_name:
        # Nothing selected in the search box; fail early with a clear message.
        return """

❌ Error Occurred

Please select a model to quantize.

"""

    exists_message = check_model_exists(oauth_token, profile.username, model_name, quantized_model_name)
    if exists_message:
        # Escaped because the result is rendered by an HTML component.
        return f"""

⚠️ Model Already Exists

{html.escape(exists_message)}

"""

    try:
        quantized_model = quantize_model(model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, oauth_token)
        return save_model(quantized_model, model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, profile.username, oauth_token, quantized_model_name, public)
    except Exception as e:
        # Top-level UI boundary: log the failure and show it to the user.
        print(e)
        return f"""

❌ Error Occurred

{html.escape(str(e))}

"""


# Custom stylesheet passed to gr.Blocks below.
css = """
:root {
    --primary: #6366f1;
    --primary-light: #818cf8;
    --primary-dark: #4f46e5;
    --secondary: #10b981;
    --accent: #f97316;
    --background: #f8fafc;
    --text: #1e293b;
    --card-bg: #ffffff;
    --input-bg: #f1f5f9;
    --error: #ef4444;
    --warning: #f59e0b;
    --success: #10b981;
    --border-radius: 12px;
    --shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.1), 0 2px 4px -1px rgba(0, 0, 0, 0.06);
    --transition: all 0.3s ease;
}

body, .gradio-container {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', sans-serif;
    color: var(--text);
    background-color: var(--background);
}

h1 {
    font-size: 2.5rem !important;
    font-weight: 800 !important;
    text-align: center;
    background: linear-gradient(45deg, var(--primary), var(--accent));
    -webkit-background-clip: text;
    background-clip: text;
    color: transparent !important;
    margin-bottom: 1rem !important;
    padding: 1rem 0 !important;
}

h2 {
    font-size: 1.75rem !important;
    font-weight: 700 !important;
    color: var(--primary-dark) !important;
    margin-top: 1.5rem !important;
    margin-bottom: 1rem !important;
}

h3 {
    font-size: 1.25rem !important;
    font-weight: 600 !important;
    color: var(--primary) !important;
    margin-top: 1rem !important;
    margin-bottom: 0.5rem !important;
    border-bottom: 2px solid var(--primary-light);
    padding-bottom: 0.5rem;
    width: fit-content;
}

/* Main container styling */
.main-container {
    max-width: 1200px;
    margin: 0 auto;
    padding: 2rem;
    background-color: var(--card-bg);
    border-radius: var(--border-radius);
    box-shadow: var(--shadow);
}

/* Button styling */
button {
    border-radius: var(--border-radius) !important;
    font-weight: 600 !important;
    transition: var(--transition) !important;
    text-transform: uppercase;
    letter-spacing: 0.5px;
}

button.primary {
    background: linear-gradient(135deg, var(--primary), var(--primary-dark)) !important;
    border: none !important;
    color: white !important;
    padding: 12px 24px !important;
    box-shadow: 0 4px 6px -1px rgba(99, 102, 241, 0.4) !important;
}

button.primary:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 15px -3px rgba(99, 102, 241, 0.5) !important;
}

/* Login button styling */
#login-button {
    margin: 1.5rem auto !important;
    min-width: 200px !important;
    background: linear-gradient(135deg, var(--primary), var(--primary-dark)) !important;
    color: white !important;
    font-weight: 600 !important;
    padding: 12px 24px !important;
    border-radius: var(--border-radius) !important;
    border: none !important;
    box-shadow: 0 4px 6px -1px rgba(99, 102, 241, 0.4) !important;
    transition: var(--transition) !important;
}

#login-button:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 15px -3px rgba(99, 102, 241, 0.5) !important;
}

/* Toggle button styling */
#toggle-button {
    background: transparent !important;
    color: var(--primary) !important;
    border: 2px solid var(--primary-light) !important;
    padding: 8px 16px !important;
    margin: 1rem 0 !important;
    border-radius: var(--border-radius) !important;
    transition: var(--transition) !important;
    font-weight: 600 !important;
}

#toggle-button:hover {
    background-color: var(--primary-light) !important;
    color: white !important;
}

/* Input fields styling */
input, select, textarea {
    border-radius: var(--border-radius) !important;
    border: 2px solid var(--input-bg) !important;
    padding: 10px 16px !important;
    background-color: var(--input-bg) !important;
    transition: var(--transition) !important;
}

input:focus, select:focus, textarea:focus {
    border-color: var(--primary-light) !important;
    box-shadow: 0 0 0 2px rgba(99, 102, 241, 0.2) !important;
}

/* Dropdown styling with nice hover effects */
.gradio-dropdown > div {
    border-radius: var(--border-radius) !important;
    border: 2px solid var(--input-bg) !important;
    overflow: hidden !important;
    transition: var(--transition) !important;
}

.gradio-dropdown > div:hover {
    border-color: var(--primary-light) !important;
}

/* Radio and checkbox styling */
.gradio-radio, .gradio-checkbox {
    background-color: var(--card-bg) !important;
    border-radius: var(--border-radius) !important;
    padding: 12px !important;
    margin-bottom: 16px !important;
    transition: var(--transition) !important;
    border: 2px solid var(--input-bg) !important;
}

.gradio-radio:hover, .gradio-checkbox:hover {
    border-color: var(--primary-light) !important;
}

.gradio-radio input[type="radio"] + label {
    padding: 8px 12px !important;
    border-radius: 20px !important;
    margin-right: 8px !important;
    background-color: var(--input-bg) !important;
    transition: var(--transition) !important;
}

.gradio-radio input[type="radio"]:checked + label {
    background-color: var(--primary) !important;
    color: white !important;
}

/* Custom spacing and layout */
.gradio-row {
    margin-bottom: 24px !important;
}

.option-row {
    display: flex !important;
    gap: 16px !important;
    margin-bottom: 16px !important;
}

/* Card-like sections */
.card-section {
    background-color: var(--card-bg) !important;
    border-radius: var(--border-radius) !important;
    padding: 20px !important;
    margin-bottom: 24px !important;
    box-shadow: var(--shadow) !important;
    border: 1px solid rgba(0, 0, 0, 0.05) !important;
}

/* Search box styling */
.search-box input {
    border-radius: var(--border-radius) !important;
    border: 2px solid var(--input-bg) !important;
    padding: 12px 20px !important;
    box-shadow: var(--shadow) !important;
    transition: var(--transition) !important;
}

.search-box input:focus {
    border-color: var(--primary) !important;
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.3) !important;
}

/* Model name textbox specific styling */
.model-name-textbox {
    border: 2px solid var(--input-bg) !important;
    border-radius: var(--border-radius) !important;
    transition: var(--transition) !important;
}

.model-name-textbox:focus-within {
    border-color: var(--primary) !important;
    box-shadow: 0 0 0 3px rgba(99, 102, 241, 0.3) !important;
}

/* Success, warning and error boxes */
.success-box, .warning-box, .error-box {
    border-radius: var(--border-radius) !important;
    padding: 20px !important;
    margin: 20px 0 !important;
    box-shadow: var(--shadow) !important;
    animation: fadeIn 0.5s ease-in-out;
}

.success-box {
    background-color: rgba(16, 185, 129, 0.1) !important;
    border: 2px solid var(--success) !important;
}

.warning-box {
    background-color: rgba(245, 158, 11, 0.1) !important;
    border: 2px solid var(--warning) !important;
}

.error-box {
    background-color: rgba(239, 68, 68, 0.1) !important;
    border: 2px solid var(--error) !important;
}

/* Model link styling */
.model-link {
    display: inline-block !important;
    background: linear-gradient(135deg, var(--primary), var(--primary-dark)) !important;
    color: white !important;
    text-decoration: none !important;
    padding: 12px 24px !important;
    border-radius: var(--border-radius) !important;
    font-weight: 600 !important;
    margin-top: 16px !important;
    box-shadow: 0 4px 6px -1px rgba(99, 102, 241, 0.4) !important;
    transition: var(--transition) !important;
}

.model-link:hover {
    transform: translateY(-2px) !important;
    box-shadow: 0 8px 15px -3px rgba(99, 102, 241, 0.5) !important;
}

/* Instructions section */
.instructions-container {
    background-color: rgba(99, 102, 241, 0.05) !important;
    border-left: 4px solid var(--primary) !important;
    padding: 16px !important;
    margin: 24px 0 !important;
    border-radius: 0 var(--border-radius) var(--border-radius) 0 !important;
}

/* Animations */
@keyframes fadeIn {
    from { opacity: 0; transform: translateY(10px); }
    to { opacity: 1; transform: translateY(0); }
}

/* Responsive adjustments */
@media (max-width: 768px) {
    .option-row {
        flex-direction: column !important;
    }
}

/* Add a nice gradient splash to the app */
.gradio-container::before {
    content: "";
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    height: 10px;
    background: linear-gradient(90deg, var(--primary), var(--accent));
    z-index: 100;
}

/* Stylish header */
.app-header {
    display: flex;
    flex-direction: column;
    align-items: center;
    margin-bottom: 2rem;
    position: relative;
}

.app-header::after {
    content: "";
    position: absolute;
    bottom: -10px;
    left: 50%;
    transform: translateX(-50%);
    width: 80px;
    height: 4px;
    background: linear-gradient(90deg, var(--primary), var(--accent));
    border-radius: 2px;
}

/* Section headers */
.section-header {
    display: flex;
    align-items: center;
    margin-bottom: 1rem;
}

.section-header::before {
    content: "⚙️";
    margin-right: 8px;
    font-size: 1.25rem;
}

/* Quantize button special styling */
#quantize-button {
    background: linear-gradient(135deg, var(--primary), var(--accent)) !important;
    color: white !important;
    padding: 16px 32px !important;
    font-size: 1.1rem !important;
    font-weight: 700 !important;
    border: none !important;
    border-radius: var(--border-radius) !important;
    box-shadow: 0 4px 15px -3px rgba(99, 102, 241, 0.5) !important;
    transition: all 0.3s cubic-bezier(0.25, 0.8, 0.25, 1) !important;
    position: relative;
    overflow: hidden;
}

#quantize-button:hover {
    transform: translateY(-3px) !important;
    box-shadow: 0 7px 20px -2px rgba(99, 102, 241, 0.6) !important;
}

#quantize-button::after {
    content: "";
    position: absolute;
    top: 0;
    left: 0;
    width: 100%;
    height: 100%;
    background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 0));
    transform: translateY(-100%);
    transition: transform 0.6s cubic-bezier(0.25, 0.8, 0.25, 1);
}

#quantize-button:hover::after {
    transform: translateY(0);
}
"""

# NOTE(review): the nesting of the layout containers below was reconstructed
# from a whitespace-collapsed source -- confirm against the rendered UI.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald"), css=css) as demo:
    with gr.Column(elem_classes="main-container"):
        with gr.Row(elem_classes="app-header"):
            gr.Markdown(
                """

🤗 BitsAndBytes Model Quantizer

Welcome to the BitsAndBytes Model Quantizer!
"""
            )

        gr.LoginButton(elem_id="login-button", elem_classes="login-button")

        welcome_msg = gr.Markdown(elem_classes="welcome-message")
        # Fill in the greeting on page load.
        demo.load(hello, inputs=None, outputs=welcome_msg)

        instructions = gr.Markdown(
            """

📋 Instructions

  1. Login to your HuggingFace account
  2. Enter the name of the Hugging Face LLM model you want to quantize
  3. Configure quantization settings based on your needs
  4. Optionally, specify a custom name for the quantized model
  5. Click "Quantize Model" to start the process

Note: Processing time depends on model size and your hardware. Check container logs for progress!

""",
            visible=False
        )

        # Tracks whether the instructions panel is currently shown.
        instructions_visible = gr.State(False)
        toggle_button = gr.Button("▼ Show Instructions", elem_id="toggle-button", elem_classes="toggle-button")

        def toggle_instructions(instructions_visible):
            """Flip the instructions panel and return its new visibility, state and button label."""
            new_visibility = not instructions_visible
            new_label = "▲ Hide Instructions" if new_visibility else "▼ Show Instructions"
            return gr.update(visible=new_visibility), new_visibility, gr.update(value=new_label)

        toggle_button.click(toggle_instructions, instructions_visible, [instructions, instructions_visible, toggle_button])

        with gr.Row(elem_classes="app-content"):
            with gr.Column(scale=1, elem_classes="card-section"):
                with gr.Row(elem_classes="search-section"):
                    model_name = HuggingfaceHubSearch(
                        label="🔍 Select Model",
                        placeholder=" Search for model on Huggingface Hub...",
                        search_type="model",
                        elem_classes="search-box"
                    )

                with gr.Row(elem_classes="section-header"):
                    gr.Markdown("### Quantization Settings")

                with gr.Column(elem_classes="settings-group"):
                    gr.Markdown("**Quantization Type**", elem_classes="setting-label")
                    quant_type_4 = gr.Dropdown(
                        choices=["fp4", "nf4"],
                        value="fp4",
                        label="Format",
                        info="The quantization data type in bnb.nn.Linear4Bit layers",
                        show_label=False
                    )

                    gr.Markdown("**Compute Settings**", elem_classes="setting-label")
                    compute_type_4 = gr.Dropdown(
                        choices=["float16", "bfloat16", "float32"],
                        value="float32",
                        label="Compute Type",
                        info="The compute dtype for matrix multiplication"
                    )
                    quant_storage_4 = gr.Dropdown(
                        choices=["float16", "float32", "int8", "uint8", "bfloat16"],
                        value="uint8",
                        label="Storage Type",
                        info="The storage type for quantized weights"
                    )

                    gr.Markdown("**Double Quantization**", elem_classes="setting-label")
                    double_quant_4 = gr.Radio(
                        ["False", "True"],
                        label="Use Double Quantization",
                        info="Further compress model size with nested quantization",
                        value="False",
                    )

                with gr.Row(elem_classes="section-header"):
                    gr.Markdown("### Output Settings")

                with gr.Column(elem_classes="settings-group"):
                    quantized_model_name = gr.Textbox(
                        label="Custom Model Name (Optional)",
                        info="Leave blank to use default naming convention",
                        placeholder="my-quantized-model",
                        elem_classes="model-name-textbox"
                    )
                    public = gr.Checkbox(
                        label="Make model public",
                        info="If checked, your model will be publicly accessible on Hugging Face Hub",
                        value=False,
                    )

            with gr.Column(scale=1, elem_classes="card-section"):
                with gr.Row():
                    gr.Markdown("""
### 📊 Quantization Benefits

⚡ Lower Memory Usage: Reduce model size by up to 75%

🚀 Faster Inference: Achieve better performance on resource-constrained hardware

💻 Wider Compatibility: Run models on devices with limited VRAM

### 🔧 Configuration Guide

Quantization Type:

Double Quantization: Enable for additional compression with minimal quality loss

""")

        with gr.Row():
            quantize_button = gr.Button("🚀 Quantize Model", variant="primary", elem_id="quantize-button")
            output_link = gr.HTML(label="Results", elem_classes="results-container")

        # Add interactive footer with links
        gr.Markdown("""

Powered by Hugging Face and BitsAndBytes

""")

    # `profile` and `oauth_token` are not listed as inputs here even though
    # quantize_and_save takes them first -- presumably Gradio supplies them
    # from the type annotations; confirm against the Gradio OAuth docs.
    quantize_button.click(
        fn=quantize_and_save,
        inputs=[model_name, quant_type_4, double_quant_4, compute_type_4, quant_storage_4, quantized_model_name, public],
        outputs=[output_link]
    )


if __name__ == "__main__":
    demo.launch(share=True)