mobiuslabsgmbh
/

gemma-3-12b-it_4bitgs64_bfp16_hqq_hf

8-bit precision

Model card Files Files and versions Community

mobicham committed on Mar 19

Commit

666b3db

·

verified ·

1 Parent(s): 018ab02

Create README.md

Files changed (1) hide show

README.md +61 -0

README.md ADDED Viewed

	@@ -0,0 +1,61 @@

+---
+license: gemma
+base_model:
+- google/gemma-3-12b-it
+---
+This is an HQQ-quantized version (4-bit, group-size=64) of the <a href="https://huggingface.co/google/gemma-3-12b-it">gemma-3-12b-it</a> model.
+## Usage
+```python
+import torch
+backend       = "torchao_int4"
+compute_dtype = torch.bfloat16 if backend=="torchao_int4" else torch.float16
+cache_dir     = None
+model_id      = 'mobiuslabsgmbh/gemma-3-12b-it_4bitgs64_bfp16_hqq_hf'
+#Load model
+from transformers import Gemma3ForConditionalGeneration, AutoProcessor
+processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
+model = Gemma3ForConditionalGeneration.from_pretrained(
+    model_id,
+    torch_dtype=compute_dtype,
+    attn_implementation="sdpa",
+    cache_dir=cache_dir,
+    device_map="cuda",
+)
+#Optimize: prepare the language model for inference with the selected backend
+from hqq.utils.patching import prepare_for_inference
+prepare_for_inference(model.language_model, backend=backend, verbose=True)
+############################################################################
+#Inference
+messages = [
+    {
+        "role": "system",
+        "content": [{"type": "text", "text": "You are a helpful assistant."}]
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+            {"type": "text", "text": "Describe this image in detail."}
+        ]
+    }
+]
+inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=compute_dtype)
+input_len = inputs["input_ids"].shape[-1]
+with torch.inference_mode():
+    generation = model.generate(**inputs, max_new_tokens=128, do_sample=False)[0][input_len:]
+    decoded    = processor.decode(generation, skip_special_tokens=True)
+print(decoded)
+```