---
license: gemma
base_model:
- google/gemma-3-12b-it
---
This is an HQQ-quantized version (4-bit, group-size=64) of the <a href="https://huggingface.co/google/gemma-3-12b-it">gemma-3-12b-it</a> model.
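
Running the snippet below requires the `transformers` and `hqq` Python packages (it imports `hqq.utils.patching`) and a CUDA GPU, since the model is loaded with `device_map="cuda"` and bfloat16 compute.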

## Usage
```python
import torch

backend = "torchao_int4"
compute_dtype = torch.bfloat16 if backend == "torchao_int4" else torch.float16
cache_dir = None
model_id = 'mobiuslabsgmbh/gemma-3-12b-it_4bitgs64_bfp16_hqq_hf'

# Load the model and processor
from transformers import Gemma3ForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained(model_id, cache_dir=cache_dir)
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=compute_dtype,
    attn_implementation="sdpa",
    cache_dir=cache_dir,
    device_map="cuda",
)

# Patch the quantized layers with the optimized inference backend
from hqq.utils.patching import prepare_for_inference
prepare_for_inference(model.language_model, backend=backend, verbose=True)

############################################################################
# Inference: describe an image
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant."}]
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "Describe this image in detail."}
        ]
    }
]

inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=compute_dtype)

input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**inputs, max_new_tokens=128, do_sample=False)[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)
```
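
The same chat-template flow also works for text-only prompts. Below is a minimal sketch that reuses the `model` and `processor` loaded above; the prompt text is only illustrative:

```python
# Text-only prompt, reusing the already-loaded model and processor from the snippet above
messages = [
    {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
    {"role": "user", "content": [{"type": "text", "text": "Summarize what 4-bit weight quantization does in two sentences."}]},
]

inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)

input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    output = model.generate(**inputs, max_new_tokens=64, do_sample=False)[0][input_len:]

print(processor.decode(output, skip_special_tokens=True))
```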