aifeifei798 committed
Commit 8a7c610 · verified · 1 Parent(s): 20b9178

Update tobit4.py

Files changed (1): tobit4.py (+35 −26)
tobit4.py CHANGED
@@ -1,26 +1,35 @@
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
- import torch
- import bitsandbytes as bnb
-
- # Load the model and tokenizer
- model_name = "./Llama-3.1-Nemotron-Nano-8B-v1"
-
- # Configure quantization parameters
- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.bfloat16,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_use_double_quant=True,
-     llm_int8_skip_modules=["lm_head", "multi_modal_projector", "merger", "modality_projection", "model.layers.1.mlp"],
- )
-
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     quantization_config=quantization_config,
-     device_map="auto"  # Automatically allocate devices
- )
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- # Save the quantized model
- model.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit")
- tokenizer.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit")
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ import torch
+ import bitsandbytes as bnb
+
+ # Define the model name and path
+ model_name = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
+
+ # Configure quantization parameters
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,  # Load the model weights in 4-bit precision
+     bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
+     bnb_4bit_quant_type="nf4",  # Use the "nf4" quantization type
+     bnb_4bit_use_double_quant=True,  # Enable double quantization
+     llm_int8_skip_modules=[  # Specify modules to skip during quantization
+         "lm_head",
+         "multi_modal_projector",
+         "merger",
+         "modality_projection",
+         "model.layers.1.mlp"
+     ],
+ )
+
+ # Load the pre-trained model with the specified quantization configuration
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     quantization_config=quantization_config,
+     device_map="auto"  # Automatically allocate devices
+ )
+
+ # Load the tokenizer associated with the model
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # Save the quantized model and tokenizer to a specified directory
+ model.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit")
+ tokenizer.save_pretrained("Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit")