|
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig |
|
import torch |
|
import bitsandbytes as bnb |
|
|
|
|
|
# Hugging Face Hub id of the checkpoint to quantize.
model_name = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

# Module names handed to `llm_int8_skip_modules` below.
# NOTE(review): per the transformers docs these modules are excluded from
# quantization; several entries look aimed at multimodal architectures --
# confirm they are intentional for this checkpoint.
_skip_modules = [
    "lm_head",
    "multi_modal_projector",
    "merger",
    "modality_projection",
    "model.layers.1.mlp",
]

# 4-bit NF4 bitsandbytes settings with double quantization and bfloat16
# as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_skip_modules=_skip_modules,
)
|
|
|
|
|
# Load the checkpoint with the 4-bit quantization settings applied;
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    # Keep the modules that are NOT quantized (e.g. those listed in
    # llm_int8_skip_modules) in bfloat16 so they match
    # bnb_4bit_compute_dtype. Without an explicit dtype, transformers
    # overrides the unset value to float16 for bitsandbytes loads.
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
|
|
# Derive the output directory from the checkpoint id (the repo name without
# the organisation prefix) so the model and tokenizer are always written to
# the same place and the name follows `model_name` if it changes.
# For the current id this yields "Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit".
output_dir = f"{model_name.rsplit('/', 1)[-1]}-bnb-4bit"

# Write the quantized weights/config and the tokenizer files side by side.
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)