"""Load a 4-bit (NF4) bitsandbytes-quantized causal LM and print a short completion.

Requires ``transformers``, ``bitsandbytes``, and ``torch``; ``device_map="auto"``
places the model's layers on whatever accelerator(s) are available.
"""

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

import torch

# 4-bit NF4 quantization with nested (double) quantization of the
# quantization constants; bfloat16 is used for the de-quantized compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# NOTE(review): the directory name suggests an already-4bit checkpoint; if the
# weights are pre-quantized, passing quantization_config again is redundant —
# confirm whether this checkpoint holds float or bnb-4bit weights.
model_name = "./Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    # Let accelerate place (and possibly shard) layers across devices.
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# With device_map="auto" the model may span devices; model.device is the
# entry-point device that inputs must live on.
device = model.device

input_text = "Once upon a time"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# inference_mode is a stricter, slightly faster no_grad for pure inference.
with torch.inference_mode():
    outputs = model.generate(
        **inputs,
        # Was max_length=50, which counts the prompt tokens and therefore
        # produced fewer than 50 generated tokens; max_new_tokens is exact.
        max_new_tokens=50,
        # Llama-family tokenizers define no pad token; reuse EOS to avoid
        # the open-ended-generation warning.
        pad_token_id=tokenizer.eos_token_id,
    )

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)