aifeifei798 committed on
Commit 035246c · verified · 1 Parent(s): 701871d

Update bit4-chat.py

Files changed (1)
  1. bit4-chat.py +38 -38
bit4-chat.py CHANGED
@@ -1,38 +1,38 @@
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
- import torch
-
- # Configure quantization parameters
- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=True,  # Load the model weights in 4-bit precision
-     bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
-     bnb_4bit_quant_type="nf4",  # Use "nf4" quantization type
-     bnb_4bit_use_double_quant=True,  # Enable double quantization
- )
-
- # Define the model name and path for the quantized model
- model_name = "./Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit"
-
- # Load the quantized model with the specified configuration
- model = AutoModelForCausalLM.from_pretrained(
-     model_name,
-     quantization_config=quantization_config,
-     device_map="auto"  # Automatically allocate devices
- )
-
- # Load the tokenizer associated with the model
- tokenizer = AutoTokenizer.from_pretrained(model_name)
-
- # Determine the device where the model is located
- device = model.device
-
- # Prepare input text and move it to the same device as the model
- input_text = "Once upon a time"
- inputs = tokenizer(input_text, return_tensors="pt").to(device)
-
- # Perform inference
- with torch.no_grad():
-     outputs = model.generate(**inputs, max_length=50)
-
- # Decode the generated text
- generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
- print(generated_text)
 
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ import torch
+
+ # Configure quantization parameters
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,  # Load the model weights in 4-bit precision
+     bnb_4bit_compute_dtype=torch.bfloat16,  # Use bfloat16 for computation
+     bnb_4bit_quant_type="nf4",  # Use "nf4" quantization type
+     bnb_4bit_use_double_quant=True,  # Enable double quantization
+ )
+
+ # Define the model name and path for the quantized model
+ model_name = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-bnb-4bit"
+
+ # Load the quantized model with the specified configuration
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     quantization_config=quantization_config,
+     device_map="auto"  # Automatically allocate devices
+ )
+
+ # Load the tokenizer associated with the model
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+ # Determine the device where the model is located
+ device = model.device
+
+ # Prepare input text and move it to the same device as the model
+ input_text = "Once upon a time"
+ inputs = tokenizer(input_text, return_tensors="pt").to(device)
+
+ # Perform inference
+ with torch.no_grad():
+     outputs = model.generate(**inputs, max_length=50)
+
+ # Decode the generated text
+ generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+ print(generated_text)
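
The script in this commit runs plain text completion on the prompt "Once upon a time". Since the file is named bit4-chat.py, a chat-style call is the natural next step; the sketch below is a hypothetical variant that is not part of this commit. It assumes the tokenizer ships a chat template (as Llama-3.1-based models typically do) and reuses the model, tokenizer, and device objects defined above.

# Hypothetical chat-style variant (sketch only, not part of this commit).
# Assumes a chat template is available on the tokenizer and reuses the
# model, tokenizer, and device objects from the script above.
messages = [{"role": "user", "content": "Tell me a short story."}]
chat_inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(device)

with torch.no_grad():
    chat_outputs = model.generate(chat_inputs, max_new_tokens=100)

# Decode only the newly generated tokens, skipping the prompt
print(tokenizer.decode(chat_outputs[0][chat_inputs.shape[-1]:], skip_special_tokens=True))

Using max_new_tokens instead of max_length here is an assumption on my part; it bounds only the generated continuation rather than prompt plus continuation, which is usually what a chat loop wants.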