Update README.md
README.md CHANGED
@@ -95,9 +95,12 @@ import torch
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("EpistemeAI2/Fireball-12B-v1.13a-philosophers")
 
-
-quantization_config = BitsAndBytesConfig(
-
+# Configure 4-bit quantization and enable CPU offloading
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    llm_int8_enable_fp32_cpu_offload=True
+)
+# Load the model with 4-bit quantization and CPU offloading
 model = AutoModelForCausalLM.from_pretrained(
     "EpistemeAI2/Fireball-12B-v1.13a-philosophers",
     quantization_config=quantization_config,
@@ -122,7 +125,6 @@ output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
 # Print the output
 print(output_text)
-
 ```
 
 Google colab - [link](https://colab.research.google.com/drive/1ZgUrbonMlK05iQ-tgWZ_lFmUZFbWZNnM?usp=sharing)
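For reference, a minimal end-to-end sketch of the snippet as it reads after this change, assuming the `transformers`, `accelerate`, and `bitsandbytes` packages are installed. The quantization config, model ID, and the decode/print lines come from the diff; the prompt text, `device_map="auto"`, and `max_new_tokens` value are illustrative assumptions, not part of the commit.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("EpistemeAI2/Fireball-12B-v1.13a-philosophers")

# Configure 4-bit quantization and enable CPU offloading (as added in this commit)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# Load the model with 4-bit quantization and CPU offloading
model = AutoModelForCausalLM.from_pretrained(
    "EpistemeAI2/Fireball-12B-v1.13a-philosophers",
    quantization_config=quantization_config,
    device_map="auto",  # assumption: let accelerate place layers on GPU/CPU
)

# Illustrative prompt and generation settings (not shown in the diff)
prompt = "What is the categorical imperative?"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=128)

# Decode and print the output (matches the README's closing lines)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(output_text)
```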