from unsloth import FastModel
import torch
fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",
    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth
model, tokenizer = FastModel.from_pretrained(
    model_name = "NewEden/Gemma-LN-merged",
    max_seq_length = 8192, # Choose any for long context!
    load_in_4bit = False, # Set True for 4bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate than 4bit, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)
"""We now add LoRA adapters so we only need to update a small amount of parameters!"""
model = FastModel.get_peft_model(
    model,
    finetune_vision_layers = False, # Turn off for just text!
    finetune_language_layers = True, # Should leave on!
    finetune_attention_modules = True, # Attention is good for GRPO
    finetune_mlp_modules = True, # Should leave on always!
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    r = 64, # Larger = higher accuracy, but might overfit
    lora_alpha = 32, # alpha == r is often recommended; alpha < r down-weights the LoRA update
    lora_dropout = 0.1,
    bias = "none",
    random_state = 3407,
)
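"""As a quick sanity check (a minimal sketch using plain PyTorch, not an Unsloth API), we can count how few parameters the LoRA adapters actually make trainable:"""
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100 * trainable / total:.2f}%)")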
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
from datasets import load_dataset
dataset = load_dataset("NewEden/Light-Novels-Roleplay-Logs-Books-Oh-My-duplicate-turns-removed", split = "train")
"""We now use `standardize_data_formats` to try converting datasets to the correct format for finetuning purposes!"""
from unsloth.chat_templates import standardize_data_formats
dataset = standardize_data_formats(dataset)
"""Let's see how row 100 looks like!"""
dataset[100]
"""We now have to apply the chat template for `Gemma-3` onto the conversations, and save it to `text`"""
def apply_chat_template(examples):
texts = tokenizer.apply_chat_template(examples["conversations"])
return { "text" : texts }
pass
dataset = dataset.map(apply_chat_template, batched = True)
"""Let's see how the chat template did! Notice `Gemma-3` default adds a `<bos>`!"""
dataset[100]["text"]
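"""Since the template itself already emits `<bos>` and the tokenizer typically prepends another one at encoding time, a duplicated `<bos>` is an easy silent bug. A quick check (a small sketch, assuming the tokenizer exposes `bos_token_id`):"""
sample_ids = tokenizer(dataset[100]["text"]).input_ids
print("BOS count:", sample_ids.count(tokenizer.bos_token_id)) # 2 means a duplicate slipped in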
"""<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but you can set `num_train_epochs=1` for a full run, and turn off `max_steps=None`.
"""
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=None,
    args=SFTConfig(
        dataset_text_field="text",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4, # Effective batch size = 1 * 4 = 4
        warmup_steps=50,
        num_train_epochs=2,
        learning_rate=1e-5,
        max_grad_norm=0.2, # Aggressive gradient clipping for stability
        logging_steps=1,
        optim="paged_adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=3407,
        report_to="wandb", # Set to "none" if you don't use Weights & Biases
        save_strategy="epoch",
    ),
)
"""We also use Unsloth's `train_on_completions` method to only train on the assistant outputs and ignore the loss on the user's inputs. This helps increase accuracy of finetunes!"""
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)
"""Let's verify masking the instruction part is done! Let's print the 100th row again:"""
tokenizer.decode(trainer.train_dataset[100]["input_ids"])
"""Now let's print the masked out example - you should see only the answer is present:"""
tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")
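"""As an extra check (a small sketch, not an Unsloth API), we can measure how much of the sequence is actually trained on, since label positions set to -100 are ignored by the loss:"""
labels = trainer.train_dataset[100]["labels"]
masked = sum(1 for x in labels if x == -100)
print(f"Masked {masked} of {len(labels)} tokens ({100 * masked / len(labels):.1f}% ignored by the loss)")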
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")
"""Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`"""
trainer_stats = trainer.train()
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
"""<a name="Inference"></a>
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`
"""
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "Continue the sequence: 1, 1, 2, 3, 5, 8,",
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False, # Return a string; we tokenize below
    add_generation_prompt = True, # Must add for generation
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)
""" You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!"""
messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "Why is the sky blue?",}]
}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False, # Return a string; we tokenize below
    add_generation_prompt = True, # Must add for generation
)
from transformers import TextStreamer
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
"""<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.
**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
"""
model.save_pretrained("gemma-3") # Local saving
tokenizer.save_pretrained("gemma-3")
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
"""Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:"""
if False:
    from unsloth import FastModel
    model, tokenizer = FastModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        load_in_4bit = True,
    )
messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "What is Gemma-3?",}]
}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize = False, # Return a string; we tokenize below
    add_generation_prompt = True, # Must add for generation
)
from transformers import TextStreamer
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
"""### Saving to float16 for VLLM
We also support saving to `float16` directly for deployment! We save it in the folder `gemma-3-finetune`. Set `if False` to `if True` to let it run!
"""
if False: # Change to True to save finetune!
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)
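"""Once the merged weights are saved, they can be loaded with vLLM's Python API. A minimal sketch (assumes vLLM is installed and the `gemma-3-finetune` folder above was actually saved; not part of this notebook's pipeline):"""
if False: # Sketch only - change to True after saving the merged model
    from vllm import LLM, SamplingParams
    llm = LLM(model = "gemma-3-finetune")
    params = SamplingParams(temperature = 1.0, top_p = 0.95, top_k = 64, max_tokens = 64)
    print(llm.generate(["Why is the sky blue?"], params)[0].outputs[0].text)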
"""If you want to upload / push to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!"""
if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3-finetune", tokenizer,
        token = "hf_...",
    )
"""### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now for all models! For now, you can convert easily to `Q8_0, F16 or BF16` precision. `Q4_K_M` for 4bit will come later!
"""
if False: # Change to True to save to GGUF
    model.save_pretrained_gguf(
        "gemma-3-finetune",
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )
"""Likewise, if you want to instead push to GGUF to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!"""
if False: # Change to True to upload GGUF
    model.push_to_hub_gguf(
        "gemma-3-finetune",
        quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
        repo_id = "HF_ACCOUNT/gemma-finetune-gguf",
        token = "hf_...",
    )
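"""The exported GGUF can also be loaded locally with `llama-cpp-python`. A minimal sketch (the exact output filename depends on the quantization type and is assumed here):"""
if False: # Sketch only - requires llama-cpp-python and the exported GGUF
    from llama_cpp import Llama
    llm = Llama(model_path = "gemma-3-finetune/gemma-3-finetune.Q8_0.gguf") # filename is an assumption
    out = llm.create_chat_completion(
        messages = [{"role": "user", "content": "Why is the sky blue?"}],
        max_tokens = 64,
    )
    print(out["choices"][0]["message"]["content"])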
"""Now, use the `gemma-3-finetune.gguf` file or `gemma-3-finetune-Q4_K_M.gguf` file in llama.cpp or a UI based system like Jan or Open WebUI. You can install Jan [here](https://github.com/janhq/jan) and Open WebUI [here](https://github.com/open-webui/open-webui)
And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!
Some other links:
1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
6. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!
<div class="align-center">
<a href="https://unsloth.ai"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
<a href="https://discord.gg/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
<a href="https://docs.unsloth.ai/"><img src="https://github.com/unslothai/unsloth/blob/main/images/documentation%20green%20button.png?raw=true" width="125"></a>
Join Discord if you need help + ⭐️ <i>Star us on <a href="https://github.com/unslothai/unsloth">Github</a> </i> ⭐️
</div>
""" |