from unsloth import FastModel
import torch

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastModel.from_pretrained(
    model_name = "NewEden/Gemma-LN-merged",
    max_seq_length = 8192, # Choose any for long context!
    load_in_4bit = False,  # 4 bit quantization to reduce memory
    load_in_8bit = False,  # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)

"""We now add LoRA adapters so we only need to update a small amount of parameters!"""

model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # Should leave on always!
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    r = 64,          # Larger = higher accuracy, but might overfit
    lora_alpha = 32, # Recommended alpha == r at least
    lora_dropout = 0.1,
    bias = "none",
    random_state = 3407,
)

from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)

from datasets import load_dataset
dataset = load_dataset("NewEden/Light-Novels-Roleplay-Logs-Books-Oh-My-duplicate-turns-removed", split = "train")

"""We now use `standardize_data_formats` to try converting datasets to the correct format for finetuning purposes!"""

from unsloth.chat_templates import standardize_data_formats
dataset = standardize_data_formats(dataset)

"""Let's see what row 100 looks like!"""

dataset[100]

"""We now have to apply the chat template for `Gemma-3` onto the conversations, and save it to `text`."""

def apply_chat_template(examples):
    texts = tokenizer.apply_chat_template(examples["conversations"])
    return { "text" : texts }
pass
dataset = dataset.map(apply_chat_template, batched = True)

"""Let's see how the chat template did! Notice how `Gemma-3` by default adds a `<bos>`!"""

dataset[100]["text"]

"""
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We train for 2 epochs here (`num_train_epochs = 2`); for a quick test run, set `max_steps` (e.g. `max_steps = 60`) in `SFTConfig`, which overrides the epoch count.
"""

from trl import SFTTrainer, SFTConfig

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4,
        warmup_steps = 50,
        num_train_epochs = 2,
        learning_rate = 1e-5,
        max_grad_norm = 0.2,
        logging_steps = 1,
        optim = "paged_adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 3407,
        report_to = "wandb",
        save_strategy = "epoch",
    ),
)
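"""Optional sanity check (not part of the original notebook): since we picked `max_seq_length = 8192`, it can help to measure how long the templated conversations actually are, so you know whether sequences are being truncated or context is being wasted. A minimal sketch; the 500-row sample size is an arbitrary choice:"""

if False: # Change to True to inspect tokenized lengths
    sample = dataset.select(range(min(500, len(dataset))))
    # Tokenize the templated "text" column and sort the lengths
    lengths = sorted(len(ids) for ids in tokenizer(sample["text"]).input_ids)
    print(f"shortest = {lengths[0]}, median = {lengths[len(lengths)//2]}, longest = {lengths[-1]}")
    print(f"rows longer than 8192 tokens: {sum(l > 8192 for l in lengths)}")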
"""We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs. This helps increase the accuracy of finetunes!"""

from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<start_of_turn>user\n",
    response_part = "<start_of_turn>model\n",
)

"""Let's verify masking the instruction part is done! Let's print the 100th row again:"""

tokenizer.decode(trainer.train_dataset[100]["input_ids"])

"""Now let's print the masked out example - you should see only the answer is present:"""

tokenizer.decode([tokenizer.pad_token_id if x == -100 else x for x in trainer.train_dataset[100]["labels"]]).replace(tokenizer.pad_token, " ")

# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

"""Let's train the model! To resume a training run, use `trainer.train(resume_from_checkpoint = True)`."""

trainer_stats = trainer.train()

# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

"""
### Inference
Let's run the model via Unsloth native inference! According to the `Gemma-3` team, the recommended settings for inference are `temperature = 1.0, top_p = 0.95, top_k = 64`.
"""

from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "gemma-3",
)
messages = [{
    "role": "user",
    "content": [{
        "type" : "text",
        "text" : "Continue the sequence: 1, 1, 2, 3, 5, 8,",
    }]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
)
outputs = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
)
tokenizer.batch_decode(outputs)

"""You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!"""

messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "Why is the sky blue?",}]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
)

from transformers import TextStreamer
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)
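"""Since the training data is multi-turn roleplay logs, you may also want to see how the model continues a conversation that already has prior turns. A minimal sketch (not in the original notebook; the turns below are invented for illustration, and we rely on the Gemma-3 chat template mapping the `assistant` role to `model`, as it does for the training conversations):"""

messages = [
    {"role" : "user",      "content" : [{"type" : "text", "text" : "You are a knight guarding the northern gate. I approach at dusk."}]},
    {"role" : "assistant", "content" : [{"type" : "text", "text" : "The knight lowers his halberd and squints through the fading light. \"State your business, traveler.\""}]},
    {"role" : "user",      "content" : [{"type" : "text", "text" : "I hold up a sealed letter bearing the royal crest."}]},
]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
)
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 128, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)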
"""
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, see further below!
"""

model.save_pretrained("gemma-3")  # Local saving
tokenizer.save_pretrained("gemma-3")
# model.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving
# tokenizer.push_to_hub("HF_ACCOUNT/gemma-3", token = "...") # Online saving

"""Now if you want to load the LoRA adapters we just saved for inference, set `False` to `True`:"""

if False:
    from unsloth import FastModel
    model, tokenizer = FastModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = 2048,
        load_in_4bit = True,
    )

messages = [{
    "role": "user",
    "content": [{"type" : "text", "text" : "What is Gemma-3?",}]
}]
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True, # Must add for generation
)

from transformers import TextStreamer
_ = model.generate(
    **tokenizer([text], return_tensors = "pt").to("cuda"),
    max_new_tokens = 64, # Increase for longer outputs!
    # Recommended Gemma-3 settings!
    temperature = 1.0, top_p = 0.95, top_k = 64,
    streamer = TextStreamer(tokenizer, skip_prompt = True),
)

"""
### Saving to float16 for VLLM
We also support saving to `float16` directly for deployment! We save it in the folder `gemma-3-finetune`. Set `if False` to `if True` to let it run!
"""

if False: # Change to True to save finetune!
    model.save_pretrained_merged("gemma-3-finetune", tokenizer)

"""If you want to upload / push to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!"""

if False: # Change to True to upload finetune
    model.push_to_hub_merged(
        "HF_ACCOUNT/gemma-3-finetune", tokenizer,
        token = "hf_...",
    )

"""
### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now for all models! For now, you can convert easily to `Q8_0`, `F16` or `BF16` precision. `Q4_K_M` for 4bit will come later!
"""

if False: # Change to True to save to GGUF
    model.save_pretrained_gguf(
        "gemma-3-finetune",
        quantization_type = "Q8_0", # For now only Q8_0, BF16, F16 supported
    )

"""Likewise, if you want to instead push the GGUF to your Hugging Face account, set `if False` to `if True` and add your Hugging Face token and upload location!"""

if False: # Change to True to upload GGUF
    model.push_to_hub_gguf(
        "gemma-3-finetune",
        quantization_type = "Q8_0", # Only Q8_0, BF16, F16 supported
        repo_id = "HF_ACCOUNT/gemma-finetune-gguf",
        token = "hf_...",
    )

"""Now, use the `gemma-3-finetune.gguf` file or `gemma-3-finetune-Q4_K_M.gguf` file in llama.cpp or a UI based system like Jan or Open WebUI. You can install Jan [here](https://github.com/janhq/jan) and Open WebUI [here](https://github.com/open-webui/open-webui)."""
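"""As a quick local smoke test of the exported GGUF, you can also load it with the `llama-cpp-python` bindings instead of the llama.cpp CLI. This sketch is not part of the original notebook: it assumes `pip install llama-cpp-python`, and the filename below should be adjusted to whatever `save_pretrained_gguf` actually wrote into the `gemma-3-finetune` folder:"""

if False: # Change to True after exporting the GGUF
    from llama_cpp import Llama
    llm = Llama(
        model_path = "gemma-3-finetune/gemma-3-finetune.Q8_0.gguf", # Adjust to the real filename
        n_ctx = 8192, # Match the training context length
    )
    out = llm.create_chat_completion(
        messages = [{"role" : "user", "content" : "Why is the sky blue?"}],
        max_tokens = 64,
        # Recommended Gemma-3 settings!
        temperature = 1.0, top_p = 0.95, top_k = 64,
    )
    print(out["choices"][0]["message"]["content"])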
"""
And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/unsloth) channel! If you find any bugs, want to keep up to date with the latest LLM news, need help, or want to join projects, feel free to join our Discord!

Some other links:
1. Train your own reasoning model - Llama GRPO notebook [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-GRPO.ipynb)
2. Saving finetunes to Ollama. [Free notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-Ollama.ipynb)
3. Llama 3.2 Vision finetuning - Radiography use case. [Free Colab](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb)
4. See notebooks for DPO, ORPO, Continued pretraining, conversational finetuning and more on our [documentation](https://docs.unsloth.ai/get-started/unsloth-notebooks)!
Join Discord if you need help + ⭐️ Star us on Github ⭐️
"""