This PEFT adapter was trained on top of unsloth/meta-llama-3.1-8b-instruct-bnb-4bit using GRPO on the openai/gsm8k dataset.
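
The training code itself is not included in this repository. The sketch below shows what a GRPO run on gsm8k with Unsloth could look like, assuming trl's GRPOTrainer, a simple exact-match reward on the final answer, and illustrative hyperparameters; the actual rewards, prompt mapping, and settings used for this adapter may differ.

from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>"
)

# Load the 4-bit base and attach trainable LoRA adapters
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/meta-llama-3.1-8b-instruct-bnb-4bit",
    max_seq_length=1024,
    load_in_4bit=True,
    fast_inference=True,
    max_lora_rank=32,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# gsm8k answers end in "#### <number>"; keep only that number as the gold target
def to_prompt(example):
    return {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": example["question"]},
        ],
        "answer": example["answer"].split("####")[-1].strip(),
    }

dataset = load_dataset("openai/gsm8k", "main", split="train").map(to_prompt)

# Illustrative reward: 2.0 if the gold number appears in the completion, else 0.0
def correctness_reward(prompts, completions, answer, **kwargs):
    responses = [completion[0]["content"] for completion in completions]
    return [2.0 if gold in response else 0.0 for response, gold in zip(responses, answer)]

training_args = GRPOConfig(
    use_vllm=True,                  # sample rollouts with vLLM
    learning_rate=5e-6,
    per_device_train_batch_size=8,  # must be divisible by num_generations
    num_generations=8,              # completions sampled per prompt
    max_prompt_length=256,
    max_completion_length=512,
    max_steps=250,
    output_dir="outputs",
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[correctness_reward],
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
model.save_pretrained("grpo_lora")

The inference script below loads the published adapter onto the base model and runs an interactive chat loop with vLLM fast generation.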


import time
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)
from unsloth import is_bfloat16_supported
import torch

# Define model parameters
max_seq_length = 13000  # Increase for longer contexts if needed
lora_rank = 32        # Larger rank = potentially better performance but slower inference

# Define the system prompt and the expected response format (a parsing sketch follows the script)
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Load the base model in 4-bit (the adapter was trained on a bnb-4bit base)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # 4-bit quantized base, matching the bnb-4bit checkpoint used for training
    fast_inference=True,  # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6,  # Adjust if you run into memory issues
)

# Load the LoRA weights using PEFT
from peft import PeftModel
model = PeftModel.from_pretrained(model, "miike-ai/Llama-3.1-8b-gsm8k-r")

# Import sampling parameters from vLLM
from vllm import SamplingParams

print("Model fully loaded into memory. Ready to chat!")
print("Type 'exit' or 'quit' to stop.\n")

# Interactive chat loop
while True:
    try:
        user_input = input("User: ")
    except KeyboardInterrupt:
        print("\nExiting...")
        break

    # Exit the chat loop if the user types 'exit' or 'quit'
    if user_input.strip().lower() in {"exit", "quit"}:
        print("Exiting...")
        break

    # Prepare the prompt with the system prompt and user input
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_input},
        ],
        tokenize=False,
        add_generation_prompt=True
    )

    # Set sampling parameters for generation
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=1024
    )

    # Generate the full response with vLLM, timing only the generation itself
    print("\nAssistant: ", end="", flush=True)
    start_time = time.time()
    outputs = model.fast_generate(prompt, sampling_params=sampling_params)
    inference_time = time.time() - start_time
    response_text = outputs[0].outputs[0].text

    # Print the response character by character to simulate streaming
    for ch in response_text:
        print(ch, end="", flush=True)
        time.sleep(0.02)  # Small delay for readability

    print(f"\nInference time: {inference_time:.2f} seconds\n")