RLHF (collection): some RLHF experiments using GRPO and DPO.
A 3-billion-parameter Qwen2.5 model fine-tuned with Group-Relative Policy Optimization (GRPO) on the GSM8K grade-school math dataset. The goal is to turn the compact 3B model into a lightweight but capable step-by-step math reasoner that runs on a single consumer GPU.
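The snippet below loads the checkpoint with transformers, wraps a short word problem in the model's chat template behind a step-by-step tutor system prompt, and samples a solution.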
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "BounharAbdelaziz/Qwen2.5-3B-GRPO-Math-GSM8K"

# Load the tokenizer and model; device_map="auto" places the weights on the available GPU.
tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
)

problem = "If a book costs $7 and a pen costs $2, how much do 3 books and 4 pens cost in total?"

# Build the prompt with the model's chat template.
messages = [
    {"role": "system", "content": "You are a step-by-step math tutor."},
    {"role": "user", "content": problem},
]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Sample a step-by-step solution; do_sample=True is needed for temperature to take effect.
out_ids = model.generate(
    **tok(prompt, return_tensors="pt").to(model.device),
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
)
print(tok.decode(out_ids[0], skip_special_tokens=True))
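For reference, a fine-tune of this kind can be reproduced with TRL's GRPOTrainer. The sketch below is a minimal, hypothetical recipe under assumed settings, not the actual training script used for this checkpoint: the base model (Qwen/Qwen2.5-3B-Instruct), the reward function, and all hyperparameters are assumptions. It loads GSM8K, rewards completions whose text contains the gold final answer (GSM8K solutions end with "#### <number>"), and runs GRPO.

from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

# Load GSM8K and map it into the prompt/answer format GRPOTrainer expects.
dataset = load_dataset("openai/gsm8k", "main", split="train")

def to_prompt(example):
    return {
        "prompt": [
            {"role": "system", "content": "You are a step-by-step math tutor."},
            {"role": "user", "content": example["question"]},
        ],
        # GSM8K solutions end with "#### <answer>"; keep the gold answer for the reward.
        "answer": example["answer"].split("####")[-1].strip(),
    }

dataset = dataset.map(to_prompt)

def correctness_reward(completions, answer, **kwargs):
    # Assumed reward: 1.0 if the completion contains the gold final answer, else 0.0.
    rewards = []
    for completion, gold in zip(completions, answer):
        text = completion[0]["content"] if isinstance(completion, list) else completion
        rewards.append(1.0 if gold in text else 0.0)
    return rewards

training_args = GRPOConfig(
    output_dir="Qwen2.5-3B-GRPO-Math-GSM8K",
    num_generations=8,           # group size used for group-relative advantages (assumed)
    max_completion_length=512,
    per_device_train_batch_size=8,
    learning_rate=1e-6,
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-3B-Instruct",   # assumed base model
    reward_funcs=correctness_reward,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()

GRPO samples a group of completions per prompt and uses the group's mean reward as the baseline, so no separate value model is needed; num_generations controls that group size.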