SARM: Interpretable Reward Model via Sparse Autoencoder

Reward Bench V2 evaluation

[Official results in progress]

SARM inference demo


import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


def get_reward_score(model, prompt, response) -> float:
    """
    Given a prompt and a response, return the scalar reward score computed by the SARM model.
    Uses the `tokenizer` loaded below.
    """
    # Build a single-turn conversation and tokenize it with the model's chat template.
    messages = [{"role": "user", "content": prompt}, {"role": "assistant", "content": response}]
    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to(model.device)

    # The sequence-classification head outputs a single logit, used as the reward.
    with torch.no_grad():
        score = model(input_ids).logits.item()

    return round(score, 4)


device = "cuda"
path = "Schrieffer/Llama-SARM-4B"

tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForSequenceClassification.from_pretrained(
    path, 
    device_map=device, 
    trust_remote_code=True, 
    torch_dtype=torch.bfloat16
)

# Each pair contrasts an on-topic response with an irrelevant or low-effort one,
# so the first response in each pair should receive the higher score.
examples = [
    ["What is the capital of France?", "The capital of France is Paris."],
    ["What is the capital of France?", "Berlin is a large city in Germany."],
    ["Write a short poem about the moon.", "Silver orb in velvet night, / Casting shadows, soft and light. / Silent watcher, distant, bright, / Guiding dreams till morning's light."],
    ["Write a short poem about the moon.", "The moon is a rock."],
]

for example in examples:
    print("example".center(80, '='))
    print("Question:\n" + example[0])
    print("Answer:\n" + example[1])
    print("Score:", get_reward_score(model, example[0], example[1]))