Spaces: Running on Zero

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import torch

# Cache to avoid reloading the model
model_cache = {}

def load_model(model_id):
    if model_id in model_cache:
        return model_cache[model_id]
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda" if torch.cuda.is_available() else "cpu")
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    model_cache[model_id] = generator
    return generator
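
# Repeated calls for the same ID reuse the cached pipeline, e.g.:
#   gen = load_model("distilgpt2")  # first call downloads and loads the model
#   gen = load_model("distilgpt2")  # second call is served from model_cache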

def format_prompt(item, source):
    if source == "cais/mmlu":
        c = item["choices"]
        prompt = f"{item['question']}\nA. {c[0]}\nB. {c[1]}\nC. {c[2]}\nD. {c[3]}\nAnswer:"
        # cais/mmlu stores the answer as an integer index into `choices`
        answer = "ABCD"[item["answer"]]
    elif source == "TIGER-Lab/MMLU-Pro":
        # MMLU-Pro lists up to ten options in `options`; `answer` is already a letter
        options = "\n".join(f"{letter}. {opt}" for letter, opt in zip("ABCDEFGHIJ", item["options"]))
        prompt = f"{item['question']}\n{options}\nAnswer:"
        answer = item["answer"]
    elif source == "cais/hle":
        # cais/hle keeps any answer choices inside the question text itself
        prompt = f"{item['question']}\nAnswer:"
        answer = item["answer"]
    else:
        prompt, answer = "", ""
    return prompt, answer
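
# Illustrative example with a made-up cais/mmlu-style row (not real dataset content):
#   item = {"question": "What is 2 + 2?", "choices": ["3", "4", "5", "6"], "answer": 1}
#   format_prompt(item, "cais/mmlu")
#   -> ("What is 2 + 2?\nA. 3\nB. 4\nC. 5\nD. 6\nAnswer:", "B")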

def evaluate(model_id, dataset_name, sample_count):
    gen = load_model(model_id)
    # cais/mmlu requires a config name; "all" pools every subject
    if dataset_name == "cais/mmlu":
        dataset = load_dataset(dataset_name, "all")
    else:
        dataset = load_dataset(dataset_name)
    if "test" in dataset:
        dataset = dataset["test"]
    else:
        dataset = dataset[list(dataset.keys())[0]]
    dataset = dataset.shuffle(seed=42).select(range(min(int(sample_count), len(dataset))))
    correct = 0
    results = []
    for item in dataset:
        prompt, answer = format_prompt(item, dataset_name)
        output = gen(prompt, max_new_tokens=10, do_sample=False)[0]["generated_text"]
        # The pipeline echoes the prompt, so score only the generated continuation
        continuation = output[len(prompt):] if output.startswith(prompt) else output
        # Take the first capital letter A-J in the continuation as the model's choice
        output_letter = next((char for char in continuation if char in "ABCDEFGHIJ"), None)
        is_correct = output_letter == answer
        correct += is_correct
        results.append((prompt, continuation.strip(), answer, output_letter, is_correct))
    accuracy = correct / len(dataset) * 100
    return f"Accuracy: {accuracy:.2f}%", results
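
# Rough usage sketch (placeholder model ID; the printed accuracy is illustrative only):
#   score, details = evaluate("your-org/your-model", "cais/mmlu", 5)
#   print(score)  # e.g. "Accuracy: 60.00%"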

def run(model_id, benchmark, sample_count):
    score, details = evaluate(model_id, benchmark, sample_count)
    formatted = "\n\n".join([
        f"### Question:\n{q}\n\n**Model Answer:** {o}\n**Expected:** {a}\n**Predicted:** {g}\n**Correct:** {c}"
        for q, o, a, g, c in details
    ])
    return score, formatted

with gr.Blocks(
    css="body {font-family: Inter, sans-serif; padding: 1em; max-width: 900px; margin: auto;}",
    analytics_enabled=False,
) as demo:
    gr.Markdown("""
# 🤗 LLM Benchmark Evaluator

Easily evaluate your Hugging Face-hosted model on:

- **MMLU** (`cais/mmlu`)
- **MMLU-Pro** (`TIGER-Lab/MMLU-Pro`)
- **Humanity's Last Exam** (`cais/hle`)

Enter your model ID, pick a benchmark, and hit evaluate.
""")

    with gr.Row():
        model_id = gr.Textbox(label="Your Hugging Face Model ID", placeholder="e.g., your-org/your-model")
        benchmark = gr.Dropdown(
            label="Choose Benchmark",
            choices=["cais/mmlu", "TIGER-Lab/MMLU-Pro", "cais/hle"],
            value="cais/mmlu",
        )
        sample_count = gr.Slider(label="Number of Samples", minimum=1, maximum=100, value=10, step=1)

    run_button = gr.Button("🚀 Run Evaluation")
    acc_output = gr.Textbox(label="Benchmark Accuracy", interactive=False)
    detail_output = gr.Textbox(label="Evaluation Details", lines=20, interactive=False)

    run_button.click(run, inputs=[model_id, benchmark, sample_count], outputs=[acc_output, detail_output])

demo.launch()
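
# To try this Space locally (assuming the file is saved as app.py):
#   pip install gradio transformers datasets torch
#   python app.py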