Update README.md
README.md (CHANGED)
@@ -25,7 +25,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 # load the model and tokenizer
 model = AutoModelForCausalLM.from_pretrained(
     'jnanliu/LiveMath-Judge',
-    device_map=
+    device_map='auto',
     torch_dtype=torch.bfloat16,
 )
 tokenizer = AutoTokenizer.from_pretrained(
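Note: the snippet above assumes `import torch` appears earlier in the README (it is needed for `torch.bfloat16`), and `device_map='auto'` only works when the `accelerate` package is installed. A minimal preamble, stated as an assumption rather than as part of this commit:

```python
# Assumed preamble (not part of this commit): torch provides torch.bfloat16,
# and device_map='auto' requires accelerate (pip install transformers accelerate).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
```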
@@ -66,12 +66,12 @@ Analysis:
 conversations = [
     {'role': 'user', 'content': prompt.format(question=question, gold_answer=golden_answer, answer=generated_answer)}
 ]
-inputs = tokenizer.apply_chat_template(conversations, return_tensors=
+inputs = tokenizer.apply_chat_template(conversations, return_dict=True, return_tensors='pt')

 # do inference
 pred = model.generate(
-    input_ids=inputs[
-    attention_mask=inputs[
+    input_ids=inputs['input_ids'].to(model.device),
+    attention_mask=inputs['attention_mask'].to(model.device),
     num_return_sequences=1,
 )[0].cpu().tolist()
 response = tokenizer.decode(pred, skip_special_tokens=True)
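Two caveats on the updated snippet: `generate` falls back to a very short default length when no limit is passed (`return_dict=True` is also needed so `inputs` can be indexed by key, as reflected above), and decoding `pred` from position 0 returns the prompt together with the verdict. A hedged variant that bounds the output and strips the prompt tokens; the `max_new_tokens` value and `add_generation_prompt=True` are assumptions, not part of this commit:

```python
# Sketch only: bound generation and decode just the newly generated tokens.
inputs = tokenizer.apply_chat_template(
    conversations,
    add_generation_prompt=True,  # assumption: most chat templates need this at inference
    return_dict=True,            # return input_ids and attention_mask together
    return_tensors='pt',
)
input_ids = inputs['input_ids'].to(model.device)
pred = model.generate(
    input_ids=input_ids,
    attention_mask=inputs['attention_mask'].to(model.device),
    max_new_tokens=512,          # assumption: a generous cap for a judge verdict
    num_return_sequences=1,
)[0]
# Slice off the prompt so only the judge's output is decoded.
response = tokenizer.decode(pred[input_ids.shape[-1]:], skip_special_tokens=True)
```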
@@ -84,8 +84,8 @@ response = tokenizer.decode(pred, skip_special_tokens=True)
 The following are the mG-Pass@16 results of Qwen2.5-72B-Instruct-as-Judge and LiveMath-Judge-as-Judge across 10 models.
 |model| Qwen2.5-72B-Instruct | LiveMath-Judge |
 | -- | -- | -- |
-| Qwen2.5-7B-Instruct | 26.45 |
-| Qwen2.5-Math-7B-Instruct | 37.91 |
+| Qwen2.5-7B-Instruct | 26.45 | 28.17 |
+| Qwen2.5-Math-7B-Instruct | 37.91 | 39.54 |
 | Llama-3.1-8B-Instruct | 10.43 | 10.41 |
 | Llama-3.1-70B-Instruct | 21.37 | 22.12 |
 | Llama-3.3-70B-Instruct | 27.36 | 27.23 |
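For the rows visible in this hunk, the 7B LiveMath-Judge stays within roughly 1.7 mG-Pass@16 points of the much larger Qwen2.5-72B-Instruct judge.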