from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import argparse
import json
from tqdm import tqdm
# Command-line arguments select which slice of the input file this worker
# processes, so the dataset can be sharded across multiple parallel jobs.
parser = argparse.ArgumentParser(
    description='Extract foundational concepts from questions with an LLM.')
# BUG FIX: the original help text for both flags said "模型路径" ("model
# path"), which is wrong — these are line-slice indices, not paths.
parser.add_argument('--start', type=int,
                    help='start index (inclusive) of the input lines to process')
parser.add_argument('--end', type=int,
                    help='end index (exclusive) of the input lines to process')
args = parser.parse_args()
# --- Generation configuration -------------------------------------------
modelpath = "/dev_data/swzhang/model/LLM-Research/Meta-Llama-3-8B-Instruct/"
num_concepts = 5
# NOTE(review): this system prompt looks unrelated to the educational
# assessment task below — confirm it is intentional.
system_prompt = "You are a pirate chatbot who always responds in pirate speak!"
sampling_params = SamplingParams(temperature=0.5, top_p=0.9, max_tokens=16000)

# --- Tokenizer / model loading ------------------------------------------
# The tokenizer is only used to render chat templates; vLLM does the
# actual generation.
tokenizer = AutoTokenizer.from_pretrained(modelpath, trust_remote_code=True)
llm = LLM(
    modelpath,
    dtype='float16',
    tensor_parallel_size=1,
    trust_remote_code=True,
    enforce_eager=True,
)
# Read this worker's slice of the input file (JSONL: one record per line)
# and build a chat-formatted concept-extraction prompt for each question.
# NOTE: `lines` must stay bound — the output loop re-parses it to attach
# the generated text back onto each source record.
with open('right_data_with_gold_shuffle.json', 'r', encoding='utf-8') as f:
    lines = f.readlines()[args.start:args.end]

prompts = []
for line in tqdm(lines):
    one_data = json.loads(line)
    problem = one_data['question']
    # BUG FIX: the original prompt contained the garbled phrase
    # "Break downandidentify"; restored to "Break down and identify".
    prompt = f""" As an expert in educational assessment, analyze this problem:

{problem}

Break down and identify {num_concepts} foundational concepts being tested. List these knowledge

points that:

• Are core curriculum concepts typically taught in standard courses,

• Are precise and measurable (not vague like "understanding math"),

• Are essential building blocks needed to solve this problem,

• Represent fundamental principles rather than problem-specific techniques.

Think through your analysis step by step, then format your response as a Python code snippet

containing a list of {num_concepts} strings, where each string clearly describes one fundamental

knowledge point."""
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}]
    one_prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False)
    prompts.append(one_prompt)
# Run batched generation, then attach each model output back onto its
# source record. vLLM returns one RequestOutput per prompt, in order.
outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)

concepts_data = []
# IDIOM FIX: replaced `for i in range(len(outputs))` with zip over the
# parallel lists — same pairing, no manual indexing.
for line, request_output in zip(lines, outputs):
    one_data = json.loads(line)
    # Only one sample is generated per prompt, so take the first completion.
    one_data['concepts_output'] = request_output.outputs[0].text
    concepts_data.append(one_data)
# Persist the annotated records as JSON Lines (one record per line),
# keeping non-ASCII characters verbatim. The filename encodes the slice
# so shards from parallel workers do not collide.
out_path = f'get_concept_right_{args.start}_{args.end}.json'
with open(out_path, 'w', encoding='utf-8') as f:
    f.writelines(json.dumps(record, ensure_ascii=False) + "\n"
                 for record in concepts_data)