from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import argparse
import json
from tqdm import tqdm

# Shard the dataset across runs: each invocation processes input lines [start, end).
parser = argparse.ArgumentParser()
parser.add_argument('--start', type=int, help='start index of the data shard')
parser.add_argument('--end', type=int, help='end index of the data shard')
args = parser.parse_args()

modelpath = "/dev_data/swzhang/model/LLM-Research/Meta-Llama-3-8B-Instruct/"

# Initialize the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(modelpath, trust_remote_code=True)

# Input the model name or path. Can be GPTQ or AWQ models.
llm = LLM(modelpath,
          dtype='float16',
          tensor_parallel_size=1,
          trust_remote_code=True,
          enforce_eager=True)

sampling_params = SamplingParams(temperature=0.5, top_p=0.9, max_tokens=16384)

num_concepts = 5
system_prompt = "You are a pirate chatbot who always responds in pirate speak!"

# Read only this shard's slice of the input file (one JSON object per line).
with open('right_data_with_gold_shuffle.json', 'r', encoding='utf-8') as f:
    lines = f.readlines()[args.start:args.end]

# Build a chat-formatted prompt for every question in the shard.
prompts = []
for line in tqdm(lines):
    one_data = json.loads(line)
    problem = one_data['question']
    prompt = f"""
As an expert in educational assessment, analyze this problem:

{problem}

Break down and identify {num_concepts} foundational concepts being tested. List these knowledge points that:
• Are core curriculum concepts typically taught in standard courses,
• Are precise and measurable (not vague like "understanding math"),
• Are essential building blocks needed to solve this problem,
• Represent fundamental principles rather than problem-specific techniques.

Think through your analysis step by step, then format your response as a Python code snippet containing a list of {num_concepts} strings, where each string clearly describes one fundamental knowledge point."""
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt}]
    one_prompt = tokenizer.apply_chat_template(messages,
                                               add_generation_prompt=True,
                                               tokenize=False)
    prompts.append(one_prompt)

# Generate concept analyses for the whole shard in a single batched call.
outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)

# Attach each model output to its source record.
concepts_data = []
for i in range(len(outputs)):
    one_data = json.loads(lines[i])
    output = outputs[i].outputs[0].text
    one_data['concepts_output'] = output
    concepts_data.append(one_data)

# Write the shard's results as JSON Lines, named by its index range.
with open(f'get_concept_right_{args.start}_{args.end}.json', 'w', encoding='utf-8') as f:
    for one in concepts_data:
        f.write(json.dumps(one, ensure_ascii=False) + "\n")
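
# Example invocation (sketch; the script name `get_concepts.py` is hypothetical).
# Pick non-overlapping --start/--end ranges so the shards together cover the
# whole input file, e.g. one shard per GPU or per process:
#
#   python get_concepts.py --start 0 --end 1000
#   python get_concepts.py --start 1000 --end 2000
#
# Each run writes its own get_concept_right_<start>_<end>.json shard, which can
# simply be concatenated afterwards since the output is JSON Lines.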