zswzswzsw committed on
Commit
85a2888
·
verified ·
1 Parent(s): 523c28f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ right_data_with_gold_shuffle.json filter=lfs diff=lfs merge=lfs -text
get_concept.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Extract the foundational concepts tested by each problem, using
Meta-Llama-3-8B-Instruct served through vLLM.

Reads one shard (lines ``--start`` .. ``--end``) of the JSONL file
``right_data_with_gold_shuffle.json``, prompts the model to list the core
knowledge points behind each record's ``question``, and writes the annotated
records to ``get_concept_right_<start>_<end>.json`` (one JSON object per line).
"""
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import argparse
import json
from tqdm import tqdm

parser = argparse.ArgumentParser()
# Fix: the original help text said "model path" (模型路径) for both flags —
# they are actually the line-index boundaries of the data shard to process.
parser.add_argument('--start', type=int, help='start line index of the data shard')
parser.add_argument('--end', type=int, help='end line index of the data shard (exclusive)')
args = parser.parse_args()
# Run the data in shards so several processes can cover the file in parallel.

modelpath = "/dev_data/swzhang/model/LLM-Research/Meta-Llama-3-8B-Instruct/"

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained(modelpath, trust_remote_code=True)

# Input the model name or path. Can be GPTQ or AWQ models.
llm = LLM(modelpath, dtype='float16', tensor_parallel_size=1, trust_remote_code=True, enforce_eager=True)
sampling_params = SamplingParams(temperature=0.5, top_p=0.9, max_tokens=16000)
num_concepts = 5
system_prompt = "You are a pirate chatbot who always responds in pirate speak!"

# Input is JSONL (one JSON object per line); slice out this shard only.
with open('right_data_with_gold_shuffle.json', 'r', encoding='utf-8') as f:
    lines = f.readlines()[args.start:args.end]

# Build one chat-formatted prompt per record.
prompts = []
for line in tqdm(lines):
    one_data = json.loads(line)
    problem = one_data['question']
    # Fix: original prompt read "Break downandidentify" — missing spaces.
    prompt = f""" As an expert in educational assessment, analyze this problem:
{problem}
Break down and identify {num_concepts} foundational concepts being tested. List these knowledge
points that:
• Are core curriculum concepts typically taught in standard courses,
• Are precise and measurable (not vague like "understanding math"),
• Are essential building blocks needed to solve this problem,
• Represent fundamental principles rather than problem-specific techniques.
Think through your analysis step by step, then format your response as a Python code snippet
containing a list of {num_concepts} strings, where each string clearly describes one fundamental
knowledge point."""
    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": prompt}]
    one_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    prompts.append(one_prompt)

# Single batched generation call for the whole shard (vLLM batches internally).
outputs = llm.generate(prompts=prompts, sampling_params=sampling_params)

# Attach each model response to its source record; outputs[i] corresponds to
# prompts[i], which was built from lines[i], so pair by position with zip.
concepts_data = []
for line, output in zip(lines, outputs):
    one_data = json.loads(line)
    one_data['concepts_output'] = output.outputs[0].text
    concepts_data.append(one_data)

with open(f'get_concept_right_{args.start}_{args.end}.json', 'w', encoding='utf-8') as f:
    for one in concepts_data:
        f.write(json.dumps(one, ensure_ascii=False) + "\n")
right_data_with_gold_shuffle.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e530e86d7468ea786018615a1f5db951785f7c2c1e89fc567b1ae8504ca0f60d
3
+ size 1666288201