Spaces:

loveblairsky
/

LLM-model-cards

Runtime error

LLM-model-cards / Sample.py

Blair Yang

Paraphraser

336c585 over 1 year ago

4.69 kB

	import random
	import numpy as np
	import os
	import json
	from Config import *
	import pandas as pd
	from models import HFAPIModel

	def format_card_str(card):
	    """Turn an evaluation card mapping into a list of bullet-point strings.

	    Args:
	        card: Mapping of criterion name to either a plain string, or a dict
	            with the keys ``'overview'``, ``'thinking_pattern'``,
	            ``'strength'`` and ``'weakness'``.

	    Returns:
	        A list with one newline-terminated text block per retained criterion.

	    Raises:
	        ValueError: If a value is neither a ``str`` nor a ``dict``.
	        KeyError: If a dict value lacks one of the four expected keys.
	    """
	    formatted = []
	    for name, value in card.items():
	        if isinstance(value, str):
	            formatted.append(f'- {name}: {value}\n')
	            continue
	        if not isinstance(value, dict):
	            raise ValueError(f'Unknown type: {type(value)}')

	        block = f"- {name}: {value['overview']}\n"
	        thinking = value['thinking_pattern']
	        strength = value['strength']
	        weakness = value['weakness']
	        # NOTE(review): when all three detail fields are empty the criterion
	        # is dropped entirely, overview included -- confirm this is intended.
	        if thinking + strength + weakness == '':
	            continue

	        block += f" - Thinking Patterns: {thinking}\n"
	        block += f" - Strength: {strength}\n"
	        block += f" - Weakness: {weakness}\n"
	        formatted.append(block)
	    return formatted

	def format_qa_entry(qa):
	    """Render a multiple-choice QA record as display text.

	    Args:
	        qa: Mapping with the keys ``'question'`` (text), ``'choices'``
	            (sequence of choice texts) and ``'ground truth'`` (zero-based
	            index of the correct choice).

	    Returns:
	        The question, the lettered choices (``A.``, ``B.``, ...) one per
	        line, and a ``Ground Truth: <letter>`` line, each section separated
	        by a blank line.

	    Raises:
	        KeyError: If any of the three keys is missing.
	    """
	    question = qa['question']
	    choices = qa['choices']
	    ground_truth = qa['ground truth']

	    # Choices are indexed 0..n-1; chr(65 + i) maps them to 'A', 'B', ...
	    # Joining avoids building the string with += and trimming a trailing
	    # newline afterwards.
	    choice_str = '\n'.join(
	        f"{chr(65 + i)}. {c}" for i, c in enumerate(choices)
	    )

	    return f"{question}\n\n{choice_str}\n\nGround Truth: {chr(65 + ground_truth)}"


	def sample_random_entry(dataset='', topic='', model='', n=1):
	    """Draw one random evaluation card and one QA entry and prepare them for display.

	    Any of ``dataset``, ``topic`` or ``model`` left as ``''`` is chosen at
	    random from ``DATASETS``, ``TOPICS[dataset]`` and ``MODELS`` respectively
	    (presumably provided by the ``Config`` star import -- confirm).

	    Args:
	        dataset: Dataset name, or ``''`` to pick one at random.
	        topic: Topic name within the dataset, or ``''`` to pick one at random.
	        model: Model name, or ``''`` to pick one at random.
	        n: Accepted but not used by this function; exactly one entry is
	            sampled per call.

	    Returns:
	        A ``(display_dict, info_dict)`` pair as produced by
	        ``process_for_display``, with ``info_dict['index']`` set to the
	        index returned by ``sample_QA_entry``.
	    """
	    # Resolve the selectors in this order: the topic pool depends on the dataset.
	    dataset = random.choice(DATASETS) if dataset == '' else dataset
	    topic = random.choice(TOPICS[dataset]) if topic == '' else topic
	    model = random.choice(MODELS) if model == '' else model

	    cards = sample_card(dataset, topic, model)
	    qa_record, row_index = sample_QA_entry(dataset, topic, model)

	    display_dict, info_dict = process_for_display(cards, qa_record)
	    info_dict['index'] = row_index
	    return display_dict, info_dict


	def process_for_display(card_lst, qa):
	    """Build the display payload and the accompanying metadata for one QA record.

	    Args:
	        card_lst: List of formatted card strings, passed on to ``select_entry``.
	        qa: QA record mapping; must contain ``'question'`` and ``'choices'``.

	    Returns:
	        ``(display_dict, info_dict)`` where ``display_dict`` has the keys
	        ``'card'`` (result of ``select_entry``) and ``'qa'`` (formatted
	        question text), and ``info_dict`` is a shallow copy of ``qa``
	        without ``'question'`` and ``'choices'``.
	    """
	    qa_text = format_qa_entry(qa)
	    card_summary = select_entry(qa_text, card_lst)
	    display_dict = {'card': card_summary, 'qa': qa_text}

	    # Copy first so the caller's record is left untouched, then drop the
	    # fields that are already contained in the formatted question text.
	    info_dict = dict(qa)
	    for shown_key in ('question', 'choices'):
	        info_dict.pop(shown_key)

	    return display_dict, info_dict



	def select_entry(qa_entry, card_lst):
	    """Ask an ``HFAPIModel`` to condense the evaluation card for one question.

	    Args:
	        qa_entry: Formatted question text that is embedded in the prompt.
	        card_lst: List of card strings; they are joined with newlines and
	            embedded in the prompt.

	    Returns:
	        The model's response with every newline doubled.
	    """
	    system_prompt = '''
	Your task is to effectively condense the essential details from the student's evaluation card that are most relevant to predicting the correctness of their answer to a question.
	Limit your paraphrase to 50-100 words, focusing on distilling the key observations and outcomes that are directly pertinent to the inquiry.
	It's crucial to present an informative, unbiased summary that retains the integrity of the original card's information.
	Your goal is to craft a paraphrase that enhances the user's ability to accurately gauge the student's response, by emphasizing relevant insights and conclusions without altering the core facts.
	'''

	    card_text = '\n'.join(card_lst)
	    prompt = f'''
	## Question:
	{qa_entry}

	## Evaluation Card:
	{card_text}

	Again, your task is not to answer the question, but summarize the student's ability in answering the question! Only 100 words max! Use bullet points.
	Only relevant information to the question is needed.
	'''

	    # Known model identifiers; only the 'mistral' entry is used below.
	    hf_model_ids = {
	        'mixtral': 'mistralai/Mixtral-8x7B-Instruct-v0.1',
	        'mistral': 'mistralai/Mistral-7B-Instruct-v0.2',
	    }
	    summarizer = HFAPIModel(
	        system_prompt=system_prompt,
	        model_name=hf_model_ids['mistral'],
	    )

	    # Newlines are doubled, presumably so each line renders as its own
	    # paragraph in the UI -- confirm against the caller.
	    return summarizer(prompt).replace('\n', '\n\n')



	def sample_card(dataset='', topic='', model='', card_cnt=2):
	    """Load one randomly chosen evaluation card and format it.

	    Args:
	        dataset: Dataset directory name under ``dataset/``.
	        topic: Topic name; used both as a directory and as a file prefix.
	        model: Model name embedded in the card file name.
	        card_cnt: Number of card variants to choose from; the variant index
	            is drawn uniformly from ``0 .. card_cnt - 1``.

	    Returns:
	        The list of formatted card strings produced by ``format_card_str``.

	    Raises:
	        ValueError: If ``card_cnt`` is less than 1 (raised by ``random.randint``).
	        FileNotFoundError: If the selected card file does not exist.
	    """
	    card_index = random.randint(0, card_cnt - 1)
	    path = f'dataset/{dataset}/cards/{topic}/{topic}_{model}_{card_index}.jsonl'

	    # Despite the .jsonl suffix the file is parsed as one JSON document.
	    # The encoding is pinned to UTF-8 so decoding does not depend on the
	    # platform's default locale encoding.
	    with open(path, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    return format_card_str(data)


	def sample_QA_entry(dataset='', topic='', model='', n=1):
	    """Pick one random QA record for ``model`` from a topic's test file.

	    The file ``dataset/{dataset}/{topic}/{topic}_test.jsonl`` is read as JSON
	    Lines (one JSON object per line), filtered to the rows whose ``'model'``
	    field equals ``model``, and a single row is sampled.

	    Args:
	        dataset: Dataset directory name under ``dataset/``.
	        topic: Topic name; used both as a directory and as a file prefix.
	        model: Value the ``'model'`` field must equal.
	        n: Accepted but not used; exactly one record is returned.

	    Returns:
	        ``(sample, sample_idx)`` where ``sample`` is the chosen record as a
	        dict and ``sample_idx`` is its row label in the table built from all
	        records of the file (i.e. before filtering by model).

	    Raises:
	        FileNotFoundError: If the test file does not exist.
	        KeyError: If the records have no ``'model'`` field.
	        ValueError: If no record matches ``model``.
	    """
	    path = f'dataset/{dataset}/{topic}/{topic}_test.jsonl'

	    # Iterate the file lazily and skip blank lines (e.g. a trailing empty
	    # line), which json.loads would otherwise reject.
	    with open(path, 'r', encoding='utf-8') as f:
	        records = [json.loads(line) for line in f if line.strip()]

	    df = pd.DataFrame(records)

	    # Keep only the rows produced by the requested model.
	    matches = df[df['model'] == model]
	    if matches.empty:
	        # pandas would raise a ValueError about the sample size here; raise
	        # the same exception type with a message that names the real cause.
	        raise ValueError(f'No QA entries for model {model!r} in {path}')

	    picked = matches.sample(1)
	    sample_idx = picked.index[0]
	    sample = picked.to_dict(orient='records')[0]
	    return sample, sample_idx

	if __name__ == '__main__':
	    # Manual smoke run when executed as a script; the result is discarded.
	    # sample_random_entry accepts n but does not use it.
	    sample_random_entry(n=5)