import json
import os
import os.path as osp

import numpy as np
import pandas as pd

from .image_base import ImageBaseDataset
from .utils import build_judge
from ..utils import track_progress_rich
from ..smp import load, dump, d2df, toliststr


def preprocess(str1):
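    """Trim a model prediction to its outermost JSON object and unescape it
    so that ``json.loads`` has a chance to parse it."""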
    # Trim to the outermost JSON object, if one is present
    if 0 <= str1.find("{") < str1.rfind("}"):
        str1 = str1[str1.find("{"): str1.rfind("}") + 1]
    # Convert escaped newlines to real ones, then drop the remaining backslashes
    str2 = str1.replace("\\n", "\n")
    str2 = str2.replace("\\", "")
    return str2


def transfer(str1):
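    """Convert a numeric answer string to float, multiplying by pi when the
    value carries a trailing 'π' symbol (e.g. '2π' -> 2 * pi)."""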
if "\u03c0" in str1:
strs = str1.split('\u03c0')
str1 = strs[0]
return float(str1) * np.pi
else:
return float(str1)
def parse_answer(answer, answer_type="multiple choice"):
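    """Normalize a short answer according to `answer_type`.

    Returns a (succeed, parsed) tuple: 'float' answers are converted via
    transfer(), 'multiple choice' answers are reduced to a single letter in
    'ABCDE', and any other type is passed through unchanged.
    """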
if answer_type == "float":
if answer.isdigit():
return True, float(answer)
else:
            # Keep only the leading token, dropping trailing units (e.g. '3.5 cm')
            parts = answer.split(' ')
            answer = parts[0]
            try:
                answer = transfer(answer)
                return True, answer
            except Exception:
                return False, None
elif answer_type == "multiple choice":
if len(answer) == 1:
return True, answer.upper()
else:
in_flag = [ch in answer.upper() for ch in 'ABCDE']
if sum(in_flag) == 1:
for ch in 'ABCDE':
if ch in answer.upper():
return True, ch
return False, None
else:
return True, answer
def DynaMath_auxeval(model, line):
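    """Extract and grade the short answer for a single DynaMath record.

    First tries to read the requested JSON from the prediction; on failure,
    asks the judge `model` to reformat the free-form answer. Returns a dict
    with keys 'parse', 'extracted', and 'correct'.
    """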
pred = line['prediction']
pred = preprocess(pred)
    succeed, short_answer = None, None
    try:
        dj = json.loads(pred, strict=False)
        short_answer = dj.get("short answer")
        assert short_answer is not None
        succeed, short_answer = parse_answer(short_answer, answer_type=line['answer_type'])
        assert succeed
    except Exception:
# Failed to parse the JSON, use an auxiliary LLM to get the short answer
if line['answer_type'] == 'multiple choice':
inst = "Output the corresponing choice option, such as 'A', 'B', 'C', 'D', in a single line."
elif line['answer_type'] == 'float':
inst = "Output a three-digit floating-point number in a single line."
else:
inst = (
"Output a short answer in a single line. Any float numbers in the answer "
"should be formatted as three-digit floating-point numbers."
)
prompt = f"Free-form answer: {pred}\nInstruction: {inst}"
        # Try to parse the free-form prediction directly first; only query the
        # judge model if that fails
        response = pred
        succeed, short_answer = parse_answer(response, line['answer_type'])
if not succeed:
response = model.generate(prompt)
succeed, short_answer = parse_answer(response, line['answer_type'])
if line['answer_type'] == 'float':
        if succeed:
            # Accept float answers within an absolute tolerance of 0.001
            diff = float(short_answer) - float(line['answer'])
            return dict(parse=True, extracted=short_answer, correct=abs(diff) <= 0.001)
        else:
            return dict(parse=False, extracted=None, correct=False)
elif line['answer_type'] == 'multiple choice':
if succeed:
return dict(parse=True, extracted=short_answer, correct=(short_answer == line['answer']))
        else:
            # Parsing failed: credit the answer if the gold choice letter appears
            # at the very start of the prediction
            correct = line['answer'] in pred[:3].upper()
            return dict(parse=False, extracted=None, correct=correct)
    else:
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer.lower() in line['answer'].lower()))
        else:
            return dict(parse=False, extracted=None, correct=False)


class Dynamath(ImageBaseDataset):
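    """DynaMath, a benchmark built from seed math questions with multiple
    generated variants. Each record carries a seed id (`qid`) and a variant
    id (`varid`); `evaluate` reports both average accuracy and worst-case
    accuracy across the variants of each seed question."""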
TYPE = 'VQA'
DATASET_URL = {'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv'}
DATASET_MD5 = {'DynaMath': 'b8425ad9a7114571fc9366e013699494'}
GUIDE = """
## Answer Instruction Please provide an answer to the question outlined above. Your response should adhere \
to the following JSON format, which includes two keys: 'solution' and 'short answer'. The 'solution' key can contain \
detailed steps needed to solve the question, and the 'short answer' key should provide a concise response. {INST}
Example of expected JSON response format:
"""
EXAMPLE = {
"solution": "[Detailed step-by-step explanation]",
"short answer": "[Concise Answer]"
}
TEXT_EXAMPLE = json.dumps(EXAMPLE, indent=4)
# Given one data record, return the built prompt (a multi-modal message), can override
def build_prompt(self, line):
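        """Return a multi-modal message: the question image(s) followed by the
        question text, the JSON answer instruction, and an example response."""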
if isinstance(line, int):
line = self.data.iloc[line]
if self.meta_only:
tgt_path = toliststr(line['image_path'])
else:
tgt_path = self.dump_image(line)
prompt = f"## Question\n {line['question']}"
if line['answer_type'] == 'multiple choice':
inst = "Provide the corresponing choice option in the 'short answer' key, such as 'A', 'B', 'C', or 'D'."
elif line['answer_type'] == 'float':
inst = "Format the answer as a three-digit floating-point number and provide it in the 'short answer' key."
else:
inst = "Float numbers in the answer should be formatted as three-digit floating-point numbers."
prompt = prompt + self.GUIDE.format(INST=inst) + self.TEXT_EXAMPLE
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def evaluate(self, eval_file, **judge_kwargs):
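        """Grade predictions in `eval_file` with an auxiliary judge LLM and
        return a score table with 'Average' and 'Worst Case' accuracy, overall
        and per subject / knowledge level."""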
judge_name = judge_kwargs.pop('model', 'gpt-4o-mini')
model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')
        nproc = judge_kwargs.pop('nproc', 6)
        # Resume from a cached pickle of per-item judge results, if present
        res = load(tmp_file) if osp.exists(tmp_file) else {}
        res = {k: v for k, v in res.items() if v is not None}
model.system_prompt = """\
You are a helpful assistant that helps me to format free-form answers into a short answer according to the instruction.
"""
if not osp.exists(storage):
data = load(eval_file)
lt = len(data)
            # Only judge the items that are not already in the cache
            payloads = [dict(model=model, line=data.iloc[i]) for i in range(lt) if data.iloc[i]['index'] not in res]
            keys = [idx for idx in data['index'] if idx not in res]
if len(keys):
results = track_progress_rich(DynaMath_auxeval, payloads, nproc=nproc, save=tmp_file, keys=keys)
for k, r in zip(keys, results):
res[k] = r
data['parse'] = [res[idx]['parse'] for idx in data['index']]
data['extracted'] = [res[idx]['extracted'] for idx in data['index']]
data['correct'] = [res[idx]['correct'] for idx in data['index']]
dump(data, storage)
data = load(storage)
# Calculate Average Accuracy
score_avg = {}
score_avg['Overall'] = np.mean(data['correct'])
subs = set(data['subject'])
for sub in subs:
data_sub = data[data['subject'] == sub]
score_avg[f'Subject-{sub}'] = np.mean(data_sub['correct'])
lvls = set(data['knowledge_level'])
for lvl in lvls:
data_lvl = data[data['knowledge_level'] == lvl]
score_avg[f'Level-{lvl}'] = np.mean(data_lvl['correct'])
# Calculate the Worst Case Accuracy
score_worst = {}
        # Keep one row per seed question (varid == 1) and mark it correct only
        # if every variant of that question was answered correctly
        data_worst = data[data['varid'] == 1].copy()
        qid2corr = {qid: True for qid in data_worst['qid']}
        lt = len(data)
        for i in range(lt):
            item = data.iloc[i]
            qid2corr[item['qid']] *= item['correct']
        data_worst['correct'] = [qid2corr[qid] for qid in data_worst['qid']]
score_worst['Overall'] = np.mean(data_worst['correct'])
subs = set(data_worst['subject'])
for sub in subs:
data_sub = data_worst[data_worst['subject'] == sub]
score_worst[f'Subject-{sub}'] = np.mean(data_sub['correct'])
lvls = set(data_worst['knowledge_level'])
for lvl in lvls:
data_lvl = data_worst[data_worst['knowledge_level'] == lvl]
score_worst[f'Level-{lvl}'] = np.mean(data_lvl['correct'])
d1 = {'Setting': 'Average'}
d1.update(score_avg)
d2 = {'Setting': 'Worst Case'}
d2.update(score_worst)
score = pd.concat([d2df(d1), d2df(d2)], ignore_index=True)
dump(score, score_file)
return score
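

# Minimal usage sketch (hypothetical: the class is normally driven by the
# VLMEvalKit runner, and the prediction file name below is an assumption):
#
#     dataset = Dynamath('DynaMath')
#     msgs = dataset.build_prompt(0)  # multi-modal message for the first record
#     score = dataset.evaluate('DynaMath_predictions.xlsx', model='gpt-4o-mini')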