import json
import os

import pandas as pd

from .image_base import ImageBaseDataset
from ..smp import *
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich


def generate_prompt(d):
    """Build the grading prompt sent to the judge model for a single MIA-Bench sample."""
    question = d['question']
    weights = eval(d['component_weight'])  # stringified list of per-component weights
    components = eval(d['components'])  # stringified list of instruction components
    num_of_component = int(d['num_of_component'])
    response = d['prediction']

    if num_of_component == 1:
        components = f"The first component is: '{components[0]}'. "
        score = f"The first component is worth: {weights[0]} scores. "
    elif num_of_component == 2:
        components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
        score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
    elif num_of_component == 3:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}'. "
        )
        score = (
            "The first, second, and third component is each worth "
            f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
        )
    elif num_of_component == 4:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
        )
        score = (
            "The first, second, third, and fourth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
        )
    elif num_of_component == 5:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
            f"and the fifth component is '{components[4]}'. "
        )
        score = (
            "The first, second, third, fourth, and fifth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
        )

    return (
        "Here is an instruction for a multimodal LLM: '"
        f"{question}"
        "'. You need to grade if the response from the model follows each component of the instruction. "
        f"{components}"
        "The response is: '"
        f"{response}"
        "'. You need to score the response and be strict. The total score ranges from 0 to 10, "
        "depending on if the response follows the instruction. "
        f"{score}"
        "List scores of each component, and the total score in one sentence in this format: "
        "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )


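# A minimal usage sketch (illustrative only, not called anywhere in this module):
# generate_prompt expects a pandas row carrying the MIA-Bench columns shown below.
# All sample values here are hypothetical placeholders, not real benchmark entries.
#
#     row = pd.Series({
#         'question': 'Describe the image in exactly two sentences.',
#         'components': "['Describe the image', 'Use exactly two sentences']",
#         'component_weight': '[8, 2]',
#         'num_of_component': 2,
#         'prediction': 'A dog runs across a lawn. The sky is clear.',
#     })
#     judge_prompt = generate_prompt(row)  # grading prompt sent to the judge LLM

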
def process_rawscore(component_type, raw_score):
    """Parse the judge's leading sentence into per-component and total scores, each normalized to [0, 1]."""
    first_sentence = raw_score.split('.')[0].split(',')
    score_dict = {}
    for i in range(len(first_sentence) - 1):
        score_ = first_sentence[i].split(':')[1][1:].split('/')
        score = int(score_[0]) / int(score_[1])
        score_dict[component_type[i]] = score
    total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/')
    total_score = int(total_score_[0]) / int(total_score_[1])
    score_dict['total_score'] = total_score
    return score_dict


def get_score_dict(data, score_raw):
    """Aggregate parsed scores across all samples and return the mean score per component type."""
    cat_score_dict = {}
    for i in range(len(data)):
        try:
            cmp = data['component_type'][i][2:-2]
            cmp_list = cmp.split('\', \'')
            score_dict = process_rawscore(cmp_list, score_raw[i])
            for key, val in score_dict.items():
                if key not in cat_score_dict.keys():
                    cat_score_dict[key] = [val]
                else:
                    cat_score_dict[key].append(val)
        except Exception:
            # Skip samples whose judge output does not match the expected score format.
            pass
    cat_score_dict_average = {}
    for key, val in cat_score_dict.items():
        cat_score_dict_average[key] = sum(val) / len(val)
    return cat_score_dict_average


class MIABench(ImageBaseDataset):
    TYPE = 'VQA'

    DATASET_URL = {
        'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
    }
    DATASET_MD5 = {
        'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
    }

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        """Grade each prediction with an LLM judge and return mean scores per component type."""
        judge_name = judge_kwargs.pop('model', 'gpt-4o')
        model = build_judge(model=judge_name, **judge_kwargs)

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            num_samples = len(data)
            lines = [data.loc[i] for i in range(num_samples)]
            prompts = [generate_prompt(line) for line in lines]

            # Rebuild the index-to-image mapping from the original TSV, since the
            # prediction file may not carry the base64-encoded images.
            org_data = MIABench('MIA-Bench').data
            img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
            image_b64 = [img_map[idx] for idx in data['index']]
            indices = list(data['index'])
            mm_messages = [
                dict(message=[
                    dict(type='text', value=prompt),
                    dict(type='image', value=f'data:image/jpeg;base64,{b64}')
                ])
                for prompt, b64 in zip(prompts, image_b64)
            ]

            # Resume from the temporary cache if a previous run was interrupted.
            res = {}
            if osp.exists(tmp_file):
                res = load(tmp_file)
            jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
            job_keys = list(jobs.keys())
            job_vals = [jobs[k] for k in job_keys]

            resps = track_progress_rich(
                model.generate,
                job_vals,
                nproc=nproc,
                chunksize=nproc,
                keys=job_keys,
                save=tmp_file,
            )
            for k, resp in zip(job_keys, resps):
                res[k] = resp

            data['score_raw'] = [res[idx] for idx in indices]
            dump(data, storage)

        goresult = load(storage)
        results = get_score_dict(goresult, goresult['score_raw'])
        result_pth = storage.replace('.xlsx', '_score.csv')
        results_pd = pd.DataFrame.from_dict(list(results.items()))
        dump(results_pd, result_pth)
        return results
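

# A small, self-contained sketch of the score-parsing step, guarded so it never runs
# during normal use and intended to be executed as part of the package (e.g. via
# `python -m ...`). It is illustrative only: the component type names and the sample
# judge response below are hypothetical, not taken from MIA-Bench.
if __name__ == '__main__':
    sample_raw = (
        'score of component 1: 2/2, score of component 2: 6/8, '
        'total score: 8/10. The response follows both components.'
    )
    parsed = process_rawscore(['length', 'description'], sample_raw)
    print(parsed)  # expected: {'length': 1.0, 'description': 0.75, 'total_score': 0.8}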