temp_len: 5217, output_imgs[-1].shape[1]: 5233
Issue #74 — opened by HERIUN
I tested this model on mmmu_val.
On the validation set (all 900 examples) there is an error near index 740 (no shuffle):
example["id"] == "validation_Music_21"
import ast
import re
import string

from datasets import load_dataset, concatenate_datasets
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
# Define model path
model_path = "microsoft/Phi-4-multimodal-instruct"

# Load model and processor.
# NOTE: `device_map="cuda"` already places the weights on the GPU during
# `from_pretrained`, so the original trailing `.cuda()` call was redundant
# (and can conflict with accelerate's device dispatch) — removed.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    # if you do not use Ampere or later GPUs, change attention to "eager"
    _attn_implementation="flash_attention_2",
)
# The MMMU subject subsets that together form the benchmark.
data_subsets = [
    "Accounting",
    "Agriculture",
    "Architecture_and_Engineering",
    "Art",
    "Art_Theory",
    "Basic_Medical_Science",
    "Biology",
    "Chemistry",
    "Clinical_Medicine",
    "Computer_Science",
    "Design",
    "Diagnostics_and_Laboratory_Medicine",
    "Economics",
    "Electronics",
    "Energy_and_Power",
    "Finance",
    "Geography",
    "History",
    "Literature",
    "Manage",
    "Marketing",
    "Materials",
    "Math",
    "Mechanical_Engineering",
    "Music",
    "Pharmacy",
    "Physics",
    "Psychology",
    "Public_Health",
    "Sociology",
]
data_split = "validation"

# Load each subset's split and concatenate into a single dataset.
# Fix: the original defined `data_split` but hard-coded "validation" in the
# load_dataset call — use the variable so changing the split actually works.
loaded_dataset = [
    load_dataset("mmmu/mmmu", name=subset_name, split=data_split)
    for subset_name in data_subsets
]
dataset = concatenate_datasets(loaded_dataset)

# Jump directly to the region where the failure was observed
# (index ~740, example id "validation_Music_21"; no shuffling, order stable).
dataset = dataset.skip(740)
def phi4mminst_mmmu_preprocess(example):
    """Convert one MMMU example into Phi-4-multimodal chat format.

    Args:
        example: one MMMU row. Must contain "question", "options" (a string
            holding a Python list literal, e.g. "['red', 'blue']"), and an
            "image_n" column for every "<image n>" tag referenced by the
            question text.

    Returns:
        dict with:
            "messages": a single user turn whose content is the formatted
                question/options prompt with "<image n>" tags rewritten to
                Phi-4's "<|image_n|>" placeholders;
            "images": the example's images, in the order their placeholders
                appear in the prompt.
    """

    def transform_image_tags(text):
        """Rewrite "<image N>" tags to "<|image_N|>" placeholders.

        Example: "<image 1>" -> "<|image_1|>", "<image 23>" -> "<|image_23|>"

        Returns the transformed text and the number of substitutions made.
        """
        return re.subn(r"<image (\d+)>", r"<|image_\1|>", text)

    # "options" is stored as a Python list literal in a string.
    # Fix: parse it once with ast.literal_eval instead of calling eval()
    # twice on dataset-provided text (eval executes arbitrary code).
    option_values = ast.literal_eval(example["options"])
    options = dict(zip(string.ascii_uppercase, option_values))

    prompt = f'Question: {example["question"]}\n'
    if options:
        prompt += 'Options:\n'
        for letter, choice in options.items():
            prompt += f'{letter}. {choice}\n'
        prompt += 'Please select the correct answer from the options above. \n'
    else:
        prompt += 'Please select the correct answer from the options above. \n'
    prompt = prompt.rstrip()

    prompt, _image_tag_count = transform_image_tags(prompt)
    messages = [{'role': 'user', 'content': prompt}]

    # Collect images in the order their "<|image_n|>" tokens appear in the
    # prompt; the MMMU row stores them under keys like "image_1".
    images = [
        example[token.replace("<|", "").replace("|>", "")]
        for token in re.findall(r"<\|image_\d+\|>", prompt)
    ]

    return {
        "messages": messages,
        "images": images,
    }
# Run inference over the (post-skip) dataset.
# NOTE(review): `generate_kwargs` is referenced here but never defined in
# this file — it must be supplied elsewhere before this loop runs; confirm.
for i, example in tqdm(enumerate(dataset), total=len(dataset), desc="inference & evaluate"):
    # Fix: the original bound this to `input`, shadowing the builtin.
    sample = phi4mminst_mmmu_preprocess(example)
    sample["text"] = processor.apply_chat_template(
        sample["messages"],
        tokenize=False,
        add_generation_prompt=True,
    )
    # The processor consumes `text` + `images`; `messages` must not be passed.
    del sample["messages"]
    inputs = processor(**sample, padding=True, return_tensors='pt').to(model.device, model.dtype)
    outputs = model.generate(
        **inputs,
        **generate_kwargs,
    )