ONNX export of the voxreality/rgb_language_vqa model
Model inference example:
import onnxruntime as ort
from transformers import BlipProcessor
import numpy as np
from PIL import Image
# load the ONNX model
onnx_model_path = "models/rgb_language_vqa_onnx/model.onnx"
ort_session = ort.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])
# load the processor
model_id = "models/rgb_language_vqa_onnx"
processor = BlipProcessor.from_pretrained(model_id)
# prepare the input image and question
raw_image = Image.open("img1.jpg")
question = "Where is the person?"
# process the inputs using the processor
inputs = processor(raw_image, question, return_tensors="np")
# the input tensors for ONNX
pixel_values = inputs["pixel_values"]
input_ids = inputs["input_ids"]
# run inference on the ONNX model
outputs = ort_session.run(
    None,
    {
        "pixel_values": pixel_values,
        "input_ids": input_ids,
    },
)
# decode the generated token IDs into text
output_ids = outputs[0]  # generated token IDs
decoded_output = processor.tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(decoded_output)
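The feed names passed to ort_session.run above ("pixel_values", "input_ids") must match the input names baked into the exported graph, which can differ between exports. If the run fails with an input-name or missing-input error (for example, an export that also expects an attention_mask), a quick way to check is to list the session's expected inputs and outputs. This is a minimal sketch that only assumes the ort_session created above:
# inspect the input/output signature of the exported ONNX graph
for inp in ort_session.get_inputs():
    print("input:", inp.name, inp.shape, inp.type)
for out in ort_session.get_outputs():
    print("output:", out.name, out.shape, out.type)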