Hi, may I ask how you do it?
Thanks :)
This is an example for model inference:

import torch
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

o4model = ORTModelForSequenceClassification.from_pretrained('bge-reranker-v2-m3-onnx-o4')
o4model.to("cuda")
tokenizer = AutoTokenizer.from_pretrained('bge-reranker-v2-m3-onnx-o4')

pairs = [['what is panda?', 'hi'],
         ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

with torch.no_grad():
    # pad/truncate the query-passage pairs, then score them with the ONNX model
    tokenizer.enable_padding()
    tokenizer.enable_truncation(max_length=512)
    inputs = tokenizer.encode_batch(pairs)
    scores = o4model(**inputs, return_dict=True).logits.view(-1).float()
    print(scores)
root@53720704aade:/media/data/xgp/scripts# python3 o4.py
/usr/local/lib/python3.10/dist-packages/xformers/ops/fmha/flash.py:211: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
  @torch.library.impl_abstract("xformers_flash::flash_fwd")
/usr/local/lib/python3.10/dist-packages/xformers/ops/fmha/flash.py:344: FutureWarning: torch.library.impl_abstract was renamed to torch.library.register_fake. Please use that instead; we will remove torch.library.impl_abstract in a future version of PyTorch.
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
use_io_binding was set to False, setting it to True because it can provide a huge speedup on GPUs. It is possible to disable this feature manually by setting the use_io_binding attribute back to False.
2025-05-20 03:36:19.684715619 [W:onnxruntime:, session_state.cc:1280 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.
2025-05-20 03:36:19.684750464 [W:onnxruntime:, session_state.cc:1282 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.
Traceback (most recent call last):
  File "/media/data/xgp/scripts/o4.py", line 17, in <module>
    tokenizer.enable_padding()
AttributeError: 'XLMRobertaTokenizerFast' object has no attribute 'enable_padding'
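
The AttributeError is an API mismatch: enable_padding(), enable_truncation(), and encode_batch() are methods of the low-level tokenizers.Tokenizer class, while AutoTokenizer returns an XLMRobertaTokenizerFast, which takes padding, truncation, and max_length as arguments to the tokenizer call itself. Below is a minimal corrected sketch, assuming 'bge-reranker-v2-m3-onnx-o4' is a local export directory containing both the ONNX model and the tokenizer files:

import torch
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

# assumed local ONNX export directory that also holds the tokenizer files
model_dir = 'bge-reranker-v2-m3-onnx-o4'
tokenizer = AutoTokenizer.from_pretrained(model_dir)
o4model = ORTModelForSequenceClassification.from_pretrained(model_dir)
o4model.to("cuda")

pairs = [['what is panda?', 'hi'],
         ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]

with torch.no_grad():
    # padding/truncation are passed to the tokenizer call, not toggled via
    # enable_padding()/enable_truncation(); the call returns a batch of tensors
    inputs = tokenizer(pairs, padding=True, truncation=True,
                       max_length=512, return_tensors='pt').to("cuda")
    # one relevance logit per (query, passage) pair
    scores = o4model(**inputs, return_dict=True).logits.view(-1).float()
    print(scores)

Moving the encoded inputs to "cuda" keeps them on the same device as the model, which should also let the use_io_binding speedup mentioned in the log above apply.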