from pathlib import Path

import onnx
from onnxconverter_common import float16
from onnxruntime.quantization import quantize_dynamic, QuantType
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

model_name = "dangvantuan/vietnamese-embedding"
output_dir = Path("onnx")
output_dir.mkdir(parents=True, exist_ok=True)

# Export the PyTorch checkpoint to an FP32 ONNX graph via Optimum.
print("Exporting the FP32 model...")
model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
model.save_pretrained(output_dir)
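
# The export writes model.onnx (plus config.json) into onnx/. A quick,
# optional existence check before the conversions below:
assert (output_dir / "model.onnx").exists(), "FP32 ONNX export failed"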

# Save the tokenizer files to the current directory, alongside the onnx/
# folder (the common Hub layout: tokenizer at the top level, ONNX weights
# under onnx/).
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(Path("."))

model_fp32_path = output_dir / "model.onnx"

# Convert the FP32 graph to FP16. keep_io_types=True keeps the graph's
# inputs and outputs in float32, so callers can feed the same tensors as
# before; only the internal weights and ops are cast to float16.
print("Converting to FP16...")
model_fp16_path = output_dir / "model-fp16.onnx"
model_fp32 = onnx.load(model_fp32_path.as_posix())
model_fp16 = float16.convert_float_to_float16(model_fp32, keep_io_types=True)
onnx.save(model_fp16, model_fp16_path.as_posix())
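
# Optional size check: FP16 initializers take 2 bytes instead of 4, so the
# FP16 file should come out at roughly half the FP32 size.
fp32_mb = model_fp32_path.stat().st_size / 1e6
fp16_mb = model_fp16_path.stat().st_size / 1e6
print(f"  FP32: {fp32_mb:.1f} MB -> FP16: {fp16_mb:.1f} MB")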

# Dynamic quantization stores the weights as INT8 and quantizes activations
# on the fly at inference time; no calibration data is needed.
print("Converting to INT8 (dynamic quantization)...")
model_int8_path = output_dir / "model-int8.onnx"
quantize_dynamic(
    model_fp32_path.as_posix(),
    model_int8_path.as_posix(),
    weight_type=QuantType.QInt8,
)

print("✅ Model conversion complete!")
print(f"FP32 model: {model_fp32_path}")
print(f"FP16 model: {model_fp16_path}")
print(f"INT8 model: {model_int8_path}")
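
# Optional sanity check: a minimal sketch that embeds one sentence with each
# exported model and compares the result against the FP32 output. It assumes
# `onnxruntime` and `numpy` are importable, and that the first graph output
# is the last hidden state (the usual layout for a feature-extraction
# export); the example sentence is arbitrary.
import numpy as np
import onnxruntime as ort

encoded = tokenizer("Hà Nội là thủ đô của Việt Nam.", return_tensors="np")

def embed(model_path: Path) -> np.ndarray:
    session = ort.InferenceSession(model_path.as_posix())
    # Feed only the inputs this particular graph declares.
    input_names = {i.name for i in session.get_inputs()}
    feeds = {k: v for k, v in encoded.items() if k in input_names}
    last_hidden_state = session.run(None, feeds)[0]
    # Mean-pool the token embeddings into a single sentence vector.
    return last_hidden_state.mean(axis=1)[0]

reference = embed(model_fp32_path)
for path in (model_fp16_path, model_int8_path):
    candidate = embed(path)
    cosine = float(
        np.dot(reference, candidate)
        / (np.linalg.norm(reference) * np.linalg.norm(candidate))
    )
    # Values close to 1.0 mean the compressed model still agrees with FP32.
    print(f"{path.name}: cosine similarity vs FP32 = {cosine:.4f}")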