from pathlib import Path

import onnx
from onnxconverter_common import float16
from onnxruntime.quantization import quantize_dynamic, QuantType
from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

# Set model name and output directory
model_name = "dangvantuan/vietnamese-embedding"
output_dir = Path("onnx")
output_dir.mkdir(parents=True, exist_ok=True)

# -------------------------------------------
# Step 1: Export the model to ONNX (FP32)
# -------------------------------------------
print("Exporting the FP32 model...")
model = ORTModelForFeatureExtraction.from_pretrained(model_name, export=True)
model.save_pretrained(output_dir)

# Save the tokenizer alongside the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_dir)

# Define FP32 model path
model_fp32_path = output_dir / "model.onnx"

# -------------------------------------------
# Step 2: Convert FP32 model to FP16
# -------------------------------------------
print("Converting to FP16...")
model_fp16_path = output_dir / "model-fp16.onnx"

# Load the FP32 ONNX model
model_fp32 = onnx.load(model_fp32_path.as_posix())

# Convert weights to FP16 while keeping graph inputs/outputs in FP32
model_fp16 = float16.convert_float_to_float16(model_fp32, keep_io_types=True)

# Save the FP16 model
onnx.save(model_fp16, model_fp16_path.as_posix())

# -------------------------------------------
# Step 3: Convert FP32 model to INT8 (dynamic quantization)
# -------------------------------------------
print("Converting to INT8 (dynamic quantization)...")
model_int8_path = output_dir / "model-int8.onnx"

quantize_dynamic(
    model_fp32_path.as_posix(),
    model_int8_path.as_posix(),
    weight_type=QuantType.QInt8,  # Use QInt8 or QUInt8 depending on your requirements
)

print("✅ Model conversion complete!")
print(f"FP32 model: {model_fp32_path}")
print(f"FP16 model: {model_fp16_path}")
print(f"INT8 model: {model_int8_path}")
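
# -------------------------------------------
# Optional: sanity-check the exported models
# -------------------------------------------
# A minimal verification sketch, not part of the conversion itself: it loads the
# INT8 model with onnxruntime, runs one sentence through it, and mean-pools the
# token embeddings into a sentence embedding. The sample sentence, the mean-pooling
# choice, and the assumption that the first graph output holds the token-level
# hidden states are illustrative assumptions; adapt them to your own usage.
import numpy as np
import onnxruntime as ort

session = ort.InferenceSession(model_int8_path.as_posix())
expected_inputs = {i.name for i in session.get_inputs()}

# Tokenize a sample sentence, cast to int64 (what the exported graph expects),
# and keep only the inputs the ONNX graph actually declares
encoded = tokenizer("Hà Nội là thủ đô của Việt Nam.", return_tensors="np")
ort_inputs = {
    name: np.asarray(value, dtype=np.int64)
    for name, value in encoded.items()
    if name in expected_inputs
}

# Assumed: outputs[0] is the token-level hidden states; mean-pool over tokens
token_embeddings = session.run(None, ort_inputs)[0]
sentence_embedding = token_embeddings.mean(axis=1)
print(f"INT8 sentence embedding shape: {sentence_embedding.shape}")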