gpt-oss-20b
Collection
gpt-oss-20b
•
2 items
•
Updated
This is an INT4-quantized model for Edge AI scenarios on NVIDIA GPUs. You can deploy it on your own devices that have an NVIDIA GPU.
Note: This is an unofficial version, intended only for testing and development.
Note: AzureML, Google Colab, or Vast.ai are good environments for this.
ONNXRuntime GenAI
git clone https://github.com/microsoft/onnxruntime-genai.git
# Build inside a subshell so the working directory is unchanged afterwards;
# the commands that follow use paths relative to the directory containing the clone.
(cd onnxruntime-genai && python3 build.py --use_cuda --cuda_home=/usr/local/cuda --config Release)
# Glob the wheel name: its version and CPython tag depend on the checkout and
# on the interpreter used for the build, so a hard-coded filename goes stale.
pip install ./onnxruntime-genai/build/Linux/Release/wheel/onnxruntime_genai_cuda-*.whl -U
TensorRT-LLM
sudo apt-get -y install libopenmpi-dev
pip install cuda-python==12.8
pip install tensorrt_llm==0.20.0 -U
Transformers
pip install transformers==4.55.2 -U
Torch-TensorRT
pip install torch_tensorrt==2.7.0 --upgrade
ONNXRuntime
pip install onnx==1.17.0 -U
pip install onnx_ir==0.1.6 -U
pip install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --pre onnxruntime-gpu==1.23.0.dev20250810002 -U
PyTorch
Note: PyTorch 2.8.0
pip install torch torchvision torchaudio -U
CuPy
pip install cupy-cuda12x
# -o is the output directory for the INT4 CUDA model; it must be the same directory
# the inference script loads (./gpt_oss_int4_cuda). Quote the path if it contains spaces.
! python3 ./onnxruntime-genai/src/python/py/models/builder.py -m openai/gpt-oss-20b -o ./gpt_oss_int4_cuda -p int4 -e cuda -c ./cache_dir --extra_options int4_algo_config=k_quant_mixed
import onnxruntime_genai as og
import argparse
import os
import json
import time
# Directory holding the ONNX model produced by builder.py; set this to the
# path that was passed to its -o option. Used for both the model config and
# the optional chat template so the two can never point at different places.
MODEL_DIR = "./gpt_oss_int4_cuda"

# Load the model with CUDA as the only execution provider.
config = og.Config(MODEL_DIR)
config.clear_providers()
config.append_provider("cuda")
model = og.Model(config)
tokenizer = og.Tokenizer(model)
tokenizer_stream = tokenizer.create_stream()

search_options = {"top_k": 50, "max_length": 256}

# apply_chat_template receives the conversation as a JSON string; build it
# with json.dumps instead of hand-escaping braces and quotes in an f-string.
messages = json.dumps([{"role": "user", "content": "can you introduce yourself"}])

# Use the chat template stored next to the model when it exists; otherwise
# call apply_chat_template without an explicit template string.
template_kwargs = {}
jinja_path = os.path.join(MODEL_DIR, "chat_template.jinja")
if os.path.exists(jinja_path):
    with open(jinja_path, "r", encoding="utf-8") as f:
        template_kwargs["template_str"] = f.read()

# The conversation ends with a user turn that the model should answer, so
# request the generation prompt that opens the assistant turn.
prompt = tokenizer.apply_chat_template(
    messages=messages,
    add_generation_prompt=True,
    **template_kwargs,
)

params = og.GeneratorParams(model)
params.set_search_options(**search_options)
generator = og.Generator(model, params)
input_tokens = tokenizer.encode(prompt)
generator.append_tokens(input_tokens)

# Stream the reply: decode and print each token as soon as it is generated.
while not generator.is_done():
    generator.generate_next_token()
    new_token = generator.get_next_tokens()[0]
    print(tokenizer_stream.decode(new_token), end="", flush=True)
print()  # finish the streamed output with a newline