works fine in TensorRT-LLM too!
by textgeflecht
Tested with TensorRT-LLM v0.21.0rc0 on 1x RTX 5090:
```shell
# according to https://github.com/NVIDIA/TensorRT-LLM/tree/v0.21.0rc1/examples/models/core/qwen#serving
root@computer:/app/tensorrt_llm/examples/models/core/qwen# cat >./extra-llm-api-config.yml <<EOF
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
- 384
print_iter_log: true
enable_attention_dp: true
EOF
```
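The CUDA graph settings pre-capture decode graphs for the listed batch sizes (with `--max_batch_size 1` below, only the size-1 graph is ever hit). Before launching, you can sanity-check that the heredoc wrote valid YAML; a minimal check, assuming PyYAML is available in the container (it ships as a TensorRT-LLM dependency):

```shell
# parse the config back to confirm the heredoc produced valid YAML
python3 -c "import yaml; print(yaml.safe_load(open('extra-llm-api-config.yml')))"
```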
```shell
root@computer:/app/tensorrt_llm/examples/models/core/qwen# trtllm-serve /mymodel/ \
  --host localhost --port 8000 --backend pytorch --max_batch_size 1 \
  --max_num_tokens 2000 --max_seq_len 2000 \
  --tp_size 1 --ep_size 1 --pp_size 1 \
  --kv_cache_free_gpu_memory_fraction 0.9 --extra_llm_api_options ./extra-llm-api-config.yml
```
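Once the server is up, it speaks the OpenAI-compatible HTTP API. A minimal smoke test follows; the `model` field is an assumption here (it should match the served model name, which defaults to the path passed to trtllm-serve):

```shell
# wait until the server reports healthy
curl -s http://localhost:8000/health

# OpenAI-compatible chat completion against the local server
curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "/mymodel/",
        "messages": [{"role": "user", "content": "Hello! Reply in one line."}],
        "max_tokens": 64
      }'
```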
Thanks for the quant!