Quantization method specified in the model config (fp8) does not match the quantization method specified in the `quantization` argument (gguf).
Issue #13, opened by Minami-su
#!/usr/bin/env bash
# Launch a vLLM OpenAI-compatible API server for a GGUF-quantized DeepSeek-R1 model.
# Logs go to $LOG_FILE; the server is detached via nohup + '&'.
set -euo pipefail

# Paths: the GGUF shard to load, and the directory holding the HF config/tokenizer.
MODEL_PATH="/data2/jcxy/llm_model/DeepSeek-R1-0528-GGUF-UD-Q2_K_XL/DeepSeek-R1-0528-UD-Q2_K_XL-00001-of-00006.gguf"
CONFIG_DIR="/data2/jcxy/llm_model/DeepSeek-R1-0528-GGUF-UD-Q2_K_XL"
LOG_FILE="vllm.log"
SERVED_MODEL_NAME="DeepSeek-R1-0528"

export VLLM_USE_V1=1
export CUDA_VISIBLE_DEVICES=2,3,4,5

# Run the serve command. (Original note "运行命令" translated.)
# FIX: every option line now ends with '\'; without the continuations the
# shell executed only the first line and treated "--model ..." etc. as
# separate commands, so none of these options ever reached vLLM.
# NOTE(review): the reported "fp8 vs gguf" mismatch presumably comes from a
# config.json in $CONFIG_DIR that declares fp8 quantization while vLLM
# autodetects GGUF from the weights — verify that config before serving.
nohup python -m vllm.entrypoints.openai.api_server \
  --model "$MODEL_PATH" \
  --hf-config-path "$CONFIG_DIR" \
  --tokenizer "$CONFIG_DIR" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --trust-remote-code \
  --port 6011 \
  --host 0.0.0.0 \
  --dtype auto \
  --max-model-len 8192 \
  --cpu-offload-gb 80 \
  --gpu-memory-utilization 0.98 \
  --tensor-parallel-size 4 \
  --enable-prefix-caching \
  >"$LOG_FILE" 2>&1 &