services:
  sglang:
    image: lmsysorg/sglang:latest
    container_name: sglang
    volumes:
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
      # If you use ModelScope, you need to mount this directory
      # - ${HOME}/.cache/modelscope:/root/.cache/modelscope
    restart: always
    network_mode: host # Or publish only port 30000:
    # ports:
    #   - 30000:30000
    environment:
      HF_TOKEN: # set your Hugging Face access token here
      # If you use ModelScope to download the model, set this variable:
      # SGLANG_USE_MODELSCOPE: true
    entrypoint: python3 -m sglang.launch_server
    command: --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
    ulimits:
      memlock: -1
      stack: 67108864
    ipc: host
    healthcheck:
      test: ["CMD-SHELL", "curl -f http://localhost:30000/health || exit 1"]
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ['0']
              capabilities: [gpu]
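
# Usage (a minimal sketch; assumes this file is saved as compose.yaml and the
# host has an NVIDIA GPU with the NVIDIA Container Toolkit installed):
#
#   docker compose up -d
#
# Once the healthcheck passes, the server answers on port 30000, e.g.:
#
#   curl http://localhost:30000/health
#   curl http://localhost:30000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Llama-3.1-8B-Instruct",
#          "messages": [{"role": "user", "content": "Hello"}]}'
#
# The /v1/chat/completions route assumes SGLang's OpenAI-compatible API;
# check the SGLang docs if your version exposes a different endpoint.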