apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  replicas: 1
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: meta-llama-31-8b-instruct-sglang
  template:
    metadata:
      labels:
        app: meta-llama-31-8b-instruct-sglang
        model: meta-llama-31-8b-instruct
        engine: sglang
    spec:
      hostIPC: true
      restartPolicy: Always
      runtimeClassName: nvidia
      containers:
      - name: meta-llama-31-8b-instruct-sglang
        image: docker.io/lmsysorg/sglang:latest
        imagePullPolicy: Always # IfNotPresent or Never
        ports:
        - containerPort: 30000
        command: ["python3", "-m", "sglang.launch_server"]
        args: ["--model-path", "meta-llama/Llama-3.1-8B-Instruct", "--host", "0.0.0.0", "--port", "30000"]
        env:
        - name: HF_TOKEN
          value: "" # fill in your Hugging Face access token
        resources:
          limits:
            nvidia.com/gpu: 1
        volumeMounts:
        - name: hf-cache
          mountPath: /root/.cache/huggingface
          # must be writable: the server downloads model weights into this cache
        - name: localtime
          mountPath: /etc/localtime
          readOnly: true
        livenessProbe:
          httpGet:
            path: /health
            port: 30000
          initialDelaySeconds: 30
          periodSeconds: 10
      volumes:
      - name: hf-cache
        hostPath:
          path: /root/.cache/huggingface
          type: Directory
      - name: localtime
        hostPath:
          path: /etc/localtime
          type: File
---
apiVersion: v1
kind: Service
metadata:
  name: meta-llama-31-8b-instruct-sglang
spec:
  selector:
    app: meta-llama-31-8b-instruct-sglang
  ports:
  - protocol: TCP
    port: 30000       # port exposed by the Service
    targetPort: 30000 # port the container listens on
  type: LoadBalancer
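
# A minimal usage sketch, assuming kubectl is pointed at the target cluster, the
# NVIDIA device plugin is installed on the GPU node, and this manifest is saved
# as sglang-deployment.yaml (the filename and EXTERNAL-IP are placeholders):
#
#   kubectl apply -f sglang-deployment.yaml
#   kubectl get svc meta-llama-31-8b-instruct-sglang   # note the EXTERNAL-IP
#   curl http://EXTERNAL-IP:30000/health
#   curl http://EXTERNAL-IP:30000/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "meta-llama/Llama-3.1-8B-Instruct",
#          "messages": [{"role": "user", "content": "Hello!"}]}'
#
# /health is the same endpoint the liveness probe above uses; the chat request
# goes to SGLang's OpenAI-compatible API.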