import subprocess
from huggingface_hub import snapshot_download
import vllm
import modal

APP_NAME = "llm-server"
VOLUME_NAME = APP_NAME + "-volume"
MOUNT_VOLUME = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
MOUNT_DIR = "/data"
# Model identifier for the Hugging Face model
# NOTE: Gemma-3 GGUF models are not supported by vLLM yet (2025-06-10).
# NOTE: vLLM pre-allocates GPU memory up to the fraction given by `gpu_memory_utilization` at initialization.
# https://huggingface.co/google/gemma-3-4b-it
MODEL_IDENTIFIER = "google/gemma-3-4b-it" # GPU memory requirements: 10GB when MAX_MODEL_TOKENS=2k, 20GB when MAX_MODEL_TOKENS=128k
# https://huggingface.co/google/gemma-3-12b-it
# MODEL_IDENTIFIER = "google/gemma-3-12b-it"
# https://huggingface.co/google/gemma-3-27b-it
# MODEL_IDENTIFIER = "google/gemma-3-27b-it"
# https://modal.com/docs/guide/gpu#specifying-gpu-type
GPU_NAME = "A100-40GB"
GPU_NUM = 1 # Number of GPUs to use
GPU = f"{GPU_NAME}:{GPU_NUM}"
# https://modal.com/pricing
# | GPU | Memory | Price |
# |-----------|--------|----------|
# | B200 | 180 GB | $6.25 /h |
# | H200 | 141 GB | $4.54 /h |
# | H100 | 80 GB | $3.95 /h |
# | A100-80GB | 80 GB | $2.50 /h |
# | A100-40GB | 40 GB | $2.10 /h |
# | L40S | 48 GB | $1.95 /h |
# | A10G | 24 GB | $1.10 /h |
# | L4 | 24 GB | $0.80 /h |
# | T4 | 16 GB | $0.59 /h |
# MAX_MODEL_TOKENS must cover input + output tokens (MAX_MODEL_TOKENS >= input + output).
MAX_MODEL_TOKENS = 128 * 1024  # Gemma-3 4B and larger variants have a 128K context window
MAX_OUTPUT_TOKENS = 512
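# Illustrative budget with the values above: MAX_MODEL_TOKENS = 131072 and MAX_OUTPUT_TOKENS = 512,
# so the length check in generate()/generate_stream() accepts prompts of up to
# 131072 - 512 = 130560 tokens.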

image = (
    # https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04/images/sha256-4b9ed5fa8361736996499f64ecebf25d4ec37ff56e4d11323ccde10aa36e0c43
modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu24.04", add_python="3.12")
.pip_install(
        [
            "accelerate>=1.7.0",
            "bitsandbytes>=0.46.0",
            "sentencepiece>=0.2.0",
            "torch==2.7.0",  # torch 2.7.1 is not compatible with vLLM
            "transformers>=4.52.4",
            "vllm>=0.9.0.1",
        ]
)
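    # Redirect the Hugging Face and vLLM caches onto the mounted Volume (MOUNT_DIR) so that
    # downloaded weights and compiled artifacts persist across container restarts
    # (assumption: that is the intent of setting HF_HOME / VLLM_CACHE_ROOT here).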
.env(
{
"HF_HOME": MOUNT_DIR + "/huggingface",
"VLLM_CACHE_ROOT": MOUNT_DIR + "/vllm",
}
)
)

app = modal.App(APP_NAME, image=image)


# NOTE: `@app.cls`, `@modal.enter()`, and `@modal.method()` fill the role of `@app.function()`
# for a class that keeps state (here, the loaded vLLM engine) alive for the lifetime of a container.
# https://modal.com/docs/guide/lifecycle-functions
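# For comparison, a rough sketch (hypothetical, not part of this app) of the stateless
# `@app.function()` form, which would rebuild the vLLM engine on every call:
#
#     @app.function(gpu=GPU, volumes={MOUNT_DIR: MOUNT_VOLUME})
#     def generate_once(chat_history):
#         llm = vllm.LLM(model=MODEL_IDENTIFIER, max_model_len=MAX_MODEL_TOKENS)
#         ...
#
# The class below avoids that cost by creating the engine once in @modal.enter().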
@app.cls(
gpu=GPU,
image=image,
volumes={MOUNT_DIR: MOUNT_VOLUME},
secrets=[modal.Secret.from_name("huggingface-secret")],
scaledown_window=15 * 60,
timeout=30 * 60,
)
class VLLMModel:
@modal.enter()
def setup(self):
# Ensure the cache volume is the latest
MOUNT_VOLUME.reload()
        # NOTE: The "HF_TOKEN" environment variable (supplied via the `huggingface-secret`
        # Modal secret) is required for Hugging Face authentication.
# self._download_model(MODEL_IDENTIFIER) # This is not needed because vLLM can download the model automatically.
self._load_model()
# Commit the volume to ensure the model is saved
MOUNT_VOLUME.commit()
def _download_model(self, repo_id: str):
"""Download the model from Hugging Face if not already present."""
# Ensure the cache volume is the latest
MOUNT_VOLUME.reload()
snapshot_download(
repo_id=repo_id,
)
# Commit downloaded model
MOUNT_VOLUME.commit()
def _load_model(self):
self.llm = vllm.LLM(
model=MODEL_IDENTIFIER,
            tensor_parallel_size=GPU_NUM,  # shard the model across all attached GPUs
dtype="auto",
max_model_len=MAX_MODEL_TOKENS,
gpu_memory_utilization=0.9,
trust_remote_code=True,
)
# Show GPU information
subprocess.run(["nvidia-smi"])
@modal.method()
def generate(self, chat_history):
"""Generate a response"""
formatted_text = self._get_formatted_text(chat_history)
input_token_len = self._check_input_length(formatted_text)
if input_token_len + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
            raise ValueError(
                f"Input ({input_token_len} tokens) plus MAX_OUTPUT_TOKENS ({MAX_OUTPUT_TOKENS}) "
                f"exceeds MAX_MODEL_TOKENS ({MAX_MODEL_TOKENS})."
            )
sampling_params = self._get_sampling_params()
outputs = self.llm.generate([formatted_text], sampling_params)
response = outputs[0].outputs[0].text
return response
@modal.method()
def generate_stream(self, chat_history):
"""Generate a streaming response"""
formatted_text = self._get_formatted_text(chat_history)
input_token_len = self._check_input_length(formatted_text)
if input_token_len + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
            raise ValueError(
                f"Input ({input_token_len} tokens) plus MAX_OUTPUT_TOKENS ({MAX_OUTPUT_TOKENS}) "
                f"exceeds MAX_MODEL_TOKENS ({MAX_MODEL_TOKENS})."
            )
sampling_params = self._get_sampling_params()
        # NOTE: llm.generate() blocks until generation finishes; the loop below yields the
        # completed outputs, so streaming happens at the Modal layer rather than token by token
        # inside vLLM.
for output in self.llm.generate([formatted_text], sampling_params):
for completion_output in output.outputs:
yield completion_output.text
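        # NOTE (possible alternative, not used here): token-by-token streaming would require
        # vLLM's async engine (vllm.AsyncLLMEngine), whose generate() is an async generator
        # yielding partial RequestOutputs; the offline vllm.LLM API above returns only
        # completed outputs.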
def _get_formatted_text(self, chat_history):
"""Format the chat history"""
tokenizer = self.llm.get_tokenizer()
return tokenizer.apply_chat_template(
chat_history,
tokenize=False,
add_generation_prompt=True,
)
def _check_input_length(self, formatted_text):
tokenizer = self.llm.get_tokenizer()
input_token_len = len(tokenizer(formatted_text)["input_ids"])
return input_token_len
def _get_sampling_params(self):
"""Get sampling parameters for generation"""
return vllm.SamplingParams(
temperature=1.0,
top_k=50,
top_p=1.0,
max_tokens=MAX_OUTPUT_TOKENS,
)


@app.local_entrypoint()
def main():
SYSTEM_PROMPT = (
"You are a friendly Chatbot. Please respond in the same language as the user."
)
# Initialize chat history list
chat_history = []
chat_history.append(
{"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
)
user_prompt = "Hi!"
print(f"USER: {user_prompt}\n")
chat_history.append(
{
"role": "user",
"content": [
# {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
{"type": "text", "text": user_prompt}
],
}
)
model = VLLMModel()
# Call non-streaming function
response = model.generate.remote(chat_history)
print("AI:", response)
chat_history.append(
{"role": "assistant", "content": [{"type": "text", "text": response}]}
)
user_prompt = "What is your name?"
print(f"USER: {user_prompt}\n")
chat_history.append(
{
"role": "user",
"content": [{"type": "text", "text": user_prompt}],
}
)
# Call streaming function
print("AI: ", end="", flush=True)
response = ""
for chunk in model.generate_stream.remote_gen(chat_history):
print(chunk, end="", flush=True)
response += chunk
print()
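

# Usage (assuming this file is saved as, e.g., llm_server.py):
#   modal run llm_server.py      # run main() above as an ephemeral app
#   modal deploy llm_server.py   # deploy the app so VLLMModel can be invoked remotely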