import subprocess

from huggingface_hub import snapshot_download
import vllm

import modal

APP_NAME = "llm-server"
VOLUME_NAME = APP_NAME + "-volume"
MOUNT_VOLUME = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
MOUNT_DIR = "/data"

# Model identifier for the Hugging Face model
# NOTE: Gemma-3 GGUF models are not supported by vLLM yet (2025-06-10).
# NOTE: vLLM pre-allocates GPU memory according to the value of `gpu_memory_utilization` at initialization.
# https://huggingface.co/google/gemma-3-4b-it
MODEL_IDENTIFIER = "google/gemma-3-4b-it"  # GPU memory requirements: 10GB when MAX_MODEL_TOKENS=2k, 20GB when MAX_MODEL_TOKENS=128k
# https://huggingface.co/google/gemma-3-12b-it
# MODEL_IDENTIFIER = "google/gemma-3-12b-it"
# https://huggingface.co/google/gemma-3-27b-it
# MODEL_IDENTIFIER = "google/gemma-3-27b-it"

# https://modal.com/docs/guide/gpu#specifying-gpu-type
GPU_NAME = "A100-40GB"
GPU_NUM = 1  # Number of GPUs to use
GPU = f"{GPU_NAME}:{GPU_NUM}"

# https://modal.com/pricing
# | GPU       | Memory | Price    |
# |-----------|--------|----------|
# | B200      | 180 GB | $6.25 /h |
# | H200      | 141 GB | $4.54 /h |
# | H100      |  80 GB | $3.95 /h |
# | A100-80GB |  80 GB | $2.50 /h |
# | A100-40GB |  40 GB | $2.10 /h |
# | L40S      |  48 GB | $1.95 /h |
# | A10G      |  24 GB | $1.10 /h |
# | L4        |  24 GB | $0.80 /h |
# | T4        |  16 GB | $0.59 /h |
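# With the 4B model at a 128K context (roughly 20 GB per the note above), a single
# A100-40GB is sufficient; the larger 12B/27B variants would likely need a bigger GPU.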

# MAX_MODEL_TOKENS >= Input + Output
MAX_MODEL_TOKENS = 128 * 1024  # Gemma-3 models of 4B and larger have a 128K context length
MAX_OUTPUT_TOKENS = 512
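# Worked example with the values above: MAX_MODEL_TOKENS = 128 * 1024 = 131,072 and
# MAX_OUTPUT_TOKENS = 512, so a prompt can use at most 131,072 - 512 = 130,560 tokens
# before the length check in `generate` / `generate_stream` rejects it.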

image = (
    # https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04/images/sha256-4b9ed5fa8361736996499f64ecebf25d4ec37ff56e4d11323ccde10aa36e0c43
    modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu24.04", add_python="3.12")
    .pip_install(
        [
            "accelerate>=1.7.0",
            "bitsandbytes>=0.46.0",
            "sentencepiece>=0.2.0",
            "torch==2.7.0",  # torch 2.7.1 is not compatible with vLLM
            "transformers>=4.52.4",
            "vllm>=0.9.0.1",
        ]
    )
    .env(
        {
            "HF_HOME": MOUNT_DIR + "/huggingface",
            "VLLM_CACHE_ROOT": MOUNT_DIR + "/vllm",
        }
    )
)

app = modal.App(APP_NAME, image=image)

# NOTE: `@app.cls`, `@modal.enter()`, and `@modal.method()` take the place of `@app.function()` when serving from a class with lifecycle hooks.
# https://modal.com/docs/guide/lifecycle-functions


@app.cls(
    gpu=GPU,
    image=image,
    volumes={MOUNT_DIR: MOUNT_VOLUME},
    secrets=[modal.Secret.from_name("huggingface-secret")],
    scaledown_window=15 * 60,
    timeout=30 * 60,
)
class VLLMModel:

    @modal.enter()
    def setup(self):
        # Ensure the cache volume is the latest
        MOUNT_VOLUME.reload()

        # NOTE: the "HF_TOKEN" environment variable (injected via the "huggingface-secret" secret) is required for Hugging Face authentication.
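        # The secret can be created with the Modal CLI, for example (placeholder token value):
        #   modal secret create huggingface-secret HF_TOKEN=<your-hf-token>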

        # self._download_model(MODEL_IDENTIFIER)  # This is not needed because vLLM can download the model automatically.

        self._load_model()

        # Commit the volume to ensure the model is saved
        MOUNT_VOLUME.commit()

    def _download_model(self, repo_id: str):
        """Download the model from Hugging Face if not already present."""
        # Ensure the cache volume is the latest
        MOUNT_VOLUME.reload()

        snapshot_download(
            repo_id=repo_id,
        )

        # Commit downloaded model
        MOUNT_VOLUME.commit()

    def _load_model(self):

        self.llm = vllm.LLM(
            model=MODEL_IDENTIFIER,
            tensor_parallel_size=1,
            dtype="auto",
            max_model_len=MAX_MODEL_TOKENS,
            gpu_memory_utilization=0.9,
            trust_remote_code=True,
        )

        # Show GPU information
        subprocess.run(["nvidia-smi"])

    @modal.method()
    def generate(self, chat_history):
        """Generate a response"""
        formatted_text = self._get_formatted_text(chat_history)

        input_token_len = self._check_input_length(formatted_text)
        if input_token_len + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
            raise ValueError(
                f"Input ({input_token_len} tokens) plus reserved output ({MAX_OUTPUT_TOKENS} tokens) "
                f"exceeds the model context length of {MAX_MODEL_TOKENS} tokens."
            )

        sampling_params = self._get_sampling_params()

        outputs = self.llm.generate([formatted_text], sampling_params)
        response = outputs[0].outputs[0].text

        return response

    @modal.method()
    def generate_stream(self, chat_history):
        """Generate a streaming response"""
        formatted_text = self._get_formatted_text(chat_history)

        input_token_len = self._check_input_length(formatted_text)
        if input_token_len + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
            raise ValueError(
                f"Input ({input_token_len} tokens) plus reserved output ({MAX_OUTPUT_TOKENS} tokens) "
                f"exceeds the model context length of {MAX_MODEL_TOKENS} tokens."
            )

        sampling_params = self._get_sampling_params()

        # Streaming generation with vLLM.
        # NOTE: the offline `LLM.generate` API blocks until generation completes, so each
        # item yielded here is a finished completion rather than an incremental token stream.
        for output in self.llm.generate([formatted_text], sampling_params):
            for completion_output in output.outputs:
                yield completion_output.text

    def _get_formatted_text(self, chat_history):
        """Format the chat history"""
        tokenizer = self.llm.get_tokenizer()
        return tokenizer.apply_chat_template(
            chat_history,
            tokenize=False,
            add_generation_prompt=True,
        )

    def _check_input_length(self, formatted_text):
        tokenizer = self.llm.get_tokenizer()
        input_token_len = len(tokenizer(formatted_text)["input_ids"])
        return input_token_len

    def _get_sampling_params(self):
        """Get sampling parameters for generation"""
        return vllm.SamplingParams(
            temperature=1.0,
            top_k=50,
            top_p=1.0,
            max_tokens=MAX_OUTPUT_TOKENS,
        )


@app.local_entrypoint()
def main():
    SYSTEM_PROMPT = (
        "You are a friendly Chatbot. Please respond in the same language as the user."
    )

    # Initialize chat history list
    chat_history = []
    chat_history.append(
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
    )

    user_prompt = "Hi!"
    print(f"USER: {user_prompt}\n")
    chat_history.append(
        {
            "role": "user",
            "content": [
                # {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
                {"type": "text", "text": user_prompt}
            ],
        }
    )

    model = VLLMModel()

    # Call non-streaming function
    response = model.generate.remote(chat_history)
    print("AI:", response)
    chat_history.append(
        {"role": "assistant", "content": [{"type": "text", "text": response}]}
    )

    user_prompt = "What is your name?"
    print(f"USER: {user_prompt}\n")
    chat_history.append(
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt}],
        }
    )

    # Call streaming function
    print("AI: ", end="", flush=True)
    response = ""
    for chunk in model.generate_stream.remote_gen(chat_history):
        print(chunk, end="", flush=True)
        response += chunk
    print()
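

# A minimal sketch of how to run this file with the Modal CLI (the llm_server.py
# filename is an assumption for illustration):
#   modal run llm_server.py      # runs the `main` local entrypoint above
#   modal deploy llm_server.py   # deploys the app for remote invocation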