import subprocess

from huggingface_hub import snapshot_download
import vllm

import modal

APP_NAME = "llm-server"
VOLUME_NAME = APP_NAME + "-volume"
MOUNT_VOLUME = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
MOUNT_DIR = "/data"

# Model identifier for the Hugging Face model
# NOTE: Gemma-3 GGUF models are not supported by vLLM yet (2025-06-10).
# NOTE: vLLM pre-allocates GPU memory according to the value of `gpu_memory_utilization` at initialization.
# https://huggingface.co/google/gemma-3-4b-it
MODEL_IDENTIFIER = "google/gemma-3-4b-it"  # GPU memory requirements: 10GB when MAX_MODEL_TOKENS=2k, 20GB when MAX_MODEL_TOKENS=128k
# https://huggingface.co/google/gemma-3-12b-it
# MODEL_IDENTIFIER = "google/gemma-3-12b-it"
# https://huggingface.co/google/gemma-3-27b-it
# MODEL_IDENTIFIER = "google/gemma-3-27b-it"

# https://modal.com/docs/guide/gpu#specifying-gpu-type
GPU_NAME = "A100-40GB"
GPU_NUM = 1  # Number of GPUs to use
GPU = f"{GPU_NAME}:{GPU_NUM}"

# https://modal.com/pricing
# | GPU       | Memory | Price    |
# |-----------|--------|----------|
# | B200      | 180 GB | $6.25 /h |
# | H200      | 141 GB | $4.54 /h |
# | H100      |  80 GB | $3.95 /h |
# | A100-80GB |  80 GB | $2.50 /h |
# | A100-40GB |  40 GB | $2.10 /h |
# | L40S      |  48 GB | $1.95 /h |
# | A10G      |  24 GB | $1.10 /h |
# | L4        |  24 GB | $0.80 /h |
# | T4        |  16 GB | $0.59 /h |
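# With the 4B model at a 128K context (roughly 20 GB per the note above), a single
# A100-40GB is sufficient; the larger 12B/27B variants would likely need a bigger GPU.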

# MAX_MODEL_TOKENS >= Input + Output
MAX_MODEL_TOKENS = 128 * 1024  # Gemma-3 models of 4B and larger have a 128K context length
MAX_OUTPUT_TOKENS = 512
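# Worked example with the values above: MAX_MODEL_TOKENS = 128 * 1024 = 131,072 and
# MAX_OUTPUT_TOKENS = 512, so a prompt can use at most 131,072 - 512 = 130,560 tokens
# before the length check in `generate` / `generate_stream` rejects it.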

image = (
    # https://hub.docker.com/layers/nvidia/cuda/12.8.1-devel-ubuntu24.04/images/sha256-4b9ed5fa8361736996499f64ecebf25d4ec37ff56e4d11323ccde10aa36e0c43
    modal.Image.from_registry("nvidia/cuda:12.8.1-devel-ubuntu24.04", add_python="3.12")
    .pip_install(
        [
            "accelerate>=1.7.0",
            "bitsandbytes>=0.46.0",
            "sentencepiece>=0.2.0",
            "torch==2.7.0",  # torch 2.7.1 is not compatible with vLLM
            "transformers>=4.52.4",
            "vllm>=0.9.0.1",
        ]
    )
    .env(
        {
            "HF_HOME": MOUNT_DIR + "/huggingface",
            "VLLM_CACHE_ROOT": MOUNT_DIR + "/vllm",
        }
    )
)

app = modal.App(APP_NAME, image=image)

# NOTE: `@app.cls`, `@modal.enter()`, and `@modal.method()` take the place of `@app.function()` when serving from a class with lifecycle hooks.
# https://modal.com/docs/guide/lifecycle-functions


@app.cls(
    gpu=GPU,
    image=image,
    volumes={MOUNT_DIR: MOUNT_VOLUME},
    secrets=[modal.Secret.from_name("huggingface-secret")],
    scaledown_window=15 * 60,
    timeout=30 * 60,
)
class VLLMModel:

    @modal.enter()
    def setup(self):
        # Ensure the cache volume is the latest
        MOUNT_VOLUME.reload()

        # NOTE: the "HF_TOKEN" environment variable (injected via the "huggingface-secret" secret) is required for Hugging Face authentication.
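        # The secret can be created with the Modal CLI, for example (placeholder token value):
        #   modal secret create huggingface-secret HF_TOKEN=<your-hf-token>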

        # self._download_model(MODEL_IDENTIFIER)  # This is not needed because vLLM can download the model automatically.

        self._load_model()

        # Commit the volume to ensure the model is saved
        MOUNT_VOLUME.commit()

    def _download_model(self, repo_id: str):
        """Download the model from Hugging Face if not already present."""
        # Ensure the cache volume is the latest
        MOUNT_VOLUME.reload()

        snapshot_download(
            repo_id=repo_id,
        )

        # Commit downloaded model
        MOUNT_VOLUME.commit()

    def _load_model(self):

        self.llm = vllm.LLM(
            model=MODEL_IDENTIFIER,
            tensor_parallel_size=1,
            dtype="auto",
            max_model_len=MAX_MODEL_TOKENS,
            gpu_memory_utilization=0.9,
            trust_remote_code=True,
        )

        # Show GPU information
        subprocess.run(["nvidia-smi"])

    @modal.method()
    def generate(self, chat_history):
        """Generate a response"""
        formatted_text = self._get_formatted_text(chat_history)

        input_token_len = self._check_input_length(formatted_text)
        if input_token_len + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
            raise ValueError(
                f"Input ({input_token_len} tokens) plus reserved output ({MAX_OUTPUT_TOKENS} tokens) "
                f"exceeds the model context length of {MAX_MODEL_TOKENS} tokens."
            )

        sampling_params = self._get_sampling_params()

        outputs = self.llm.generate([formatted_text], sampling_params)
        response = outputs[0].outputs[0].text

        return response

    @modal.method()
    def generate_stream(self, chat_history):
        """Generate a streaming response"""
        formatted_text = self._get_formatted_text(chat_history)

        input_token_len = self._check_input_length(formatted_text)
        if input_token_len + MAX_OUTPUT_TOKENS > MAX_MODEL_TOKENS:
            raise ValueError(
                f"Input ({input_token_len} tokens) plus reserved output ({MAX_OUTPUT_TOKENS} tokens) "
                f"exceeds the model context length of {MAX_MODEL_TOKENS} tokens."
            )

        sampling_params = self._get_sampling_params()

        # Streaming generation with vLLM.
        # NOTE: the offline `LLM.generate` API blocks until generation completes, so each
        # item yielded here is a finished completion rather than an incremental token stream.
        for output in self.llm.generate([formatted_text], sampling_params):
            for completion_output in output.outputs:
                yield completion_output.text

    def _get_formatted_text(self, chat_history):
        """Format the chat history"""
        tokenizer = self.llm.get_tokenizer()
        return tokenizer.apply_chat_template(
            chat_history,
            tokenize=False,
            add_generation_prompt=True,
        )

    def _check_input_length(self, formatted_text):
        tokenizer = self.llm.get_tokenizer()
        input_token_len = len(tokenizer(formatted_text)["input_ids"])
        return input_token_len

    def _get_sampling_params(self):
        """Get sampling parameters for generation"""
        return vllm.SamplingParams(
            temperature=1.0,
            top_k=50,
            top_p=1.0,
            max_tokens=MAX_OUTPUT_TOKENS,
        )


@app.local_entrypoint()
def main():
    SYSTEM_PROMPT = (
        "You are a friendly Chatbot. Please respond in the same language as the user."
    )

    # Initialize chat history list
    chat_history = []
    chat_history.append(
        {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
    )

    user_prompt = "Hi!"
    print(f"USER: {user_prompt}\n")
    chat_history.append(
        {
            "role": "user",
            "content": [
                # {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
                {"type": "text", "text": user_prompt}
            ],
        }
    )

    model = VLLMModel()

    # Call non-streaming function
    response = model.generate.remote(chat_history)
    print("AI:", response)
    chat_history.append(
        {"role": "assistant", "content": [{"type": "text", "text": response}]}
    )

    user_prompt = "What is your name?"
    print(f"USER: {user_prompt}\n")
    chat_history.append(
        {
            "role": "user",
            "content": [{"type": "text", "text": user_prompt}],
        }
    )

    # Call streaming function
    print("AI: ", end="", flush=True)
    response = ""
    for chunk in model.generate_stream.remote_gen(chat_history):
        print(chunk, end="", flush=True)
        response += chunk
    print()
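

# A minimal sketch of how to run this file with the Modal CLI (the llm_server.py
# filename is an assumption for illustration):
#   modal run llm_server.py      # runs the `main` local entrypoint above
#   modal deploy llm_server.py   # deploys the app for remote invocation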