tuandunghcmut committed
Commit 127dcad · verified · 1 parent: 95e1e2e

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the complete set of changes.
Files changed (50)
  1. LLaVA/.devcontainer/Dockerfile +53 -0
  2. LLaVA/.devcontainer/devcontainer.env +2 -0
  3. LLaVA/.devcontainer/devcontainer.json +71 -0
  4. LLaVA/.devcontainer/postCreateCommand.sh +45 -0
  5. LLaVA/docs/Evaluation.md +167 -0
  6. LLaVA/scripts/convert_sqa_to_llava_base_prompt.py +334 -0
  7. LLaVA/scripts/finetune_qlora.sh +50 -0
  8. LLaVA/scripts/pretrain.sh +46 -0
  9. LLaVA/scripts/zero2.json +23 -0
  10. sglang/.github/ISSUE_TEMPLATE/2-feature-request.yml +23 -0
  11. sglang/.github/workflows/close-inactive-issues.yml +96 -0
  12. sglang/.github/workflows/execute-notebook.yml +49 -0
  13. sglang/.github/workflows/lint.yml +22 -0
  14. sglang/.github/workflows/nightly-test.yml +34 -0
  15. sglang/.github/workflows/pr-test.yml +270 -0
  16. sglang/.github/workflows/release-docker-dev.yml +35 -0
  17. sglang/.github/workflows/release-docker.yml +64 -0
  18. sglang/.github/workflows/release-pypi-kernel.yml +41 -0
  19. sglang/.github/workflows/release-pypi.yml +31 -0
  20. sglang/3rdparty/amd/profiling/PROFILING.md +425 -0
  21. sglang/3rdparty/amd/profiling/client.sh +27 -0
  22. sglang/3rdparty/amd/profiling/install_rpd.sh +10 -0
  23. sglang/3rdparty/amd/profiling/loadTracer.sh +43 -0
  24. sglang/3rdparty/amd/profiling/rpd.patch +12 -0
  25. sglang/3rdparty/amd/profiling/rpd_profile_server_enable.patch +49 -0
  26. sglang/3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch +126 -0
  27. sglang/3rdparty/amd/profiling/server.sh +20 -0
  28. sglang/3rdparty/amd/tuning/TUNING.md +118 -0
  29. sglang/3rdparty/amd/tuning/benchmark_moe_rocm.py +377 -0
  30. sglang/benchmark/blog_v0_2/405b_sglang.sh +24 -0
  31. sglang/benchmark/blog_v0_2/405b_trt.sh +17 -0
  32. sglang/benchmark/blog_v0_2/405b_vllm.sh +24 -0
  33. sglang/benchmark/dspy/README.md +51 -0
  34. sglang/benchmark/dspy/bench_dspy_intro.py +192 -0
  35. sglang/benchmark/gsm8k/README.md +47 -0
  36. sglang/benchmark/gsm8k/bench_other.py +151 -0
  37. sglang/benchmark/gsm8k/bench_sglang.py +141 -0
  38. sglang/benchmark/hellaswag/README.md +47 -0
  39. sglang/benchmark/hellaswag/bench_other.py +118 -0
  40. sglang/benchmark/lora/launch_server.py +47 -0
  41. sglang/benchmark/lora/lora_bench.py +484 -0
  42. sglang/benchmark/mmlu/README.md +59 -0
  43. sglang/benchmark/mtbench/README.md +37 -0
  44. sglang/benchmark/mtbench/bench_other.py +111 -0
  45. sglang/benchmark/mtbench/bench_sglang.py +99 -0
  46. sglang/benchmark/multi_chain_reasoning/README.md +49 -0
  47. sglang/benchmark/multi_chain_reasoning/bench_sglang.py +140 -0
  48. sglang/benchmark/multi_turn_chat/README.md +66 -0
  49. sglang/benchmark/multi_turn_chat/bench_other.py +93 -0
  50. sglang/benchmark/multi_turn_chat/bench_sglang.py +79 -0
LLaVA/.devcontainer/Dockerfile ADDED
@@ -0,0 +1,53 @@
1
+ FROM mcr.microsoft.com/devcontainers/base:ubuntu-20.04
2
+
3
+ SHELL [ "bash", "-c" ]
4
+
5
+ # update apt and install packages
6
+ RUN apt update && \
7
+ apt install -yq \
8
+ ffmpeg \
9
+ dkms \
10
+ build-essential
11
+
12
+ # add user tools
13
+ RUN sudo apt install -yq \
14
+ jq \
15
+ jp \
16
+ tree \
17
+ tldr
18
+
19
+ # add git-lfs and install
20
+ RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash && \
21
+ sudo apt-get install -yq git-lfs && \
22
+ git lfs install
23
+
24
+ ############################################
25
+ # Setup user
26
+ ############################################
27
+
28
+ USER vscode
29
+
30
+ # install azcopy, a tool to copy to/from blob storage
31
+ # for more info: https://learn.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-blobs-upload#upload-a-file
32
+ RUN cd /tmp && \
33
+ wget https://azcopyvnext.azureedge.net/release20230123/azcopy_linux_amd64_10.17.0.tar.gz && \
34
+ tar xvf azcopy_linux_amd64_10.17.0.tar.gz && \
35
+ mkdir -p ~/.local/bin && \
36
+ mv azcopy_linux_amd64_10.17.0/azcopy ~/.local/bin && \
37
+ chmod +x ~/.local/bin/azcopy && \
38
+ rm -rf azcopy_linux_amd64*
39
+
40
+ # Setup conda
41
+ RUN cd /tmp && \
42
+ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
43
+ bash ./Miniconda3-latest-Linux-x86_64.sh -b && \
44
+ rm ./Miniconda3-latest-Linux-x86_64.sh
45
+
46
+ # Install dotnet
47
+ RUN cd /tmp && \
48
+ wget https://dot.net/v1/dotnet-install.sh && \
49
+ chmod +x dotnet-install.sh && \
50
+ ./dotnet-install.sh --channel 7.0 && \
51
+ ./dotnet-install.sh --channel 3.1 && \
52
+ rm ./dotnet-install.sh
53
+
LLaVA/.devcontainer/devcontainer.env ADDED
@@ -0,0 +1,2 @@
1
+ SAMPLE_ENV_VAR1="Sample Value"
2
+ SAMPLE_ENV_VAR2=332431bf-68bf
LLaVA/.devcontainer/devcontainer.json ADDED
@@ -0,0 +1,71 @@
1
+ {
2
+ "name": "LLaVA",
3
+ "build": {
4
+ "dockerfile": "Dockerfile",
5
+ "context": "..",
6
+ "args": {}
7
+ },
8
+ "features": {
9
+ "ghcr.io/devcontainers/features/docker-in-docker:2": {},
10
+ "ghcr.io/devcontainers/features/azure-cli:1": {},
11
+ "ghcr.io/azure/azure-dev/azd:0": {},
12
+ "ghcr.io/devcontainers/features/powershell:1": {},
13
+ "ghcr.io/devcontainers/features/common-utils:2": {},
14
+ "ghcr.io/devcontainers-contrib/features/zsh-plugins:0": {},
15
+ },
16
+ // "forwardPorts": [],
17
+ "postCreateCommand": "bash ./.devcontainer/postCreateCommand.sh",
18
+ "customizations": {
19
+ "vscode": {
20
+ "settings": {
21
+ "python.analysis.autoImportCompletions": true,
22
+ "python.analysis.autoImportUserSymbols": true,
23
+ "python.defaultInterpreterPath": "~/miniconda3/envs/llava/bin/python",
24
+ "python.formatting.provider": "yapf",
25
+ "python.linting.enabled": true,
26
+ "python.linting.flake8Enabled": true,
27
+ "isort.check": true,
28
+ "dev.containers.copyGitConfig": true,
29
+ "terminal.integrated.defaultProfile.linux": "zsh",
30
+ "terminal.integrated.profiles.linux": {
31
+ "zsh": {
32
+ "path": "/usr/bin/zsh"
33
+ },
34
+ }
35
+ },
36
+ "extensions": [
37
+ "aaron-bond.better-comments",
38
+ "eamodio.gitlens",
39
+ "EditorConfig.EditorConfig",
40
+ "foxundermoon.shell-format",
41
+ "GitHub.copilot-chat",
42
+ "GitHub.copilot-labs",
43
+ "GitHub.copilot",
44
+ "lehoanganh298.json-lines-viewer",
45
+ "mhutchie.git-graph",
46
+ "ms-azuretools.vscode-docker",
47
+ "ms-dotnettools.dotnet-interactive-vscode",
48
+ "ms-python.flake8",
49
+ "ms-python.isort",
50
+ "ms-python.python",
51
+ "ms-python.vscode-pylance",
52
+ "njpwerner.autodocstring",
53
+ "redhat.vscode-yaml",
54
+ "stkb.rewrap",
55
+ "yzhang.markdown-all-in-one",
56
+ ]
57
+ }
58
+ },
59
+ "mounts": [],
60
+ "runArgs": [
61
+ "--gpus",
62
+ "all",
63
+ // "--ipc",
64
+ // "host",
65
+ "--ulimit",
66
+ "memlock=-1",
67
+ "--env-file",
68
+ ".devcontainer/devcontainer.env"
69
+ ],
70
+ // "remoteUser": "root"
71
+ }
LLaVA/.devcontainer/postCreateCommand.sh ADDED
@@ -0,0 +1,45 @@
1
+ git config --global safe.directory '*'
2
+ git config --global core.editor "code --wait"
3
+ git config --global pager.branch false
4
+
5
+ # Set AZCOPY concurrency to auto
6
+ echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.zshrc
7
+ echo "export AZCOPY_CONCURRENCY_VALUE=AUTO" >> ~/.bashrc
8
+
9
+ # Activate conda by default
10
+ echo ". /home/vscode/miniconda3/bin/activate" >> ~/.zshrc
11
+ echo ". /home/vscode/miniconda3/bin/activate" >> ~/.bashrc
12
+
13
+ # Use llava environment by default
14
+ echo "conda activate llava" >> ~/.zshrc
15
+ echo "conda activate llava" >> ~/.bashrc
16
+
17
+ # Add dotnet to PATH
18
+ echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.bashrc
19
+ echo 'export PATH="$PATH:$HOME/.dotnet"' >> ~/.zshrc
20
+
21
+ # Create and activate llava environment
22
+ source /home/vscode/miniconda3/bin/activate
23
+ conda create -y -q -n llava python=3.10
24
+ conda activate llava
25
+
26
+ # Install Nvidia Cuda Compiler
27
+ conda install -y -c nvidia cuda-compiler
28
+
29
+ pip install pre-commit==3.0.2
30
+
31
+ # Install package locally
32
+ pip install --upgrade pip # enable PEP 660 support
33
+ pip install -e .
34
+
35
+ # Install additional packages for training
36
+ pip install -e ".[train]"
37
+ pip install flash-attn --no-build-isolation
38
+
39
+ # Download checkpoints to location outside of the repo
40
+ git clone https://huggingface.co/liuhaotian/llava-v1.5-7b ~/llava-v1.5-7b
41
+
42
+ # Commented because it is unlikely for users to have enough local GPU memory to load the model
43
+ # git clone https://huggingface.co/liuhaotian/llava-v1.5-13b ~/llava-v1.5-13b
44
+
45
+ echo "postCreateCommand.sh COMPLETE!"
LLaVA/docs/Evaluation.md ADDED
@@ -0,0 +1,167 @@
1
+ # Evaluation
2
+
3
+ In LLaVA-1.5, we evaluate models on a diverse set of 12 benchmarks. To ensure reproducibility, we evaluate the models with greedy decoding. We do not use beam search, so that the inference process stays consistent with the real-time chat demo.
4
+
5
+ Currently, we mostly utilize the official toolkit or server for the evaluation.
6
+
7
+ ## Evaluate on Custom Datasets
8
+
9
+ You can evaluate LLaVA on your custom datasets by converting your dataset to LLaVA's jsonl format and evaluating it with [`model_vqa.py`](https://github.com/haotian-liu/LLaVA/blob/main/llava/eval/model_vqa.py).
10
+
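For orientation only, here is a minimal, hypothetical sketch of producing such a jsonl question file. The field names (`question_id`, `image`, `text`) are assumptions based on the common LLaVA question-file layout and should be verified against `model_vqa.py` before use.

```python
import json

# Hypothetical example: each line of the question file is one JSON object.
# Field names are assumptions; check model_vqa.py for the exact schema it expects.
records = [
    {"question_id": 0, "image": "images/0001.jpg", "text": "What is shown in this image?"},
    {"question_id": 1, "image": "images/0002.jpg", "text": "How many people are visible?"},
]
with open("questions.jsonl", "w") as f:
    for rec in records:
        f.write(json.dumps(rec) + "\n")
```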
11
+ Below we provide a general guideline for evaluating datasets with some common formats.
12
+
13
+ 1. Short-answer (e.g. VQAv2, MME).
14
+
15
+ ```
16
+ <question>
17
+ Answer the question using a single word or phrase.
18
+ ```
19
+
20
+ 2. Option-only for multiple-choice (e.g. MMBench, SEED-Bench).
21
+
22
+ ```
23
+ <question>
24
+ A. <option_1>
25
+ B. <option_2>
26
+ C. <option_3>
27
+ D. <option_4>
28
+ Answer with the option's letter from the given choices directly.
29
+ ```
30
+
31
+ 3. Natural QA (e.g. LLaVA-Bench, MM-Vet).
32
+
33
+ No postprocessing is needed.
34
+
35
+ ## Scripts
36
+
37
+ Before preparing task-specific data, **you MUST first download [eval.zip](https://drive.google.com/file/d/1atZSBBrAX54yYpxtVVW33zFvcnaHeFPy/view?usp=sharing)**. It contains custom annotations, scripts, and the prediction files from LLaVA v1.5. Extract it to `./playground/data/eval`; this also provides the general directory structure for all datasets.
38
+
39
+ ### VQAv2
40
+
41
+ 1. Download [`test2015`](http://images.cocodataset.org/zips/test2015.zip) and put it under `./playground/data/eval/vqav2`.
42
+ 2. Multi-GPU inference.
43
+ ```Shell
44
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/vqav2.sh
45
+ ```
46
+ 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/830/my-submission): `./playground/data/eval/vqav2/answers_upload`.
47
+
48
+ ### GQA
49
+
50
+ 1. Download the [data](https://cs.stanford.edu/people/dorarad/gqa/download.html) and [evaluation scripts](https://cs.stanford.edu/people/dorarad/gqa/evaluate.html) following the official instructions and put them under `./playground/data/eval/gqa/data`. You may need to modify `eval.py` as shown in [this gist](https://gist.github.com/haotian-liu/db6eddc2a984b4cbcc8a7f26fd523187) due to missing assets in the GQA v1.2 release.
51
+ 2. Multi-GPU inference.
52
+ ```Shell
53
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/gqa.sh
54
+ ```
55
+
56
+ ### VisWiz
57
+
58
+ 1. Download [`test.json`](https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip) and extract [`test.zip`](https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip) to `test`. Put them under `./playground/data/eval/vizwiz`.
59
+ 2. Single-GPU inference.
60
+ ```Shell
61
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/vizwiz.sh
62
+ ```
63
+ 3. Submit the results to the [evaluation server](https://eval.ai/web/challenges/challenge-page/2185/my-submission): `./playground/data/eval/vizwiz/answers_upload`.
64
+
65
+ ### ScienceQA
66
+
67
+ 1. Under `./playground/data/eval/scienceqa`, download `images`, `pid_splits.json`, `problems.json` from the `data/scienceqa` folder of the ScienceQA [repo](https://github.com/lupantech/ScienceQA).
68
+ 2. Single-GPU inference and evaluation.
69
+ ```Shell
70
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/sqa.sh
71
+ ```
72
+
73
+ ### TextVQA
74
+
75
+ 1. Download [`TextVQA_0.5.1_val.json`](https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json) and [images](https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip) and extract to `./playground/data/eval/textvqa`.
76
+ 2. Single-GPU inference and evaluation.
77
+ ```Shell
78
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/textvqa.sh
79
+ ```
80
+
81
+ ### POPE
82
+
83
+ 1. Download `coco` from [POPE](https://github.com/AoiDragon/POPE/tree/e3e39262c85a6a83f26cf5094022a782cb0df58d/output/coco) and put under `./playground/data/eval/pope`.
84
+ 2. Single-GPU inference and evaluation.
85
+ ```Shell
86
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/pope.sh
87
+ ```
88
+
89
+ ### MME
90
+
91
+ 1. Download the data following the official instructions [here](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation).
92
+ 2. Put the downloaded images under `MME_Benchmark_release_version`.
93
+ 3. Put the official `eval_tool` and `MME_Benchmark_release_version` under `./playground/data/eval/MME`.
94
+ 4. Single-GPU inference and evaluation.
95
+ ```Shell
96
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mme.sh
97
+ ```
98
+
99
+ ### MMBench
100
+
101
+ 1. Download [`mmbench_dev_20230712.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_20230712.tsv) and put under `./playground/data/eval/mmbench`.
102
+ 2. Single-GPU inference.
103
+ ```Shell
104
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench.sh
105
+ ```
106
+ 3. Submit the results to the [evaluation server](https://opencompass.org.cn/leaderboard-multimodal): `./playground/data/eval/mmbench/answers_upload/mmbench_dev_20230712`.
107
+
108
+ ### MMBench-CN
109
+
110
+ 1. Download [`mmbench_dev_cn_20231003.tsv`](https://download.openmmlab.com/mmclassification/datasets/mmbench/mmbench_dev_cn_20231003.tsv) and put under `./playground/data/eval/mmbench`.
111
+ 2. Single-GPU inference.
112
+ ```Shell
113
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmbench_cn.sh
114
+ ```
115
+ 3. Submit the results to the evaluation server: `./playground/data/eval/mmbench/answers_upload/mmbench_dev_cn_20231003`.
116
+
117
+
118
+ ### SEED-Bench
119
+
120
+ 1. Follow the official [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md) to download the images and videos. Put the images under `./playground/data/eval/seed_bench/SEED-Bench-image`.
121
+ 2. Extract the middle frame from each downloaded video and put the frames under `./playground/data/eval/seed_bench/SEED-Bench-video-image`. We provide our script `extract_video_frames.py`, modified from the official one (a minimal sketch of the middle-frame idea follows this list).
122
+ 3. Multi-GPU inference and evaluation.
123
+ ```Shell
124
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash scripts/v1_5/eval/seed.sh
125
+ ```
126
+ 4. Optionally, submit the results to the leaderboard: `./playground/data/eval/seed_bench/answers_upload` using the official jupyter notebook.
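As referenced in step 2 above, here is a minimal sketch of the middle-frame idea, assuming OpenCV (`opencv-python`) is available; the repository's own `extract_video_frames.py` remains the authoritative script.

```python
import cv2  # assumes opencv-python is installed

def extract_middle_frame(video_path: str, out_path: str) -> None:
    """Save the frame at the midpoint of the video as an image."""
    cap = cv2.VideoCapture(video_path)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count // 2)  # seek to the middle frame
    ok, frame = cap.read()
    cap.release()
    if ok:
        cv2.imwrite(out_path, frame)

# Example usage with placeholder file names.
extract_middle_frame("example_video.mp4", "example_video_middle_frame.png")
```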
127
+
128
+ ### LLaVA-Bench-in-the-Wild
129
+
130
+ 1. Extract contents of [`llava-bench-in-the-wild`](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) to `./playground/data/eval/llava-bench-in-the-wild`.
131
+ 2. Single-GPU inference and evaluation.
132
+ ```Shell
133
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/llavabench.sh
134
+ ```
135
+
136
+ ### MM-Vet
137
+
138
+ 1. Extract [`mm-vet.zip`](https://github.com/yuweihao/MM-Vet/releases/download/v1/mm-vet.zip) to `./playground/data/eval/mmvet`.
139
+ 2. Single-GPU inference.
140
+ ```Shell
141
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh
142
+ ```
143
+ 3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook.
144
+
145
+ ## More Benchmarks
146
+
147
+ Below are awesome benchmarks for multimodal understanding from the research community that were not included in the initial LLaVA-1.5 release.
148
+
149
+ ### Q-Bench
150
+
151
+ 1. Download [`llvisionqa_dev.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_dev.json) (for `dev`-subset) and [`llvisionqa_test.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/llvisionqa_test.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`.
152
+ 2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`.
153
+ 3. Single-GPU inference (change `dev` to `test` for evaluation on test set).
154
+ ```Shell
155
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench.sh dev
156
+ ```
157
+ 4. Submit the results following the instructions [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_dev_answers.jsonl`.
158
+
159
+ ### Chinese-Q-Bench
160
+
161
+ 1. Download [`质衡-问答-验证集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E9%AA%8C%E8%AF%81%E9%9B%86.json) (for `dev`-subset) and [`质衡-问答-测试集.json`](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/%E8%B4%A8%E8%A1%A1-%E9%97%AE%E7%AD%94-%E6%B5%8B%E8%AF%95%E9%9B%86.json) (for `test`-subset). Put them under `./playground/data/eval/qbench`.
162
+ 2. Download and extract [images](https://huggingface.co/datasets/nanyangtu/LLVisionQA-QBench/resolve/main/images_llvisionqa.tar) and put all the images directly under `./playground/data/eval/qbench/images_llviqionqa`.
163
+ 3. Single-GPU inference (change `dev` to `test` for evaluation on test set).
164
+ ```Shell
165
+ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/qbench_zh.sh dev
166
+ ```
167
+ 4. Submit the results following the instructions [here](https://github.com/VQAssessment/Q-Bench#option-1-submit-results): `./playground/data/eval/qbench/llvisionqa_zh_dev_answers.jsonl`.
LLaVA/scripts/convert_sqa_to_llava_base_prompt.py ADDED
@@ -0,0 +1,334 @@
1
+ def get_question_text(problem):
2
+ question = problem['question']
3
+ return question
4
+
5
+
6
+ def get_context_text(problem, use_caption):
7
+ txt_context = problem['hint']
8
+ img_context = problem['caption'] if use_caption else ""
9
+ context = " ".join([txt_context, img_context]).strip()
10
+ if context == "":
11
+ context = "N/A"
12
+ return context
13
+
14
+
15
+ def get_choice_text(probelm, options):
16
+ choices = probelm['choices']
17
+ choice_list = []
18
+ for i, c in enumerate(choices):
19
+ choice_list.append("({}) {}".format(options[i], c))
20
+ choice_txt = " ".join(choice_list)
21
+ #print(choice_txt)
22
+ return choice_txt
23
+
24
+
25
+ def get_answer(problem, options):
26
+ return options[problem['answer']]
27
+
28
+
29
+ def get_lecture_text(problem):
30
+ # \\n: GPT-3 can generate the lecture with more tokens.
31
+ lecture = problem['lecture'].replace("\n", "\\n")
32
+ return lecture
33
+
34
+
35
+ def get_solution_text(problem):
36
+ # \\n: GPT-3 can generate the solution with more tokens
37
+ solution = problem['solution'].replace("\n", "\\n")
38
+ return solution
39
+
40
+
41
+ def create_one_example_chatbot(format, question, context, choice, answer, lecture, solution, test_example=True):
42
+
43
+ input_format, output_format = format.split("-")
44
+
45
+ ## Inputs
46
+ if input_format == "CQM":
47
+ input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n"
48
+ elif input_format == "QCM":
49
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"
50
+ # upper bound experiment
51
+ elif input_format == "QCML":
52
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n"
53
+ elif input_format == "QCME":
54
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n"
55
+ elif input_format == "QCMLE":
56
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n"
57
+
58
+ elif input_format == "QCLM":
59
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n"
60
+ elif input_format == "QCEM":
61
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n"
62
+ elif input_format == "QCLEM":
63
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n"
64
+
65
+ # Outputs
66
+ if test_example:
67
+ output = "Answer:"
68
+ elif output_format == 'A':
69
+ output = f"Answer: The answer is {answer}."
70
+
71
+ elif output_format == 'AL':
72
+ output = f"Answer: The answer is {answer}. BECAUSE: {solution}"
73
+ elif output_format == 'AE':
74
+ output = f"Answer: The answer is {answer}. BECAUSE: {lecture}"
75
+ elif output_format == 'ALE':
76
+ output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}"
77
+ elif output_format == 'AEL':
78
+ output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}"
79
+
80
+ elif output_format == 'LA':
81
+ output = f"Answer: {lecture} The answer is {answer}."
82
+ elif output_format == 'EA':
83
+ output = f"Answer: {solution} The answer is {answer}."
84
+ elif output_format == 'LEA':
85
+ output = f"Answer: {lecture} {solution} The answer is {answer}."
86
+ elif output_format == 'ELA':
87
+ output = f"Answer: {solution} {lecture} The answer is {answer}."
88
+ elif output_format == 'LEPA':
89
+ output = ''
90
+ if len(lecture.strip()) > 0:
91
+ output += f"LECTURE: {lecture}\n"
92
+ if len(solution.strip()) > 0:
93
+ output += f"SOLUTION: {solution}\n"
94
+ output += '###\n'
95
+ output += f"ANSWER: {answer}."
96
+
97
+ input = input.replace(" ", " ").strip()
98
+ output = output.replace(" ", " ").strip()
99
+ if input.endswith("BECAUSE:"):
100
+ input = input.replace("BECAUSE:", "").strip()
101
+ if output.endswith("BECAUSE:"):
102
+ output = output.replace("BECAUSE:", "").strip()
103
+ return input, output
104
+
105
+
106
+ def create_one_example(format, question, context, choice, answer, lecture, solution, test_example=True):
107
+
108
+ input_format, output_format = format.split("-")
109
+
110
+ ## Inputs
111
+ if input_format == "CQM":
112
+ input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n"
113
+ elif input_format == "QCM":
114
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"
115
+ # upper bound experiment
116
+ elif input_format == "QCML":
117
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n"
118
+ elif input_format == "QCME":
119
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n"
120
+ elif input_format == "QCMLE":
121
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n"
122
+
123
+ elif input_format == "QCLM":
124
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n"
125
+ elif input_format == "QCEM":
126
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n"
127
+ elif input_format == "QCLEM":
128
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n"
129
+
130
+ # Outputs
131
+ if test_example:
132
+ output = "Answer:"
133
+ elif output_format == 'A':
134
+ output = f"Answer: The answer is {answer}."
135
+
136
+ elif output_format == 'AL':
137
+ output = f"Answer: The answer is {answer}. BECAUSE: {solution}"
138
+ elif output_format == 'AE':
139
+ output = f"Answer: The answer is {answer}. BECAUSE: {lecture}"
140
+ elif output_format == 'ALE':
141
+ output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}"
142
+ elif output_format == 'AEL':
143
+ output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}"
144
+
145
+ elif output_format == 'LA':
146
+ output = f"Answer: {lecture} The answer is {answer}."
147
+ elif output_format == 'EA':
148
+ output = f"Answer: {solution} The answer is {answer}."
149
+ elif output_format == 'LEA':
150
+ output = f"Answer: {lecture} {solution} The answer is {answer}."
151
+ elif output_format == 'ELA':
152
+ output = f"Answer: {solution} {lecture} The answer is {answer}."
153
+
154
+ text = input + output
155
+ text = text.replace(" ", " ").strip()
156
+ if text.endswith("BECAUSE:"):
157
+ text = text.replace("BECAUSE:", "").strip()
158
+ return text
159
+
160
+
161
+
162
+ def create_one_example_gpt4(format, question, context, choice, answer, lecture, solution, test_example=True):
163
+
164
+ input_format, output_format = format.split("-")
165
+
166
+ ## Inputs
167
+ if input_format == "CQM":
168
+ input = f"Context: {context}\nQuestion: {question}\nOptions: {choice}\n"
169
+ elif input_format == "QCM":
170
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"
171
+ # upper bound experiment
172
+ elif input_format == "QCML":
173
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture}\n"
174
+ elif input_format == "QCME":
175
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {solution}\n"
176
+ elif input_format == "QCMLE":
177
+ input = f"Question: {question}\nContext: {context}\nOptions: {choice}\nBECAUSE: {lecture} {solution}\n"
178
+
179
+ elif input_format == "QCLM":
180
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture}\nOptions: {choice}\n"
181
+ elif input_format == "QCEM":
182
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {solution}\nOptions: {choice}\n"
183
+ elif input_format == "QCLEM":
184
+ input = f"Question: {question}\nContext: {context}\nBECAUSE: {lecture} {solution}\nOptions: {choice}\n"
185
+
186
+ # Outputs
187
+ if test_example:
188
+ output = "Answer:"
189
+ elif output_format == 'A':
190
+ output = f"Answer: The answer is {answer}."
191
+
192
+ elif output_format == 'AL':
193
+ output = f"Answer: The answer is {answer}. BECAUSE: {solution}"
194
+ elif output_format == 'AE':
195
+ output = f"Answer: The answer is {answer}. BECAUSE: {lecture}"
196
+ elif output_format == 'ALE':
197
+ output = f"Answer: The answer is {answer}. BECAUSE: {lecture} {solution}"
198
+ elif output_format == 'AEL':
199
+ output = f"Answer: The answer is {answer}. BECAUSE: {solution} {lecture}"
200
+
201
+ elif output_format == 'LA':
202
+ output = f"Answer: {lecture} The answer is {answer}."
203
+ elif output_format == 'EA':
204
+ output = f"Answer: {solution} The answer is {answer}."
205
+ elif output_format == 'LEA':
206
+ output = f"Answer: {lecture} {solution} The answer is {answer}."
207
+ elif output_format == 'ELA':
208
+ output = f"Answer: {solution} {lecture} The answer is {answer}."
209
+
210
+ input = input.replace(" ", " ").strip()
211
+ output = output.replace(" ", " ").strip()
212
+ if output.endswith("BECAUSE:"):
213
+ output = output.replace("BECAUSE:", "").strip()
214
+
215
+ user_prompt = {"role": "user", "content": f"Can you explain {input}?"}
216
+ assistant_prompt = {"role": "assistant", "content": f"{output}"}
217
+
218
+ return user_prompt, assistant_prompt
219
+
220
+
221
+ def build_prompt_chatbot(problems, shot_qids, prompt_format, use_caption=False, options=["A", "B", "C", "D", "E"], is_test=False):
222
+ examples = {}
223
+
224
+ for qid in shot_qids:
225
+ question = get_question_text(problems[qid])
226
+ context = get_context_text(problems[qid], use_caption)
227
+ choice = get_choice_text(problems[qid], options)
228
+ answer = get_answer(problems[qid], options)
229
+ lecture = get_lecture_text(problems[qid]).replace('\\n', '\n')
230
+ solution = get_solution_text(problems[qid]).replace('\\n', '\n')
231
+
232
+ train_example = create_one_example_chatbot(prompt_format,
233
+ question,
234
+ context,
235
+ choice,
236
+ answer,
237
+ lecture,
238
+ solution,
239
+ test_example=is_test)
240
+ examples[qid] = train_example
241
+ return examples
242
+
243
+
244
+ def build_prompt(problems, shot_qids, test_qid, args):
245
+
246
+ examples = []
247
+
248
+ # n-shot training examples
249
+ for qid in shot_qids:
250
+ question = get_question_text(problems[qid])
251
+ context = get_context_text(problems[qid], args.use_caption)
252
+ choice = get_choice_text(problems[qid], args.options)
253
+ answer = get_answer(problems[qid], args.options)
254
+ lecture = get_lecture_text(problems[qid])
255
+ solution = get_solution_text(problems[qid])
256
+
257
+ train_example = create_one_example(args.prompt_format,
258
+ question,
259
+ context,
260
+ choice,
261
+ answer,
262
+ lecture,
263
+ solution,
264
+ test_example=False)
265
+ examples.append(train_example)
266
+
267
+ # test example
268
+ question = get_question_text(problems[test_qid])
269
+ context = get_context_text(problems[test_qid], args.use_caption)
270
+ choice = get_choice_text(problems[test_qid], args.options)
271
+ answer = get_answer(problems[test_qid], args.options)
272
+ lecture = get_lecture_text(problems[test_qid])
273
+ solution = get_solution_text(problems[test_qid])
274
+
275
+ test_example = create_one_example(args.prompt_format,
276
+ question,
277
+ context,
278
+ choice,
279
+ answer,
280
+ lecture,
281
+ solution,
282
+ test_example=True)
283
+ examples.append(test_example)
284
+
285
+ # create the prompt input
286
+ prompt_input = '\n\n'.join(examples)
287
+
288
+ return prompt_input
289
+
290
+
291
+ def build_prompt_gpt4(problems, shot_qids, test_qid, args):
292
+
293
+ prompt_array = [{"role": "system", "content": "You are a helpful assistant."}]
294
+
295
+ # n-shot training examples
296
+ for qid in shot_qids:
297
+ question = get_question_text(problems[qid])
298
+ context = get_context_text(problems[qid], args.use_caption)
299
+ choice = get_choice_text(problems[qid], args.options)
300
+ answer = get_answer(problems[qid], args.options)
301
+ lecture = get_lecture_text(problems[qid])
302
+ solution = get_solution_text(problems[qid])
303
+
304
+ user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format,
305
+ question,
306
+ context,
307
+ choice,
308
+ answer,
309
+ lecture,
310
+ solution,
311
+ test_example=False)
312
+ prompt_array.append(user_prompt)
313
+ prompt_array.append(assistant_prompt)
314
+
315
+ # test example
316
+ question = get_question_text(problems[test_qid])
317
+ context = get_context_text(problems[test_qid], args.use_caption)
318
+ choice = get_choice_text(problems[test_qid], args.options)
319
+ answer = get_answer(problems[test_qid], args.options)
320
+ lecture = get_lecture_text(problems[test_qid])
321
+ solution = get_solution_text(problems[test_qid])
322
+
323
+ user_prompt, assistant_prompt = create_one_example_gpt4(args.prompt_format,
324
+ question,
325
+ context,
326
+ choice,
327
+ answer,
328
+ lecture,
329
+ solution,
330
+ test_example=True)
331
+ prompt_array.append(user_prompt)
332
+ prompt_array.append(assistant_prompt)
333
+
334
+ return prompt_array
LLaVA/scripts/finetune_qlora.sh ADDED
@@ -0,0 +1,50 @@
1
+ #!/bin/bash
2
+
3
+ # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!
4
+
5
+ # Uncomment and set the following variables correspondingly to run this script:
6
+
7
+ ################## VICUNA ##################
8
+ # PROMPT_VERSION=v1
9
+ # MODEL_VERSION="vicuna-v1-3-7b"
10
+ ################## VICUNA ##################
11
+
12
+ ################## LLaMA-2 ##################
13
+ # PROMPT_VERSION="llava_llama_2"
14
+ # MODEL_VERSION="llama-2-7b-chat"
15
+ ################## LLaMA-2 ##################
16
+
17
+ deepspeed llava/train/train_mem.py \
18
+ --deepspeed ./scripts/zero2.json \
19
+ --lora_enable True \
20
+ --bits 4 \
21
+ --model_name_or_path ./checkpoints/$MODEL_VERSION \
22
+ --version $PROMPT_VERSION \
23
+ --data_path ./playground/data/llava_instruct_80k.json \
24
+ --image_folder /path/to/coco/train2017 \
25
+ --vision_tower openai/clip-vit-large-patch14 \
26
+ --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
27
+ --mm_vision_select_layer -2 \
28
+ --mm_use_im_start_end False \
29
+ --mm_use_im_patch_token False \
30
+ --bf16 True \
31
+ --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
32
+ --num_train_epochs 1 \
33
+ --per_device_train_batch_size 16 \
34
+ --per_device_eval_batch_size 4 \
35
+ --gradient_accumulation_steps 1 \
36
+ --evaluation_strategy "no" \
37
+ --save_strategy "steps" \
38
+ --save_steps 50000 \
39
+ --save_total_limit 1 \
40
+ --learning_rate 2e-5 \
41
+ --weight_decay 0. \
42
+ --warmup_ratio 0.03 \
43
+ --lr_scheduler_type "cosine" \
44
+ --logging_steps 1 \
45
+ --tf32 True \
46
+ --model_max_length 2048 \
47
+ --gradient_checkpointing True \
48
+ --lazy_preprocess True \
49
+ --dataloader_num_workers 4 \
50
+ --report_to wandb
LLaVA/scripts/pretrain.sh ADDED
@@ -0,0 +1,46 @@
1
+ #!/bin/bash
2
+
3
+ # IMPORTANT: this is the training script for the original LLaVA, NOT FOR LLaVA V1.5!
4
+
5
+ # Uncomment and set the following variables correspondingly to run this script:
6
+
7
+ # MODEL_VERSION=vicuna-v1-3-7b
8
+ # MODEL_VERSION=llama-2-7b-chat
9
+
10
+ ########### DO NOT CHANGE ###########
11
+ ########### USE THIS FOR BOTH ###########
12
+ PROMPT_VERSION=plain
13
+ ########### DO NOT CHANGE ###########
14
+
15
+ deepspeed llava/train/train_mem.py \
16
+ --deepspeed ./scripts/zero2.json \
17
+ --model_name_or_path ./checkpoints/$MODEL_VERSION \
18
+ --version $PROMPT_VERSION \
19
+ --data_path /path/to/pretrain_data.json \
20
+ --image_folder /path/to/images \
21
+ --vision_tower openai/clip-vit-large-patch14 \
22
+ --tune_mm_mlp_adapter True \
23
+ --mm_vision_select_layer -2 \
24
+ --mm_use_im_start_end False \
25
+ --mm_use_im_patch_token False \
26
+ --bf16 True \
27
+ --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \
28
+ --num_train_epochs 1 \
29
+ --per_device_train_batch_size 16 \
30
+ --per_device_eval_batch_size 4 \
31
+ --gradient_accumulation_steps 1 \
32
+ --evaluation_strategy "no" \
33
+ --save_strategy "steps" \
34
+ --save_steps 24000 \
35
+ --save_total_limit 1 \
36
+ --learning_rate 2e-3 \
37
+ --weight_decay 0. \
38
+ --warmup_ratio 0.03 \
39
+ --lr_scheduler_type "cosine" \
40
+ --logging_steps 1 \
41
+ --tf32 True \
42
+ --model_max_length 2048 \
43
+ --gradient_checkpointing True \
44
+ --dataloader_num_workers 4 \
45
+ --lazy_preprocess True \
46
+ --report_to wandb
LLaVA/scripts/zero2.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "fp16": {
3
+ "enabled": "auto",
4
+ "loss_scale": 0,
5
+ "loss_scale_window": 1000,
6
+ "initial_scale_power": 16,
7
+ "hysteresis": 2,
8
+ "min_loss_scale": 1
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "train_micro_batch_size_per_gpu": "auto",
14
+ "train_batch_size": "auto",
15
+ "gradient_accumulation_steps": "auto",
16
+ "zero_optimization": {
17
+ "stage": 2,
18
+ "overlap_comm": true,
19
+ "contiguous_gradients": true,
20
+ "sub_group_size": 1e9,
21
+ "reduce_bucket_size": "auto"
22
+ }
23
+ }
sglang/.github/ISSUE_TEMPLATE/2-feature-request.yml ADDED
@@ -0,0 +1,23 @@
1
+ name: 🚀 Feature request
2
+ description: Suggest an idea for this project
3
+ title: "[Feature] "
4
+
5
+ body:
6
+ - type: checkboxes
7
+ attributes:
8
+ label: Checklist
9
+ options:
10
+ - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/sgl-project/sglang/discussions/new/choose. Otherwise, it will be closed.
11
+ - label: 2. Please use English, otherwise it will be closed.
12
+ - type: textarea
13
+ attributes:
14
+ label: Motivation
15
+ description: |
16
+ A clear and concise description of the motivation of the feature.
17
+ validations:
18
+ required: true
19
+ - type: textarea
20
+ attributes:
21
+ label: Related resources
22
+ description: |
23
+ If there is an official code release or third-party implementations, please also provide the information here, which would be very helpful.
sglang/.github/workflows/close-inactive-issues.yml ADDED
@@ -0,0 +1,96 @@
1
+ name: Close Inactive Issues
2
+
3
+ on:
4
+ schedule:
5
+ - cron: '0 0 * * *'
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ issues: write
10
+ contents: read
11
+
12
+ jobs:
13
+ close-inactive-issues:
14
+ if: github.repository == 'sgl-project/sglang'
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - name: Check and close inactive issues
18
+ uses: actions/github-script@v6
19
+ with:
20
+ github-token: ${{secrets.GITHUB_TOKEN}}
21
+ script: |
22
+ const sixtyDaysAgo = new Date(Date.now() - 60 * 24 * 60 * 60 * 1000);
23
+
24
+ const [owner, repo] = process.env.GITHUB_REPOSITORY.split('/');
25
+ console.log(`Owner: ${owner}, Repo: ${repo}`);
26
+
27
+ async function fetchIssues(page = 1) {
28
+ console.log(`Fetching issues for ${owner}/${repo}, page ${page}`);
29
+ return await github.rest.issues.listForRepo({
30
+ owner,
31
+ repo,
32
+ state: 'open',
33
+ sort: 'updated',
34
+ direction: 'asc',
35
+ per_page: 100,
36
+ page: page
37
+ });
38
+ }
39
+
40
+ async function processIssues() {
41
+ console.log('Starting to process issues');
42
+ console.log(`Repository: ${owner}/${repo}`);
43
+
44
+ let page = 1;
45
+ let hasMoreIssues = true;
46
+ while (hasMoreIssues) {
47
+ try {
48
+ const issues = await fetchIssues(page);
49
+ console.log(`Fetched ${issues.data.length} issues on page ${page}`);
50
+
51
+ if (issues.data.length === 0) {
52
+ hasMoreIssues = false;
53
+ break;
54
+ }
55
+
56
+ for (const issue of issues.data) {
57
+ // Skip if the issue has 'good first issue' label
58
+ if (issue.labels.some(label => label.name === 'good first issue')) {
59
+ console.log(`Skipping issue #${issue.number} as it's marked as 'good first issue'`);
60
+ continue;
61
+ }
62
+ if (new Date(issue.updated_at) < sixtyDaysAgo) {
63
+ try {
64
+ await github.rest.issues.update({
65
+ owner,
66
+ repo,
67
+ issue_number: issue.number,
68
+ state: 'closed',
69
+ labels: [...issue.labels.map(l => l.name), 'inactive']
70
+ });
71
+ await github.rest.issues.createComment({
72
+ owner,
73
+ repo,
74
+ issue_number: issue.number,
75
+ body: 'This issue has been automatically closed due to inactivity. Please feel free to reopen it if needed.'
76
+ });
77
+ console.log(`Closed issue #${issue.number} due to inactivity.`);
78
+ } catch (error) {
79
+ console.error(`Failed to close issue #${issue.number}: ${error.message}`);
80
+ }
81
+ } else {
82
+ console.log(`Issue #${issue.number} is still active. Stopping processing.`);
83
+ hasMoreIssues = false;
84
+ break;
85
+ }
86
+ }
87
+ page += 1;
88
+ } catch (error) {
89
+ console.error(`Error fetching issues on page ${page}: ${error.message}`);
90
+ hasMoreIssues = false;
91
+ }
92
+ }
93
+ console.log('Finished processing issues');
94
+ }
95
+
96
+ await processIssues();
sglang/.github/workflows/execute-notebook.yml ADDED
@@ -0,0 +1,49 @@
1
+ name: Execute Notebooks
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ paths:
7
+ - "python/sglang/**"
8
+ - "docs/**"
9
+ pull_request:
10
+ branches: [ main ]
11
+ paths:
12
+ - "python/sglang/**"
13
+ - "docs/**"
14
+ workflow_dispatch:
15
+
16
+
17
+ concurrency:
18
+ group: execute-notebook-${{ github.ref }}
19
+ cancel-in-progress: true
20
+
21
+
22
+ jobs:
23
+ run-all-notebooks:
24
+ runs-on: 1-gpu-runner
25
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
26
+ steps:
27
+ - name: Checkout code
28
+ uses: actions/checkout@v3
29
+
30
+ - name: Set up Python
31
+ uses: actions/setup-python@v4
32
+ with:
33
+ python-version: '3.9'
34
+
35
+ - name: Install dependencies
36
+ run: |
37
+ bash scripts/ci_install_dependency.sh
38
+ pip install -r docs/requirements.txt
39
+
40
+ - name: Setup Jupyter Kernel
41
+ run: |
42
+ python -m ipykernel install --user --name python3 --display-name "Python 3"
43
+
44
+ - name: Execute notebooks
45
+ timeout-minutes: 30
46
+ run: |
47
+ cd docs
48
+ make clean
49
+ make compile
sglang/.github/workflows/lint.yml ADDED
@@ -0,0 +1,22 @@
1
+ name: Lint
2
+
3
+ on: [pull_request]
4
+
5
+ jobs:
6
+ lint:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v2
10
+
11
+ - name: Set up Python
12
+ uses: actions/setup-python@v4
13
+ with:
14
+ python-version: '3.9'
15
+
16
+ - name: Install pre-commit hook
17
+ run: |
18
+ python -m pip install pre-commit
19
+ pre-commit install
20
+
21
+ - name: Linting
22
+ run: pre-commit run --all-files
sglang/.github/workflows/nightly-test.yml ADDED
@@ -0,0 +1,34 @@
1
+ name: Nightly Test
2
+
3
+ on:
4
+ schedule:
5
+ - cron: '0 0 * * *'
6
+ push:
7
+ branches:
8
+ - main
9
+ paths:
10
+ - "python/sglang/version.py"
11
+ workflow_dispatch:
12
+
13
+ concurrency:
14
+ group: nightly-test-${{ github.ref }}
15
+ cancel-in-progress: true
16
+
17
+ jobs:
18
+ nightly-test:
19
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
20
+ runs-on: 2-gpu-runner
21
+ steps:
22
+ - name: Checkout code
23
+ uses: actions/checkout@v3
24
+
25
+ - name: Install dependencies
26
+ run: |
27
+ bash scripts/ci_install_dependency.sh
28
+ pip install --upgrade "evalplus[vllm] @ git+https://github.com/evalplus/evalplus"
29
+
30
+ - name: Run test
31
+ timeout-minutes: 120
32
+ run: |
33
+ cd test/srt
34
+ python3 run_suite.py --suite nightly --timeout-per-file 2400
sglang/.github/workflows/pr-test.yml ADDED
@@ -0,0 +1,270 @@
1
+ name: PR Test
2
+
3
+ on:
4
+ push:
5
+ branches: [ main ]
6
+ paths:
7
+ - "python/sglang/**"
8
+ - "test/**"
9
+ pull_request:
10
+ branches: [ main ]
11
+ paths:
12
+ - "python/sglang/**"
13
+ - "test/**"
14
+ workflow_dispatch:
15
+ inputs:
16
+ version:
17
+ description: "FlashInfer version"
18
+ required: true
19
+ type: choice
20
+ default: 'release'
21
+ options:
22
+ - 'release'
23
+ - 'nightly'
24
+
25
+ concurrency:
26
+ group: pr-test-${{ github.ref }}
27
+ cancel-in-progress: true
28
+
29
+ jobs:
30
+
31
+ unit-test-frontend:
32
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
33
+ runs-on: 1-gpu-runner
34
+ steps:
35
+ - name: Checkout code
36
+ uses: actions/checkout@v3
37
+
38
+ - name: Install dependencies
39
+ env:
40
+ FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
41
+ run: |
42
+ bash scripts/ci_install_dependency.sh
43
+
44
+ - name: Run test
45
+ timeout-minutes: 10
46
+ run: |
47
+ cd test/lang
48
+ python3 run_suite.py --suite per-commit
49
+
50
+ unit-test-backend-1-gpu:
51
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
52
+ runs-on: 1-gpu-runner
53
+ strategy:
54
+ matrix:
55
+ range: [0-6, 6-16, 16-23, 23-30, 30-100]
56
+ steps:
57
+ - name: Checkout code
58
+ uses: actions/checkout@v3
59
+
60
+ - name: Install dependencies
61
+ env:
62
+ FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
63
+ run: |
64
+ bash scripts/ci_install_dependency.sh
65
+
66
+ - name: Run test
67
+ timeout-minutes: 25
68
+ run: |
69
+ cd test/srt
70
+ RANGE=${{ matrix.range }}
71
+ range_begin=${RANGE%-*}
72
+ range_end=${RANGE#*-}
73
+ python3 run_suite.py --suite per-commit --range-begin ${range_begin} --range-end ${range_end}
74
+
75
+ unit-test-backend-2-gpu:
76
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
77
+ runs-on: 2-gpu-runner
78
+ steps:
79
+ - name: Checkout code
80
+ uses: actions/checkout@v3
81
+
82
+ - name: Install dependencies
83
+ env:
84
+ FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
85
+ run: |
86
+ bash scripts/ci_install_dependency.sh
87
+
88
+ - name: Evaluate data parallelism accuracy (DP=2)
89
+ timeout-minutes: 10
90
+ run: |
91
+ cd test/srt
92
+ python3 test_data_parallelism.py
93
+
94
+ - name: Evaluate MLA accuracy (TP=2)
95
+ timeout-minutes: 10
96
+ run: |
97
+ cd test/srt
98
+ python3 test_mla.py
99
+ python3 test_mla_fp8.py
100
+ python3 test_dp_attention.py
101
+
102
+ - name: Test update weights from distributed
103
+ timeout-minutes: 10
104
+ run: |
105
+ cd test/srt
106
+ python3 test_update_weights_from_distributed.py
107
+
108
+ - name: Evaluate MoE EP accuracy (TP=2)
109
+ timeout-minutes: 10
110
+ run: |
111
+ cd test/srt
112
+ python3 test_moe_ep.py
113
+
114
+ performance-test-1-gpu-part-1:
115
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
116
+ runs-on: 1-gpu-runner
117
+ steps:
118
+ - name: Checkout code
119
+ uses: actions/checkout@v3
120
+
121
+ - name: Install dependencies
122
+ env:
123
+ FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
124
+ run: |
125
+ bash scripts/ci_install_dependency.sh
126
+
127
+ - name: Benchmark single latency
128
+ timeout-minutes: 10
129
+ run: |
130
+ cd test/srt
131
+ python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_default
132
+
133
+ - name: Benchmark online latency
134
+ timeout-minutes: 10
135
+ run: |
136
+ cd test/srt
137
+ python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
138
+
139
+ - name: Benchmark offline throughput
140
+ timeout-minutes: 10
141
+ run: |
142
+ cd test/srt
143
+ python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
144
+
145
+ - name: Benchmark offline throughput (Non-streaming, small batch size)
146
+ timeout-minutes: 10
147
+ run: |
148
+ cd test/srt
149
+ python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
150
+
151
+ performance-test-1-gpu-part-2:
152
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
153
+ runs-on: 1-gpu-runner
154
+ steps:
155
+ - name: Checkout code
156
+ uses: actions/checkout@v3
157
+
158
+ - name: Install dependencies
159
+ env:
160
+ FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
161
+ run: |
162
+ bash scripts/ci_install_dependency.sh
163
+
164
+ - name: Benchmark offline throughput (w/o RadixAttention)
165
+ timeout-minutes: 10
166
+ run: |
167
+ cd test/srt
168
+ python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
169
+
170
+ - name: Benchmark offline throughput (w/ Triton)
171
+ timeout-minutes: 10
172
+ run: |
173
+ cd test/srt
174
+ python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
175
+
176
+ - name: Benchmark offline throughput (w/ FP8)
177
+ timeout-minutes: 10
178
+ run: |
179
+ cd test/srt
180
+ python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
181
+
182
+ performance-test-2-gpu:
183
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
184
+ runs-on: 2-gpu-runner
185
+ steps:
186
+ - name: Checkout code
187
+ uses: actions/checkout@v3
188
+
189
+ - name: Install dependencies
190
+ env:
191
+ FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
192
+ run: |
193
+ bash scripts/ci_install_dependency.sh
194
+
195
+ - name: Benchmark single latency (TP=2)
196
+ timeout-minutes: 10
197
+ run: |
198
+ cd test/srt
199
+ python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_default
200
+
201
+ - name: Benchmark offline throughput (TP=2)
202
+ timeout-minutes: 10
203
+ run: |
204
+ cd test/srt
205
+ python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
206
+
207
+ - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
208
+ timeout-minutes: 10
209
+ run: |
210
+ cd test/srt
211
+ python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
212
+
213
+ accuracy-test-1-gpu:
214
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
215
+ runs-on: 1-gpu-runner
216
+ steps:
217
+ - name: Checkout code
218
+ uses: actions/checkout@v3
219
+
220
+ - name: Install dependencies
221
+ env:
222
+ FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
223
+ run: |
224
+ bash scripts/ci_install_dependency.sh
225
+
226
+ git clone https://github.com/merrymercy/human-eval.git
227
+ cd human-eval
228
+ pip install -e .
229
+
230
+ - name: Evaluate accuracy
231
+ timeout-minutes: 20
232
+ run: |
233
+ cd test/srt
234
+ python3 test_eval_accuracy_large.py
235
+
236
+
237
+ accuracy-test-2-gpu:
238
+ if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
239
+ runs-on: 2-gpu-runner
240
+ steps:
241
+ - name: Checkout code
242
+ uses: actions/checkout@v3
243
+
244
+ - name: Install dependencies
245
+ env:
246
+ FLASHINFER_REPO: ${{ inputs.version == 'nightly' && 'https://flashinfer.ai/whl/nightly/cu124/torch2.4/flashinfer' || 'https://flashinfer.ai/whl/cu124/torch2.4/flashinfer' }}
247
+ run: |
248
+ bash scripts/ci_install_dependency.sh
249
+
250
+ git clone https://github.com/merrymercy/human-eval.git
251
+ cd human-eval
252
+ pip install -e .
253
+
254
+ - name: Evaluate accuracy (TP=2)
255
+ timeout-minutes: 20
256
+ run: |
257
+ cd test/srt
258
+ python3 test_moe_eval_accuracy_large.py
259
+
260
+
261
+ finish:
262
+ needs: [
263
+ unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu,
264
+ performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
265
+ accuracy-test-1-gpu, accuracy-test-2-gpu
266
+ ]
267
+ runs-on: ubuntu-latest
268
+ steps:
269
+ - name: Finish
270
+ run: echo "This is an empty step to ensure that all jobs are completed."
sglang/.github/workflows/release-docker-dev.yml ADDED
@@ -0,0 +1,35 @@
1
+ name: Build Development Docker Image
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ schedule:
6
+ - cron: '0 0 * * *'
7
+
8
+ jobs:
9
+ build-dev:
10
+ runs-on: ubuntu-22.04
11
+ steps:
12
+ - name: Checkout repository
13
+ uses: actions/checkout@v3
14
+
15
+ - name: Free disk space
16
+ uses: jlumbroso/free-disk-space@main
17
+ with:
18
+ tool-cache: false
19
+ docker-images: false
20
+ android: true
21
+ dotnet: true
22
+ haskell: true
23
+ large-packages: true
24
+ swap-storage: false
25
+
26
+ - name: Login to Docker Hub
27
+ uses: docker/login-action@v2
28
+ with:
29
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
30
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
31
+
32
+ - name: Build and Push Dev Image
33
+ run: |
34
+ docker build . -f docker/Dockerfile.dev -t lmsysorg/sglang:dev --no-cache
35
+ docker push lmsysorg/sglang:dev
sglang/.github/workflows/release-docker.yml ADDED
@@ -0,0 +1,64 @@
1
+ name: Release Docker Images
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ paths:
7
+ - "python/sglang/version.py"
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ publish:
12
+ if: github.repository == 'sgl-project/sglang'
13
+ runs-on: ubuntu-latest
14
+ environment: 'prod'
15
+ strategy:
16
+ matrix:
17
+ cuda_version: ['11.8.0', '12.1.1', '12.4.1']
18
+ build_type: ['all', 'srt']
19
+ steps:
20
+ - name: Delete huge unnecessary tools folder
21
+ run: rm -rf /opt/hostedtoolcache
22
+
23
+ - name: Checkout repository
24
+ uses: actions/checkout@v3
25
+
26
+ - name: Login to Docker Hub
27
+ uses: docker/login-action@v2
28
+ with:
29
+ username: ${{ secrets.DOCKERHUB_USERNAME }}
30
+ password: ${{ secrets.DOCKERHUB_TOKEN }}
31
+
32
+ - name: Build and Push
33
+ run: |
34
+ version=$(cat python/sglang/version.py | cut -d'"' -f2)
35
+
36
+ if [ "${{ matrix.cuda_version }}" = "11.8.0" ]; then
37
+ cuda_tag="cu118"
38
+ elif [ "${{ matrix.cuda_version }}" = "12.1.1" ]; then
39
+ cuda_tag="cu121"
40
+ elif [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
41
+ cuda_tag="cu124"
42
+ else
43
+ echo "Unsupported CUDA version"
44
+ exit 1
45
+ fi
46
+
47
+ tag=v${version}-${cuda_tag}
48
+
49
+ if [ "${{ matrix.build_type }}" = "all" ]; then
50
+ tag_suffix=""
51
+ elif [ "${{ matrix.build_type }}" = "srt" ]; then
52
+ tag_suffix="-srt"
53
+ else
54
+ echo "Unsupported build type"
55
+ exit 1
56
+ fi
57
+
58
+ docker build . -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.cuda_version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} -t lmsysorg/sglang:${tag}${tag_suffix} --no-cache
59
+ docker push lmsysorg/sglang:${tag}${tag_suffix}
60
+
61
+ if [ "${{ matrix.cuda_version }}" = "12.4.1" ]; then
62
+ docker tag lmsysorg/sglang:${tag}${tag_suffix} lmsysorg/sglang:latest${tag_suffix}
63
+ docker push lmsysorg/sglang:latest${tag_suffix}
64
+ fi
sglang/.github/workflows/release-pypi-kernel.yml ADDED
@@ -0,0 +1,41 @@
1
+ name: Release SGLang Kernel to PyPI
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - sgl-kernel/pyproject.toml
9
+ workflow_dispatch:
10
+
11
+ concurrency:
12
+ group: release-pypi-kernel-${{ github.ref }}
13
+ cancel-in-progress: true
14
+
15
+ jobs:
16
+ build-wheels:
17
+ runs-on: ubuntu-latest
18
+ strategy:
19
+ matrix:
20
+ python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
21
+ cuda-version: ['12.1']
22
+
23
+ steps:
24
+ - uses: actions/checkout@v4
25
+
26
+ - name: Set up Python ${{ matrix.python-version }}
27
+ uses: actions/setup-python@v5
28
+ with:
29
+ python-version: ${{ matrix.python-version }}
30
+
31
+ - name: Build wheels for Python ${{ matrix.python-version }} and CUDA ${{ matrix.cuda-version }}
32
+ run: |
33
+ cd sgl-kernel
34
+ chmod +x ./build.sh
35
+ ./build.sh "${{ matrix.python-version }}" "${{ matrix.cuda-version }}"
36
+
37
+ - name: Upload to pypi
38
+ working-directory: sgl-kernel
39
+ run: |
40
+ pip install twine
41
+ python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
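If a kernel release fails in CI, the same wheel build can be reproduced locally. This is a sketch under the assumption that you run it from the repository root and that `sgl-kernel/build.sh` takes the Python and CUDA versions as positional arguments, exactly as it is invoked in the workflow above; it stops short of uploading anything.

```bash
#!/bin/bash
# Reproduce the sgl-kernel wheel build from release-pypi-kernel.yml locally.
cd sgl-kernel
chmod +x ./build.sh
./build.sh "3.10" "12.1"   # <python-version> <cuda-version>, as in the workflow matrix

# The workflow uploads whatever lands in dist/; just list it here.
ls -lh dist/
```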
sglang/.github/workflows/release-pypi.yml ADDED
@@ -0,0 +1,31 @@
1
+ name: Release PyPI
2
+ on:
3
+ push:
4
+ branches:
5
+ - main
6
+ paths:
7
+ - "python/sglang/version.py"
8
+ workflow_dispatch:
9
+
10
+ jobs:
11
+ publish:
12
+ if: github.repository == 'sgl-project/sglang'
13
+ runs-on: ubuntu-latest
14
+ environment: 'prod'
15
+ steps:
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v4
18
+ with:
19
+ python-version: '3.9'
20
+
21
+ - name: Checkout repository
22
+ uses: actions/checkout@v3
23
+
24
+ - name: Upload to pypi
25
+ run: |
26
+ cd python
27
+ cp ../README.md ../LICENSE .
28
+ pip install build
29
+ python3 -m build
30
+ pip install twine
31
+ python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
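The packaging half of this workflow can be dry-run locally to validate a release candidate; the sketch below repeats the workflow's build step but replaces the upload with `twine check`, so no PyPI token is required.

```bash
#!/bin/bash
# Dry run of the packaging step in release-pypi.yml (no upload).
cd python
cp ../README.md ../LICENSE .
pip install build twine
python3 -m build
python3 -m twine check dist/*
```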
sglang/3rdparty/amd/profiling/PROFILING.md ADDED
@@ -0,0 +1,425 @@
1
+ ## Profiling SGLang Infer System with AMD GPUs
2
+ This AppNote describes the SGLang profiling techniques, code augmentation, and running steps for systems with AMD Instinct GPUs; the same procedure may also work with Nvidia GPUs.
3
+ Examples and steps are provided in detail to make the results easy to reproduce and to help localize performance problems for optimization.
4
+ Two primary methods are covered:
5
+ - [RPD](https://github.com/ROCm/rocmProfileData.git)
6
+ - [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
7
+
8
+ ### Profiling SGLang Infer System with RPD Profiler
9
+ The RPD profiler is a low-overhead, cross-platform profiler, so the same RPD code augmentation works for profiling on ROCm/AMD GPUs as well as on CUDA/Nvidia GPUs. To run RPD profiling on the SGLang repository, use the scripts and patch files included in this directory and follow the steps below:
10
+ 1. Install RPD using install_rpd.sh, with rpd.patch applied during installation; both files are in this directory.
11
+
12
+ install_rpd.sh
13
+
14
+ ```bash
15
+ # download and install RPD
16
+ apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev
17
+
18
+ # install rpd module
19
+ git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData
20
+ cd rocmProfileData
21
+ git checkout 976899e9c6dbc6dd2bccf770818e4e44125590ac
22
+ git apply rpd.patch
23
+ make && make install
24
+ cd rocpd_python && python setup.py install && cd ..
25
+ cd rpd_tracer && make clean;make install && python setup.py install && cd ..
26
+ ```
27
+
28
+ rpd.patch
29
+
30
+ ```bash
31
+ diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile
32
+ index e9d9feb..b2e9e1a 100644
33
+ --- a/rpd_tracer/Makefile
34
+ +++ b/rpd_tracer/Makefile
35
+ @@ -16,7 +16,7 @@ ifneq (,$(HIP_PATH))
36
+ $(info Building with roctracer)
37
+ RPD_LIBS += -L/opt/rocm/lib -lroctracer64 -lroctx64 -lamdhip64 -lrocm_smi64
38
+ RPD_INCLUDES += -I/opt/rocm/include -I/opt/rocm/include/roctracer -I/opt/rocm/include/hsa
39
+ - RPD_SRCS += RoctracerDataSource.cpp RocmSmiDataSource.cpp
40
+ + RPD_SRCS += RoctracerDataSource.cpp
41
+ RPD_INCLUDES += -D__HIP_PLATFORM_AMD__
42
+ endif
43
+ ```
44
+ 2. Add the loadTracer.sh file included in this directory to /sglang/python/sglang.
45
+
46
+ loadTracer.sh
47
+
48
+ ```bash
49
+ #!/bin/bash
50
+ ################################################################################
51
+ # Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
52
+ #
53
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
54
+ # of this software and associated documentation files (the "Software"), to deal
55
+ # in the Software without restriction, including without limitation the rights
56
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
57
+ # copies of the Software, and to permit persons to whom the Software is
58
+ # furnished to do so, subject to the following conditions:
59
+ #
60
+ # The above copyright notice and this permission notice shall be included in
61
+ # all copies or substantial portions of the Software.
62
+ #
63
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
64
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
65
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
66
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
67
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
68
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
69
+ # THE SOFTWARE.
70
+ ################################################################################
71
+ OUTPUT_FILE="trace.rpd"
72
+
73
+ if [ "$1" = "-o" ] ; then
74
+ OUTPUT_FILE=$2
75
+ shift
76
+ shift
77
+ fi
78
+
79
+ if [ -e ${OUTPUT_FILE} ] ; then
80
+ rm ${OUTPUT_FILE}
81
+ fi
82
+
83
+ python3 -m rocpd.schema --create ${OUTPUT_FILE}
84
+ if [ $? != 0 ] ; then
85
+ echo "Error: Could not create rpd file. Please run 'python setup.py install' from the rocpd_python dir"
86
+ exit
87
+ fi
88
+
89
+ export RPDT_FILENAME=${OUTPUT_FILE}
90
+ export RPDT_AUTOSTART=0
91
+ LD_PRELOAD=librocm-smi_64:librpd_tracer.so "$@"
92
+ ```
93
+ 3. Apply the patch provided in this directory with "git apply rpd_profile_server_enable.patch" if the main purpose of profiling is to collect GPU kernel information along with limited CPU activity information.
94
+
95
+ #### Common Notes 1
96
+ Please note that although we run TP=8 in the example, the patch file purposely logs RPD profiling on only 2 ranks (i.e., tp_rank=0/1) for profiling/visualization convenience, as even Perfetto streaming mode can load a JSON file of at most 8GB for visualization. With 2 ranks logged in RPD profiling, we can still check for issues across ranks (e.g., load imbalance or NCCL issues), and at the same time we can log a relatively longer time window before the JSON file generated from the RPD file hits the 8GB size limit.
97
+
98
+ rpd_profile_server_enable.patch
99
+
100
+ ```bash
101
+ diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
102
+ index 62d1ff9..9021c01 100644
103
+ --- a/python/sglang/srt/managers/scheduler.py
104
+ +++ b/python/sglang/srt/managers/scheduler.py
105
+ @@ -71,6 +71,8 @@ from sglang.srt.utils import (
106
+ suppress_other_loggers,
107
+ )
108
+ from sglang.utils import get_exception_traceback
109
+ +from rpdTracerControl import rpdTracerControl
110
+ +rpdTracerControl.skipCreate()
111
+
112
+ logger = logging.getLogger(__name__)
113
+
114
+ @@ -245,6 +247,7 @@ class Scheduler:
115
+ ],
116
+ with_stack=True,
117
+ )
118
+ + self.rpd = rpdTracerControl()
119
+
120
+ @torch.inference_mode()
121
+ def event_loop(self):
122
+ @@ -1027,15 +1030,24 @@ class Scheduler:
123
+ def start_profile(self) -> None:
124
+ if self.profiler is None:
125
+ raise RuntimeError("Profiler is not enabled.")
126
+ - self.profiler.start()
127
+ + #self.profiler.start() #block pytorch profiler for rpd profiler enabling
128
+ + if self.tp_rank == 0 or self.tp_rank == 1:
129
+ + self.rpd.start()
130
+ + self.rpd.rangePush("", "rpd profile range", "")
131
+ + logger.info("rpd is enabled")
132
+
133
+ def stop_profile(self) -> None:
134
+ if self.profiler is None:
135
+ raise RuntimeError("Profiler is not enabled.")
136
+ - self.profiler.stop()
137
+ - self.profiler.export_chrome_trace(
138
+ - self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
139
+ - )
140
+ + #self.profiler.stop()
141
+ + #self.profiler.export_chrome_trace(
142
+ + # self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
143
+ + #)
144
+ + if self.tp_rank ==0 or self.tp_rank ==1:
145
+ + self.rpd.rangePop()
146
+ + self.rpd.stop()
147
+ + self.rpd.flush()
148
+ + logger.info("rpd is done")
149
+ logger.info("Profiler is done")
150
+ ```
151
+
152
+ #### Advanced Debugging with RPD Profiler
153
+ Sometimes we want to use the RPD profiler to capture more CPU and Python activities in order to debug challenging issues (e.g., the root cause of load imbalance across GPU processes, or the root cause of bubbles). Only in such cases do we need to apply the patch with "git apply rpd_profile_server_enable_wCPU_activities.patch", which modifies 3 files.
154
+
155
+ rpd_profile_server_enable_wCPU_activities.patch
156
+
157
+ ```bash
158
+ diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
159
+ index 62d1ff9..2edb427 100644
160
+ --- a/python/sglang/srt/managers/scheduler.py
161
+ +++ b/python/sglang/srt/managers/scheduler.py
162
+ @@ -71,6 +71,8 @@ from sglang.srt.utils import (
163
+ suppress_other_loggers,
164
+ )
165
+ from sglang.utils import get_exception_traceback
166
+ +from rpdTracerControl import rpdTracerControl
167
+ +rpdTracerControl.skipCreate()
168
+
169
+ logger = logging.getLogger(__name__)
170
+
171
+ @@ -245,6 +247,7 @@ class Scheduler:
172
+ ],
173
+ with_stack=True,
174
+ )
175
+ + self.rpd = rpdTracerControl()
176
+
177
+ @torch.inference_mode()
178
+ def event_loop(self):
179
+ @@ -1027,15 +1030,26 @@ class Scheduler:
180
+ def start_profile(self) -> None:
181
+ if self.profiler is None:
182
+ raise RuntimeError("Profiler is not enabled.")
183
+ - self.profiler.start()
184
+ + #self.profiler.start()
185
+ + logger.info("torch profiler is disabled")
186
+ + if self.tp_rank == 0 or self.tp_rank == 1:
187
+ + self.rpd.setPythonTrace(True)
188
+ + self.rpd.start()
189
+ + self.rpd.rangePush("", "scheduler", "")
190
+ + logger.info("rpd is enabled inside scheduler profiling")
191
+
192
+ def stop_profile(self) -> None:
193
+ if self.profiler is None:
194
+ raise RuntimeError("Profiler is not enabled.")
195
+ - self.profiler.stop()
196
+ - self.profiler.export_chrome_trace(
197
+ - self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
198
+ - )
199
+ + #self.profiler.stop()
200
+ + #self.profiler.export_chrome_trace(
201
+ + # self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
202
+ + #)
203
+ + if self.tp_rank ==0 or self.tp_rank ==1:
204
+ + self.rpd.rangePop()
205
+ + self.rpd.stop()
206
+ + self.rpd.flush()
207
+ + logger.info("rpd is done inside scheduler")
208
+ logger.info("Profiler is done")
209
+
210
+
211
+ diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
212
+ index 2621ccd..181df85 100644
213
+ --- a/python/sglang/srt/managers/tokenizer_manager.py
214
+ +++ b/python/sglang/srt/managers/tokenizer_manager.py
215
+ @@ -58,6 +58,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
216
+ from sglang.srt.server_args import PortArgs, ServerArgs
217
+ from sglang.srt.utils import is_generation_model, is_multimodal_model
218
+
219
+ +from rpdTracerControl import rpdTracerControl
220
+ +rpdTracerControl.skipCreate()
221
+ +
222
+ +
223
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
224
+
225
+ logger = logging.getLogger(__name__)
226
+ @@ -514,10 +518,20 @@ class TokenizerManager:
227
+ self.send_to_scheduler.send_pyobj(req)
228
+
229
+ def start_profile(self):
230
+ + rpd = rpdTracerControl()
231
+ + rpd.setPythonTrace(True)
232
+ + rpd.start()
233
+ + rpd.rangePush("", "tokenizer_manager", "")
234
+ + logger.info("tokenizer_manager rpd profiling started!")
235
+ req = ProfileReq.START_PROFILE
236
+ self.send_to_scheduler.send_pyobj(req)
237
+
238
+ def stop_profile(self):
239
+ + rpd = rpdTracerControl()
240
+ + rpd.rangePop()
241
+ + rpd.stop()
242
+ + rpd.flush()
243
+ + logger.info("rpd profiling is done inside tokenizer_manager!")
244
+ req = ProfileReq.STOP_PROFILE
245
+ self.send_to_scheduler.send_pyobj(req)
246
+
247
+ diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
248
+ index 7111c93..2bd722c 100644
249
+ --- a/python/sglang/srt/server.py
250
+ +++ b/python/sglang/srt/server.py
251
+ @@ -30,6 +30,8 @@ import threading
252
+ import time
253
+ from http import HTTPStatus
254
+ from typing import Dict, List, Optional, Union
255
+ +from rpdTracerControl import rpdTracerControl
256
+ +rpdTracerControl.skipCreate()
257
+
258
+ # Fix a bug of Python threading
259
+ setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
260
+ @@ -152,6 +154,11 @@ async def flush_cache():
261
+ @app.post("/start_profile")
262
+ async def start_profile():
263
+ """Start profiling."""
264
+ + rpd = rpdTracerControl()
265
+ + rpd.setPythonTrace(True)
266
+ + rpd.start()
267
+ + rpd.rangePush("", "server rpd profile range", "")
268
+ + logger.info("rpd profiling started in server.py!")
269
+ tokenizer_manager.start_profile()
270
+ return Response(
271
+ content="Start profiling.\n",
272
+ @@ -164,6 +171,11 @@ async def start_profile():
273
+ async def stop_profile():
274
+ """Stop profiling."""
275
+ tokenizer_manager.stop_profile()
276
+ + rpd = rpdTracerControl()
277
+ + rpd.rangePop()
278
+ + rpd.stop()
279
+ + rpd.flush()
280
+ + logger.info("rpd profiling is done in server.py!")
281
+ return Response(
282
+ content="Stop profiling. This will take some time.\n",
283
+ status_code=200,
284
+ ```
285
+
286
+ 4. As an example for grok1 profiling, create a dummy_grok1 directory containing the config.json shown below, and copy this directory to the path given to "--model-path" if you want to use the example server.sh file provided.
287
+ ```bash
288
+ cat ../dummy_grok1/config.json
289
+ {
290
+ "architectures": [
291
+ "Grok1ModelForCausalLM"
292
+ ],
293
+ "embedding_multiplier_scale": 78.38367176906169,
294
+ "output_multiplier_scale": 0.5773502691896257,
295
+ "vocab_size": 131072,
296
+ "hidden_size": 6144,
297
+ "intermediate_size": 32768,
298
+ "max_position_embeddings": 8192,
299
+ "num_experts_per_tok": 2,
300
+ "num_local_experts": 8,
301
+ "num_attention_heads": 48,
302
+ "num_hidden_layers": 64,
303
+ "num_key_value_heads": 8,
304
+ "head_dim": 128,
305
+ "rms_norm_eps": 1e-05,
306
+ "rope_theta": 10000.0,
307
+ "model_type": "mixtral",
308
+ "torch_dtype": "bfloat16"
309
+ }
310
+ ```
311
+ 5. Launch the server with the RPD-enabled script ./server.sh in one terminal inside the Docker container.
312
+
313
+ #### Common Notes 2
314
+ - Remember to change model-path to the correct path
315
+ - loadTracer.sh is needed to conduct profiling
316
+ - SGLANG_TORCH_PROFILER_DIR is used for default torch profiler
317
+ - Do not use loadTracer.sh if you are using the torch profiler; simply use python3 -m sglang.launch_server.
318
+
319
+
320
+ server.sh
321
+
322
+ ```bash
323
+ #!/bin/bash
324
+
325
+ # export SGLANG_TORCH_PROFILER_DIR=/data/sglang/
326
+ export SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/profile/
327
+
328
+ # Get the current timestamp
329
+ TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
330
+
331
+ # Define the log file with a timestamp
332
+ LOGFILE="sglang_server_log_$TIMESTAMP.json"
333
+
334
+ # Run the Python command and save the output to the log file
335
+ loadTracer.sh python3 -m sglang.launch_server \
336
+ --model-path /sgl-workspace/sglang/dummy_grok1 \
337
+ --tokenizer-path Xenova/grok-1-tokenizer \
338
+ --load-format dummy \
339
+ --quant fp8 \
340
+ --tp 8 \
341
+ --port 30000 \
342
+ --disable-radix-cache 2>&1 | tee "$LOGFILE"
343
+ ```
344
+ 6. Open another terminal attached to the same Docker container, and run the RPD-enabled ./client.sh after the "The server is fired up and is ready to roll!" message appears in the server terminal.
345
+
346
+ #### Common Notes 3
347
+ - Use curl http://localhost:30000/start_profile & curl http://localhost:30000/stop_profile to control the start and end of profiling. Check sglang/python/sglang/srt/managers/scheduler.py for more details.
348
+ - Please don't use RPD profiler together with PyTorch profiler to avoid interference.
349
+ - The rocmProfileData/tools/rpd2tracing.py script is used to generate the JSON file from the RPD file.
350
+
351
+ client.sh
352
+
353
+ ```bash
354
+ #!/bin/bash
355
+
356
+ # Start profiling via API
357
+ curl http://localhost:30000/start_profile -H "Content-Type: application/json"
358
+
359
+ # Benchmark serving using sglang with random dataset and tokenizer
360
+ # Define the log file with a timestamp
361
+ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
362
+ LOGFILE="sglang_client_log_$TIMESTAMP.json"
363
+
364
+ # Run the benchmark with specified parameters and save logs
365
+ python3 -m sglang.bench_serving \
366
+ --backend sglang \
367
+ --tokenizer Xenova/grok-1-tokenizer \
368
+ --dataset-name random \
369
+ --random-input 1024\
370
+ --random-output 1024 \
371
+ --num-prompts 120 \
372
+ --request-rate 8 \
373
+ --output-file online.jsonl 2>&1 | tee "$LOGFILE"
374
+
375
+ # Stop profiling via API
376
+ curl http://localhost:30000/stop_profile -H "Content-Type: application/json"
377
+
378
+ # Convert tracing file to csv & json
379
+ sqlite3 trace.rpd ".mode csv" ".header on" ".output trace.csv" "select * from top;" ".output stdout"
380
+ python3 ./rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
381
+ ```
382
+ 7. Follow [Perfetto docs](https://perfetto.dev/docs/visualization/large-traces) to visualize large json files. Try to adjust parameters so that the trace.json file size is less than 9GB.
383
+
384
+ ### Profiling SGLang Infer System with PyTorch Profiler
385
+
386
+ Please follow the steps below:
387
+
388
+ 1. Apply the patch torch_profiler.patch. Note that you can modify "if self.tp_rank == 0" in the patch to allow more ranks to be recorded in profiling.
389
+
390
+ torch_profiler.patch
391
+ ```bash
392
+ diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
393
+ index 62d1ff9..6ecd78c 100644
394
+ --- a/python/sglang/srt/managers/scheduler.py
395
+ +++ b/python/sglang/srt/managers/scheduler.py
396
+ @@ -240,7 +240,6 @@ class Scheduler:
397
+ )
398
+ self.profiler = torch.profiler.profile(
399
+ activities=[
400
+ - torch.profiler.ProfilerActivity.CPU,
401
+ torch.profiler.ProfilerActivity.CUDA,
402
+ ],
403
+ with_stack=True,
404
+ @@ -1033,9 +1032,11 @@ class Scheduler:
405
+ if self.profiler is None:
406
+ raise RuntimeError("Profiler is not enabled.")
407
+ self.profiler.stop()
408
+ - self.profiler.export_chrome_trace(
409
+ - self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
410
+ - )
411
+ + if self.tp_rank == 0:
412
+ + with open(f"stats_repro_{int(time.time())}.txt", "w") as f:
413
+ + print(self.profiler.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=-1), file=f)
414
+ + print("Profiling stats done.")
415
+ +
416
+ logger.info("Profiler is done")
417
+ ```
418
+
419
+ 2. Create the model path directory and copy it to the right path for "--model-path" if you want to use the server.sh file provided.
420
+
421
+ 3. Modify the included server.sh by removing "loadTracer.sh" before the python command, then launch the script ./server.sh in one terminal inside the Docker container.
422
+
423
+ 4. Similar to step 6 in the RPD profiling section, but remove the last 2 lines of client.sh, which convert the RPD file into CSV and JSON files. Run the modified client.sh for PyTorch profiling.
424
+ -------
425
+ - [Torch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html)
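Before converting a large trace.rpd into JSON, it can be useful to inspect it directly in SQLite to see what dominates the capture; the sketch below assumes only the sqlite3 CLI and the `top` summary view that client.sh above already queries.

```bash
#!/bin/bash
# Peek at an RPD capture without generating the (potentially huge) trace.json.
TRACE=trace.rpd

# Print the first rows of the `top` summary view as CSV.
sqlite3 "$TRACE" ".mode csv" ".header on" "select * from top limit 20;"
```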
sglang/3rdparty/amd/profiling/client.sh ADDED
@@ -0,0 +1,27 @@
1
+ #!/bin/bash
2
+
3
+ # Start profiling via API
4
+ curl http://localhost:30000/start_profile -H "Content-Type: application/json"
5
+
6
+ # Benchmark serving using sglang with random dataset and tokenizer
7
+ # Define the log file with a timestamp
8
+ TIMESTAMP=$(date +%Y%m%d_%H%M%S)
9
+ LOGFILE="sglang_client_log_$TIMESTAMP.json"
10
+
11
+ # Run the benchmark with specified parameters and save logs
12
+ python3 -m sglang.bench_serving \
13
+ --backend sglang \
14
+ --tokenizer Xenova/grok-1-tokenizer \
15
+ --dataset-name random \
16
+ --random-input 1024\
17
+ --random-output 1024 \
18
+ --num-prompts 240 \
19
+ --request-rate 8 \
20
+ --output-file online.jsonl 2>&1 | tee "$LOGFILE"
21
+
22
+ # Stop profiling via API
23
+ curl http://localhost:30000/stop_profile -H "Content-Type: application/json"
24
+
25
+ # Convert tracing file to csv & json
26
+ sqlite3 trace.rpd ".mode csv" ".header on" ".output trace.csv" "select * from top;" ".output stdout"
27
+ python3 /sgl-workspace/rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
sglang/3rdparty/amd/profiling/install_rpd.sh ADDED
@@ -0,0 +1,10 @@
1
+ # download and install RPD
2
+ apt update && apt install -y sqlite3 libsqlite3-dev libfmt-dev
3
+
4
+ # install rpd module
5
+ git clone https://github.com/ROCmSoftwarePlatform/rocmProfileData
6
+ cd rocmProfileData
7
+ git apply rpd.patch
8
+ make && make install
9
+ cd rocpd_python && python setup.py install && cd ..
10
+ cd rpd_tracer && make clean;make install && python setup.py install && cd ..
sglang/3rdparty/amd/profiling/loadTracer.sh ADDED
@@ -0,0 +1,43 @@
1
+ #!/bin/bash
2
+ ################################################################################
3
+ # Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ # THE SOFTWARE.
22
+ ################################################################################
23
+ OUTPUT_FILE="trace.rpd"
24
+
25
+ if [ "$1" = "-o" ] ; then
26
+ OUTPUT_FILE=$2
27
+ shift
28
+ shift
29
+ fi
30
+
31
+ if [ -e ${OUTPUT_FILE} ] ; then
32
+ rm ${OUTPUT_FILE}
33
+ fi
34
+
35
+ python3 -m rocpd.schema --create ${OUTPUT_FILE}
36
+ if [ $? != 0 ] ; then
37
+ echo "Error: Could not create rpd file. Please run 'python setup.py install' from the rocpd_python dir"
38
+ exit
39
+ fi
40
+
41
+ export RPDT_FILENAME=${OUTPUT_FILE}
42
+ export RPDT_AUTOSTART=0
43
+ LD_PRELOAD=librocm-smi_64:librpd_tracer.so "$@"
sglang/3rdparty/amd/profiling/rpd.patch ADDED
@@ -0,0 +1,12 @@
1
+ diff --git a/rpd_tracer/Makefile b/rpd_tracer/Makefile
2
+ index e9d9feb..b2e9e1a 100644
3
+ --- a/rpd_tracer/Makefile
4
+ +++ b/rpd_tracer/Makefile
5
+ @@ -16,7 +16,7 @@ ifneq (,$(HIP_PATH))
6
+ $(info Building with roctracer)
7
+ RPD_LIBS += -L/opt/rocm/lib -lroctracer64 -lroctx64 -lamdhip64 -lrocm_smi64
8
+ RPD_INCLUDES += -I/opt/rocm/include -I/opt/rocm/include/roctracer -I/opt/rocm/include/hsa
9
+ - RPD_SRCS += RoctracerDataSource.cpp RocmSmiDataSource.cpp
10
+ + RPD_SRCS += RoctracerDataSource.cpp
11
+ RPD_INCLUDES += -D__HIP_PLATFORM_AMD__
12
+ endif
sglang/3rdparty/amd/profiling/rpd_profile_server_enable.patch ADDED
@@ -0,0 +1,49 @@
1
+ diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
2
+ index 62d1ff9..9021c01 100644
3
+ --- a/python/sglang/srt/managers/scheduler.py
4
+ +++ b/python/sglang/srt/managers/scheduler.py
5
+ @@ -71,6 +71,8 @@ from sglang.srt.utils import (
6
+ suppress_other_loggers,
7
+ )
8
+ from sglang.utils import get_exception_traceback
9
+ +from rpdTracerControl import rpdTracerControl
10
+ +rpdTracerControl.skipCreate()
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ @@ -245,6 +247,7 @@ class Scheduler:
15
+ ],
16
+ with_stack=True,
17
+ )
18
+ + self.rpd = rpdTracerControl()
19
+
20
+ @torch.inference_mode()
21
+ def event_loop(self):
22
+ @@ -1027,15 +1030,24 @@ class Scheduler:
23
+ def start_profile(self) -> None:
24
+ if self.profiler is None:
25
+ raise RuntimeError("Profiler is not enabled.")
26
+ - self.profiler.start()
27
+ + #self.profiler.start() #block pytorch profiler for rpd profiler enabling
28
+ + if self.tp_rank == 0 or self.tp_rank == 1:
29
+ + self.rpd.start()
30
+ + self.rpd.rangePush("", "rpd profile range", "")
31
+ + logger.info("rpd is enabled")
32
+
33
+ def stop_profile(self) -> None:
34
+ if self.profiler is None:
35
+ raise RuntimeError("Profiler is not enabled.")
36
+ - self.profiler.stop()
37
+ - self.profiler.export_chrome_trace(
38
+ - self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
39
+ - )
40
+ + #self.profiler.stop()
41
+ + #self.profiler.export_chrome_trace(
42
+ + # self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
43
+ + #)
44
+ + if self.tp_rank ==0 or self.tp_rank ==1:
45
+ + self.rpd.rangePop()
46
+ + self.rpd.stop()
47
+ + self.rpd.flush()
48
+ + logger.info("rpd is done")
49
+ logger.info("Profiler is done")
sglang/3rdparty/amd/profiling/rpd_profile_server_enable_wCPU_activities.patch ADDED
@@ -0,0 +1,126 @@
1
+ diff --git a/python/sglang/srt/managers/scheduler.py b/python/sglang/srt/managers/scheduler.py
2
+ index 62d1ff9..2edb427 100644
3
+ --- a/python/sglang/srt/managers/scheduler.py
4
+ +++ b/python/sglang/srt/managers/scheduler.py
5
+ @@ -71,6 +71,8 @@ from sglang.srt.utils import (
6
+ suppress_other_loggers,
7
+ )
8
+ from sglang.utils import get_exception_traceback
9
+ +from rpdTracerControl import rpdTracerControl
10
+ +rpdTracerControl.skipCreate()
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ @@ -245,6 +247,7 @@ class Scheduler:
15
+ ],
16
+ with_stack=True,
17
+ )
18
+ + self.rpd = rpdTracerControl()
19
+
20
+ @torch.inference_mode()
21
+ def event_loop(self):
22
+ @@ -1027,15 +1030,26 @@ class Scheduler:
23
+ def start_profile(self) -> None:
24
+ if self.profiler is None:
25
+ raise RuntimeError("Profiler is not enabled.")
26
+ - self.profiler.start()
27
+ + #self.profiler.start()
28
+ + logger.info("torch profiler is disabled")
29
+ + if self.tp_rank == 0 or self.tp_rank == 1:
30
+ + self.rpd.setPythonTrace(True)
31
+ + self.rpd.start()
32
+ + self.rpd.rangePush("", "scheduler", "")
33
+ + logger.info("rpd is enabled inside scheduler profiling")
34
+
35
+ def stop_profile(self) -> None:
36
+ if self.profiler is None:
37
+ raise RuntimeError("Profiler is not enabled.")
38
+ - self.profiler.stop()
39
+ - self.profiler.export_chrome_trace(
40
+ - self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
41
+ - )
42
+ + #self.profiler.stop()
43
+ + #self.profiler.export_chrome_trace(
44
+ + # self.torch_profiler_trace_dir + "/" + str(time.time()) + ".trace.json.gz"
45
+ + #)
46
+ + if self.tp_rank ==0 or self.tp_rank ==1:
47
+ + self.rpd.rangePop()
48
+ + self.rpd.stop()
49
+ + self.rpd.flush()
50
+ + logger.info("rpd is done inside scheduler")
51
+ logger.info("Profiler is done")
52
+
53
+
54
+ diff --git a/python/sglang/srt/managers/tokenizer_manager.py b/python/sglang/srt/managers/tokenizer_manager.py
55
+ index 2621ccd..181df85 100644
56
+ --- a/python/sglang/srt/managers/tokenizer_manager.py
57
+ +++ b/python/sglang/srt/managers/tokenizer_manager.py
58
+ @@ -58,6 +58,10 @@ from sglang.srt.sampling.sampling_params import SamplingParams
59
+ from sglang.srt.server_args import PortArgs, ServerArgs
60
+ from sglang.srt.utils import is_generation_model, is_multimodal_model
61
+
62
+ +from rpdTracerControl import rpdTracerControl
63
+ +rpdTracerControl.skipCreate()
64
+ +
65
+ +
66
+ asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
67
+
68
+ logger = logging.getLogger(__name__)
69
+ @@ -514,10 +518,20 @@ class TokenizerManager:
70
+ self.send_to_scheduler.send_pyobj(req)
71
+
72
+ def start_profile(self):
73
+ + rpd = rpdTracerControl()
74
+ + rpd.setPythonTrace(True)
75
+ + rpd.start()
76
+ + rpd.rangePush("", "tokenizer_manager", "")
77
+ + logger.info("tokenizer_manager rpd profiling started!")
78
+ req = ProfileReq.START_PROFILE
79
+ self.send_to_scheduler.send_pyobj(req)
80
+
81
+ def stop_profile(self):
82
+ + rpd = rpdTracerControl()
83
+ + rpd.rangePop()
84
+ + rpd.stop()
85
+ + rpd.flush()
86
+ + logger.info("rpd profiling is done inside tokenizer_manager!")
87
+ req = ProfileReq.STOP_PROFILE
88
+ self.send_to_scheduler.send_pyobj(req)
89
+
90
+ diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
91
+ index 7111c93..2bd722c 100644
92
+ --- a/python/sglang/srt/server.py
93
+ +++ b/python/sglang/srt/server.py
94
+ @@ -30,6 +30,8 @@ import threading
95
+ import time
96
+ from http import HTTPStatus
97
+ from typing import Dict, List, Optional, Union
98
+ +from rpdTracerControl import rpdTracerControl
99
+ +rpdTracerControl.skipCreate()
100
+
101
+ # Fix a bug of Python threading
102
+ setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
103
+ @@ -152,6 +154,11 @@ async def flush_cache():
104
+ @app.post("/start_profile")
105
+ async def start_profile():
106
+ """Start profiling."""
107
+ + rpd = rpdTracerControl()
108
+ + rpd.setPythonTrace(True)
109
+ + rpd.start()
110
+ + rpd.rangePush("", "server rpd profile range", "")
111
+ + logger.info("rpd profiling started in server.py!")
112
+ tokenizer_manager.start_profile()
113
+ return Response(
114
+ content="Start profiling.\n",
115
+ @@ -164,6 +171,11 @@ async def start_profile():
116
+ async def stop_profile():
117
+ """Stop profiling."""
118
+ tokenizer_manager.stop_profile()
119
+ + rpd = rpdTracerControl()
120
+ + rpd.rangePop()
121
+ + rpd.stop()
122
+ + rpd.flush()
123
+ + logger.info("rpd profiling is done in server.py!")
124
+ return Response(
125
+ content="Stop profiling. This will take some time.\n",
126
+ status_code=200,
sglang/3rdparty/amd/profiling/server.sh ADDED
@@ -0,0 +1,20 @@
1
+ #!/bin/bash
2
+
3
+ # export SGLANG_TORCH_PROFILER_DIR=/data/sglang/
4
+ export SGLANG_TORCH_PROFILER_DIR=/sgl-workspace/sglang/profile/
5
+
6
+ # Get the current timestamp
7
+ TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
8
+
9
+ # Define the log file with a timestamp
10
+ LOGFILE="sglang_server_log_$TIMESTAMP.json"
11
+
12
+ # Run the Python command and save the output to the log file
13
+ loadTracer.sh python3 -m sglang.launch_server \
14
+ --model-path /sgl-workspace/sglang/dummy_grok1 \
15
+ --tokenizer-path Xenova/grok-1-tokenizer \
16
+ --load-format dummy \
17
+ --quant fp8 \
18
+ --tp 8 \
19
+ --port 30000 \
20
+ --disable-radix-cache 2>&1 | tee "$LOGFILE"
sglang/3rdparty/amd/tuning/TUNING.md ADDED
@@ -0,0 +1,118 @@
1
+ ## Tuning SGLang Infer System with AMD GPUs
2
+ This AppNote describes the SGLang performance tuning technical, code harness and running steps for systems with AMD Instinct GPUs.
3
+ Harness code, examples and steps are provided in detail, to facilitate easy reproduce & use to tune performance towards workloads.
4
+ Three primary runtime areas are covered:
5
+
6
+ ## 1. Triton Kernels
7
+ To maximize Triton kernel efficiency, several strategies can be employed:
8
+
9
+ ### Key Environment Variables:
10
+ - **num_stages**: Adjusts the number of pipeline stages to optimize kernel efficiency based on the specific type of operations (e.g., General Matrix Multiplication - GEMM).
11
+ - **waves_per_eu**: Controls the usage of Vector General Purpose Registers (VGPR) to enhance occupancy, thereby improving latency or throughput.
12
+ - **BLOCK_M, BLOCK_N, BLOCK_K**: Tunable tile sizes that assist in balancing memory transfer and computational efficiency.
13
+ - **matrix_instr_nonkdim**: Optimizes the usage of Matrix-Fused Multiply-Add (MFMA) instructions for specific kernel types, such as Flash Attention.
14
+ - **OPTIMIZE_EPILOGUE**: An environment variable that can be set to `1` to enhance performance by eliminating the `convert_layout` operation in the kernel's epilogue.
15
+ ```python
16
+ @triton.autotune(configs=[
17
+ triton.Config({'waves_per_eu': 1}, num_warps=4, num_stages=1),
18
+ triton.Config({'waves_per_eu': 1}, num_warps=8, num_stages=1),
19
+ triton.Config({'waves_per_eu': 1}, num_warps=16, num_stages=1),
20
+ triton.Config({'waves_per_eu': 2}, num_warps=4, num_stages=1),
21
+ triton.Config({'waves_per_eu': 2}, num_warps=8, num_stages=1),
22
+ triton.Config({'waves_per_eu': 2}, num_warps=16, num_stages=1),
23
+ triton.Config({'waves_per_eu': 4}, num_warps=4, num_stages=1),
24
+ triton.Config({'waves_per_eu': 4}, num_warps=8, num_stages=1),
25
+ triton.Config({'waves_per_eu': 4}, num_warps=16, num_stages=1),
26
+ ], key=['BLOCK_N', 'NUM_TOKEN_BLKS'], use_cuda_graph=True)
27
+ @triton.jit
28
+ def _triton_kernel_funtion():
29
+ ...
30
+ ```
31
+ ## 2. Torch Tunable Operations
32
+ **TunableOp** is a feature in PyTorch that allows for the definition and optimization of custom kernels with tunable parameters. This feature is particularly useful for enhancing the performance of kernels by experimenting with different configurations.
33
+
34
+ ### Key Environment Variables:
35
+ 1. **PYTORCH_TUNABLEOP_ENABLED**:
36
+ - Default: `0`
37
+ - Set to `1` to enable TunableOp.
38
+
39
+ 2. **PYTORCH_TUNABLEOP_TUNING**:
40
+ - Default: `1`
41
+ - Set to `0` to disable tuning. If a tuned entry is not found, it will run the tuning step and record the entry when PYTORCH_TUNABLEOP_ENABLED is enabled.
42
+
43
+ 3. **PYTORCH_TUNABLEOP_VERBOSE**:
44
+ - Default: `0`
45
+ - Set to `1` to enable verbose output for TunableOp.
46
+
47
+ ### Usage Example:
48
+ To enable TunableOp and tuning, and optionally enable verbose mode, you can run the following command in your terminal:
49
+
50
+ ```bash
51
+ #Tuning
52
+ PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=1 your_script.sh
53
+
54
+ #Inference with tuning op
55
+ PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 your_script.sh
56
+
57
+ #Print out the log
58
+ PYTORCH_TUNABLEOP_ENABLED=1 PYTORCH_TUNABLEOP_TUNING=0 PYTORCH_TUNABLEOP_VERBOSE=1 your_script.sh
59
+
60
+ ```
61
+ ## 3. Torch Compilation
62
+
63
+
64
+ The following are suggestions for optimizing matrix multiplication (GEMM) and convolution (conv) operations in PyTorch using Inductor, a part of the PyTorch compilation framework. The goal is to leverage Triton to achieve better performance.
65
+
66
+ To tune Triton kernels with GEMM and convolution ops (conv), use the `torch.compile` function with the max-autotune mode. This benchmarks a predefined list of Triton configurations and selects the fastest one for each shape.
67
+
68
+ ### Key Configurations:
69
+ 1. **Max Autotune**:
70
+ - Set `torch._inductor.config.max_autotune = True` or `TORCHINDUCTOR_MAX_AUTOTUNE=1`.
71
+
72
+ 2. **Fine-Grained Control**:
73
+ - Enable GEMM tuning: `torch._inductor.config.max_autotune_gemm = True`.
74
+ - Enable tuning for pointwise/reduction ops: `torch._inductor.config.max_autotune.pointwise = True`.
75
+
76
+ 3. **Backend Selection**:
77
+ - Use `torch._inductor.max_autotune_gemm_backends` to limit backends to TRITON for better performance.
78
+
79
+ 4. **Freezing for Inference**:
80
+ - Use `torch._inductor.config.freezing=True` to enable constant folding optimizations.
81
+
82
+ 5. **Debugging**:
83
+ - Set `TORCH_COMPILE_DEBUG=1` to extract Triton kernels generated by Inductor.
84
+
85
+ ### Example Code Block:
86
+ ```bash
87
+ #Gemm Tuning
88
+ TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 your_script.sh
89
+
90
+ #Specify your backend to TRITON for Gemm Tuning
91
+ TORCHINDUCTOR_MAX_AUTOTUNE=1 TORCHINDUCTOR_COORDINATE_DESCENT_TUNING=1 TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=TRITON your_script.sh
92
+
93
+ #Inference with large improvement on AMD GPU
94
+ TORCHINDUCTOR_FREEZING=1 your_script.sh
95
+ ```
96
+ ## 4. Fused MOE kernel
97
+ To maximize moe kernel efficiency, need to use below scripts to find out the best launch configuration
98
+
99
+ ### Key parameters:
100
+ - **--model**: what moe model type to do tuning, it will automatically decide the size of d_model, model_intermediate_size, num_layers
101
+ - **--tp-size**: simulate the whole model run configuration to set the dimension size using tp correctly
102
+ - **--batch**: M dimension size of moe kernel, for prefill moe kernel the value is batch*input_len, for decode moe kernel the value is batch
103
+ - **--dtype**: computation type
104
+
105
+ ```bash
106
+ #Tuning
107
+ #for example, we have one case like this "python3 -m sglang.bench_latency --model dummy_grok1/ --load-format dummy --tokenizer-path Xenova/grok-1-tokenizer --tp 8 --batch-size 32 --input 1024 --output 8 --attention-backend triton --sampling-backend pytorch --quant fp" to run, it defined batch-size 32 input lenth 1024 and output length 8, from "--batch" in moe view point, the prefill batch is 32*1024 = 32768, the decode batch is 32*1(only one output token generated in each run).
108
+ #so we can tune decode moe use below command
109
+ python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32"
110
+ # and use this command to tune prefill moe
111
+ python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batch "32768"
112
+ ```
113
+
114
+ ## Reference
115
+
116
+ For more detailed information on tuning SGLang performance with AMD GPUs, please refer to the following link:
117
+
118
+ [ROCm Documentation: Triton Kernel Performance Optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#triton-kernel-performance-optimization)
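In practice the switches from sections 1-3 are combined on a single launch command; the lines below are one hedged example of such a combination for an inference run, where `your_script.sh` is a placeholder for the actual launch command and every variable is one of those introduced above.

```bash
#!/bin/bash
# Example combination of the tuning switches described in sections 1-3.
export OPTIMIZE_EPILOGUE=1                               # Triton epilogue optimization (section 1)
export PYTORCH_TUNABLEOP_ENABLED=1                       # use TunableOp entries (section 2)
export PYTORCH_TUNABLEOP_TUNING=0                        # ...without re-tuning at runtime
export TORCHINDUCTOR_MAX_AUTOTUNE=1                      # Inductor max-autotune (section 3)
export TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS=TRITON   # restrict GEMM backends to Triton
export TORCHINDUCTOR_FREEZING=1                          # constant folding for inference

./your_script.sh                                         # placeholder for the actual launch command
```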
sglang/3rdparty/amd/tuning/benchmark_moe_rocm.py ADDED
@@ -0,0 +1,377 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import sys
5
+
6
+ import torch
7
+ import torch.nn.functional as F
8
+ import triton
9
+ import triton.language as tl
10
+ from tqdm import tqdm
11
+ from transformers import AutoConfig
12
+
13
+ from sglang.srt.layers.fused_moe_triton.fused_moe import fused_moe, get_config_file_name
14
+
15
+ padding_size = 128 if bool(int(os.getenv("MOE_PADDING", "0"))) else 0
16
+
17
+
18
+ def main(model, tp_size, dtype: str, batches):
19
+ method = fused_moe
20
+
21
+ for bs in batches:
22
+ run_grid(int(bs), model=model, method=method, tp_size=tp_size, dtype=dtype)
23
+
24
+
25
+ def prune_configs(M, N, K, configs):
26
+ pruned_configs = []
27
+ elemBytes_a = 1 # [DV Note] Hard-coded for float16 (2 bytes)
28
+ elemBytes_b = 1 # [DV Note] Hard-coded for float16 (2 bytes)
29
+
30
+ mfma = 16 if M < 32 or N < 32 else 32
31
+
32
+ # TODO (zhanglx): figure out the boundary between large and small gemms
33
+ large_gemm = False
34
+ if M >= 2048 and N >= 2048:
35
+ large_gemm = True
36
+
37
+ for config in configs:
38
+ BLOCK_SIZE_M = config.get("BLOCK_SIZE_M")
39
+ BLOCK_SIZE_N = config.get("BLOCK_SIZE_N")
40
+ BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
41
+ num_warps = config.get("num_warps")
42
+ matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
43
+ # kpack = config.get("kpack")
44
+ if matrix_instr_nonkdim > mfma:
45
+ continue
46
+ if mfma == 4 and BLOCK_SIZE_K < 64:
47
+ continue
48
+ # some layouts could not work properly in case
49
+ # number elements per thread is less 1
50
+ if BLOCK_SIZE_M * BLOCK_SIZE_N < 64:
51
+ continue
52
+ SPLIT_K = 1 # config.get("SPLIT_K")
53
+ GROUP_M = config.get("GROUP_SIZE_M")
54
+ if matrix_instr_nonkdim > BLOCK_SIZE_M or matrix_instr_nonkdim > BLOCK_SIZE_N:
55
+ continue
56
+ if matrix_instr_nonkdim >= M and matrix_instr_nonkdim != BLOCK_SIZE_M:
57
+ continue
58
+ if matrix_instr_nonkdim >= N and matrix_instr_nonkdim != BLOCK_SIZE_N:
59
+ continue
60
+ # Skip BLOCK_SIZE that is too large compare to M/N
61
+ # unless BLOCK_SIZE is already small enough
62
+ if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
63
+ continue
64
+ if N * 2 < BLOCK_SIZE_N and BLOCK_SIZE_N != 16:
65
+ continue
66
+ # skip large split_k when not necessary
67
+ if SPLIT_K != 1 and not need_split_k(M, N, K):
68
+ continue
69
+ # skip split_k that leads to EVEN_K = false
70
+ leap = SPLIT_K * BLOCK_SIZE_K
71
+ modv = K % leap
72
+ if modv != 0:
73
+ continue
74
+ # skip large GROUP_M
75
+ if GROUP_M * BLOCK_SIZE_M > M and GROUP_M != 1:
76
+ continue
77
+ # out of shared memory resource
78
+ # TODO (zhanglx): This does not consider the LDS usage in the epilogue
79
+ LDS = (
80
+ BLOCK_SIZE_K * BLOCK_SIZE_M * elemBytes_a
81
+ + BLOCK_SIZE_K * BLOCK_SIZE_N * elemBytes_b
82
+ )
83
+ if LDS > 65536:
84
+ continue
85
+ # Skip small block sizes and num_warps for large gemm
86
+ # For fp16 and f8, we want to only use BLOCK_SIZE >= 64
87
+ if large_gemm:
88
+ if BLOCK_SIZE_M < 64 or BLOCK_SIZE_N < 64:
89
+ continue
90
+ if BLOCK_SIZE_K < 64:
91
+ continue
92
+ if num_warps < 4:
93
+ continue
94
+
95
+ pruned_configs.append(config)
96
+
97
+ return pruned_configs
98
+
99
+
100
+ def union_of_list_of_dicts(l1, l2):
101
+ result = []
102
+ temp_list = l1.copy()
103
+ temp_list.extend(l2)
104
+ for myDict in temp_list:
105
+ if myDict not in result:
106
+ result.append(myDict)
107
+
108
+ return result
109
+
110
+
111
+ def run_grid(bs, model, method, tp_size, dtype: str):
112
+
113
+ config = AutoConfig.from_pretrained(model)
114
+
115
+ top_k = config.num_experts_per_tok
116
+ d_model = config.hidden_size
117
+ model_intermediate_size = config.intermediate_size
118
+ num_layers = config.num_hidden_layers
119
+ hidden_states_dtype = config.torch_dtype
120
+
121
+ if config.num_experts_per_tok:
122
+ if config.architectures[0] == "Grok1ModelForCausalLM":
123
+ num_total_experts = config.num_experts
124
+ else:
125
+ num_total_experts = config.num_local_experts
126
+ else:
127
+ raise ValueError(f"Unsupported Mixtral model {model}")
128
+
129
+ # tp_size = 2
130
+ num_warmup_calls = 10
131
+ num_calls = 30
132
+
133
+ num_warmup_trials = 1
134
+ num_trials = 1
135
+
136
+ full_configs = []
137
+
138
+ block_m_range = [16, 32, 64, 128, 256]
139
+ block_n_range = [16, 32, 64, 128, 256]
140
+ block_k_range = [32, 64, 128, 256] # MUST >= 32
141
+ num_warps_range = [1, 2, 4, 8]
142
+ group_m_range = [1, 4, 8, 16, 32]
143
+ # For now we see better perf with num_stages=0 for all gemm configs we care
144
+ # But keep this explicit so that we do not forget we may need to set it to
145
+ # other values in the future
146
+ num_stage_range = [2]
147
+ waves_per_eu_range = [0, 1, 2, 4, 8]
148
+ # Remove 32 because of triton compiling error
149
+ matrix_instr_nonkdim_range = [16]
150
+ kpack_range = [1, 2]
151
+
152
+ for block_size_m in block_m_range:
153
+ for block_size_n in block_n_range:
154
+ for block_size_k in block_k_range:
155
+ for group_size_m in group_m_range:
156
+ for num_warps in num_warps_range:
157
+ for num_stages in num_stage_range:
158
+ for waves_per_eu in waves_per_eu_range:
159
+ for matrix_instr_nonkdim in matrix_instr_nonkdim_range:
160
+ for kpack in kpack_range:
161
+ full_configs.append(
162
+ {
163
+ "BLOCK_SIZE_M": block_size_m,
164
+ "BLOCK_SIZE_N": block_size_n,
165
+ "BLOCK_SIZE_K": block_size_k,
166
+ "GROUP_SIZE_M": group_size_m,
167
+ "num_warps": num_warps,
168
+ "num_stages": num_stages,
169
+ "waves_per_eu": waves_per_eu,
170
+ "matrix_instr_nonkdim": matrix_instr_nonkdim,
171
+ "kpack": kpack,
172
+ }
173
+ )
174
+
175
+ M1 = bs * 2
176
+ N1 = model_intermediate_size * 2 // tp_size
177
+ K1 = d_model
178
+ prune_configs_1 = prune_configs(M1, N1, K1, full_configs)
179
+
180
+ M2 = bs * 2
181
+ N2 = d_model
182
+ K2 = model_intermediate_size // tp_size
183
+ prune_configs_2 = prune_configs(M2, N2, K2, full_configs)
184
+
185
+ configs = union_of_list_of_dicts(prune_configs_1, prune_configs_2)
186
+
187
+ print(
188
+ f"{bs=} || {len(full_configs)=} | {len(prune_configs_1)=} | \
189
+ {len(prune_configs_2)=} | {len(configs)=}"
190
+ )
191
+
192
+ best_config = None
193
+ best_time_us = 1e20
194
+
195
+ print(f"{tp_size=} {bs=}")
196
+
197
+ for config in tqdm(configs):
198
+ # warmup
199
+ try:
200
+ print(config)
201
+ for _ in range(num_warmup_trials):
202
+ run_timing(
203
+ num_calls=num_warmup_calls,
204
+ bs=bs,
205
+ d_model=d_model,
206
+ num_total_experts=num_total_experts,
207
+ top_k=top_k,
208
+ tp_size=tp_size,
209
+ model_intermediate_size=model_intermediate_size,
210
+ method=method,
211
+ config=config,
212
+ dtype=dtype,
213
+ hidden_states_dtype=hidden_states_dtype,
214
+ )
215
+ except triton.runtime.autotuner.OutOfResources:
216
+ continue
217
+
218
+ # trial
219
+ for _ in range(num_trials):
220
+ kernel_dur_ms = run_timing(
221
+ num_calls=num_calls,
222
+ bs=bs,
223
+ d_model=d_model,
224
+ num_total_experts=num_total_experts,
225
+ top_k=top_k,
226
+ tp_size=tp_size,
227
+ model_intermediate_size=model_intermediate_size,
228
+ method=method,
229
+ config=config,
230
+ dtype=dtype,
231
+ hidden_states_dtype=hidden_states_dtype,
232
+ )
233
+
234
+ kernel_dur_us = 1000 * kernel_dur_ms
235
+ model_dur_ms = kernel_dur_ms * num_layers
236
+
237
+ if kernel_dur_us < best_time_us:
238
+ best_config = config
239
+ best_time_us = kernel_dur_us
240
+
241
+ tqdm.write(
242
+ f"{kernel_dur_us=:.1f} {model_dur_ms=:.1f}"
243
+ f" {bs=} {tp_size=} {top_k=} {num_total_experts=} "
244
+ f"{d_model=} {model_intermediate_size=} {num_layers=}"
245
+ )
246
+
247
+ print("best_time_us", best_time_us)
248
+ print("best_config", best_config)
249
+
250
+ # holds Dict[str, Dict[str, int]]
251
+ filename = get_config_file_name(
252
+ num_total_experts,
253
+ model_intermediate_size // tp_size,
254
+ "float8" if dtype == "float8" else None,
255
+ )
256
+ print(f"writing config to file {filename}")
257
+ existing_content = {}
258
+ if os.path.exists(filename):
259
+ with open(filename, "r") as f:
260
+ existing_content = json.load(f)
261
+ existing_content[str(bs)] = best_config
262
+ with open(filename, "w") as f:
263
+ json.dump(existing_content, f, indent=4)
264
+ f.write("\n")
265
+
266
+
267
+ def run_timing(
268
+ num_calls: int,
269
+ bs: int,
270
+ d_model: int,
271
+ num_total_experts: int,
272
+ top_k: int,
273
+ tp_size: int,
274
+ model_intermediate_size: int,
275
+ method,
276
+ config,
277
+ dtype: str,
278
+ hidden_states_dtype,
279
+ ) -> float:
280
+ shard_intermediate_size = model_intermediate_size // tp_size
281
+
282
+ hidden_states = torch.rand(
283
+ (bs, d_model),
284
+ device="cuda:0",
285
+ dtype=hidden_states_dtype,
286
+ )
287
+
288
+ w1 = torch.rand(
289
+ (num_total_experts, 2 * shard_intermediate_size, d_model + padding_size),
290
+ device=hidden_states.device,
291
+ dtype=hidden_states.dtype,
292
+ )
293
+
294
+ w2 = torch.rand(
295
+ (num_total_experts, d_model, shard_intermediate_size + padding_size),
296
+ device=hidden_states.device,
297
+ dtype=hidden_states.dtype,
298
+ )
299
+
300
+ w1_scale = None
301
+ w2_scale = None
302
+ a1_scale = None
303
+ a2_scale = None
304
+
305
+ if dtype == "float8":
306
+ w1 = w1.to(torch.float8_e4m3fnuz)
307
+ w2 = w2.to(torch.float8_e4m3fnuz)
308
+ w1_scale = torch.ones(
309
+ num_total_experts, device=hidden_states.device, dtype=torch.float32
310
+ )
311
+ w2_scale = torch.ones(
312
+ num_total_experts, device=hidden_states.device, dtype=torch.float32
313
+ )
314
+ a1_scale = torch.ones(1, device=hidden_states.device, dtype=torch.float32)
315
+ a2_scale = torch.ones(1, device=hidden_states.device, dtype=torch.float32)
316
+
317
+ gating_output = F.softmax(
318
+ torch.rand(
319
+ (num_calls, bs, num_total_experts),
320
+ device=hidden_states.device,
321
+ dtype=torch.float32,
322
+ ),
323
+ dim=-1,
324
+ )
325
+
326
+ ##################################
327
+
328
+ start_event = torch.cuda.Event(enable_timing=True)
329
+ end_event = torch.cuda.Event(enable_timing=True)
330
+
331
+ start_event.record()
332
+ for i in range(num_calls):
333
+ hidden_states = method(
334
+ hidden_states=hidden_states,
335
+ w1=w1,
336
+ w2=w2,
337
+ w1_scale=w1_scale,
338
+ w2_scale=w2_scale,
339
+ a1_scale=a1_scale,
340
+ a2_scale=a2_scale,
341
+ gating_output=gating_output[0],
342
+ topk=top_k,
343
+ renormalize=True,
344
+ inplace=True,
345
+ override_config=config,
346
+ use_fp8=dtype == "float8",
347
+ )
348
+
349
+ end_event.record()
350
+ end_event.synchronize()
351
+
352
+ dur_ms = start_event.elapsed_time(end_event) / num_calls
353
+ return dur_ms
354
+
355
+
356
+ if __name__ == "__main__":
357
+ parser = argparse.ArgumentParser(
358
+ prog="benchmark_mixtral_moe",
359
+ description="Benchmark and tune the fused_moe kernel",
360
+ )
361
+ parser.add_argument(
362
+ "--dtype",
363
+ type=str,
364
+ default="auto",
365
+ choices=["float8", "float16", "bfloat16"],
366
+ help="Data type used for fused_moe kernel computations",
367
+ )
368
+ parser.add_argument("--model", type=str, default="hpcai-tech/grok-1")
369
+
370
+ parser.add_argument("--tp-size", type=int, default=2, help="Tensor paralleli size")
371
+ parser.add_argument("-b", "--batches", type=str)
372
+
373
+ args = parser.parse_args()
374
+
375
+ batches = args.batches.split(",")
376
+
377
+ sys.exit(main(args.model, args.tp_size, args.dtype, batches))
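Since `--batches` is parsed as a comma-separated list (`args.batches.split(",")`), the decode and prefill shapes from the grok1 example in TUNING.md can be tuned in a single invocation; the command below is a sketch that reuses that example's settings.

```bash
# Tune both the decode (32) and prefill (32*1024 = 32768) MoE batch sizes in one run.
python benchmark_moe_rocm.py --model grok1 --tp-size 8 --dtype float8 --batches "32,32768"
```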
sglang/benchmark/blog_v0_2/405b_sglang.sh ADDED
@@ -0,0 +1,24 @@
1
+ # Create dummy weights:
2
+ # 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
3
+ # 2. Get `config.json` from ./config.md
4
+ # 3. Download the tokenizer
5
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
6
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
7
+
8
+ # Launch sglang
9
+ # python -m sglang.launch_server --model-path ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --tp 8 --quant fp8 --disable-radix --mem-frac 0.87
10
+
11
+ # offline
12
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > sglang_log11
13
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > sglang_log12
14
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > sglang_log13
15
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > sglang_log14
16
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > sglang_log15
17
+ python3 -m sglang.bench_serving --backend sglang --dataset-name sharegpt --num-prompt 2000 > sglang_log21
18
+
19
+ # online
20
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > sglang_log31
21
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > sglang_log32
22
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > sglang_log33
23
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > sglang_log34
24
+ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > sglang_log35
sglang/benchmark/blog_v0_2/405b_trt.sh ADDED
@@ -0,0 +1,17 @@
1
+ # Launch trtllm
2
+ # https://github.com/sgl-project/tensorrt-demo
3
+
4
+ # offline
5
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log11
6
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log12
7
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log13
8
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log14
9
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log15
10
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name sharegpt --num-prompt 2000 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log21
11
+
12
+ # online
13
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log31
14
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log32
15
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log33
16
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log34
17
+ python3 ../../python/sglang/bench_serving.py --backend trt --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 --model /root/Meta-Llama-3-8B-Instruct > trtllm_log35
sglang/benchmark/blog_v0_2/405b_vllm.sh ADDED
@@ -0,0 +1,24 @@
1
+ # Create dummy weights:
2
+ # 1. Create a folder `~/llama-3.1-405b-fp8-dummy` and create `config.json` and tokenizer under this folder.
3
+ # 2. Get `config.json` from ./config.md
4
+ # 3. Download the tokenizer
5
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
6
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json
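+ # For example, the setup above can be scripted roughly as follows (paths are illustrative, adjust to your environment):
+ # mkdir -p ~/llama-3.1-405b-fp8-dummy && cd ~/llama-3.1-405b-fp8-dummy
+ # cp /path/to/config.json .   # config.json content is provided in ./config.md
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer.json
+ # wget https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8/resolve/main/tokenizer_config.json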
7
+
8
+ # Launch vllm
9
+ # python3 -m vllm.entrypoints.openai.api_server --model ~/llama-3.1-405b-fp8-dummy/ --load-format dummy --disable-log-requests --tensor-parallel-size 8 --max-model-len 10000
10
+
11
+ # offline
12
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3000 --random-input 1024 --random-output 1024 > vllm_log11
13
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 4000 --random-input 1024 --random-output 512 > vllm_log12
14
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 800 --random-input 4096 --random-output 2048 > vllm_log13
15
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1500 --random-input 4096 --random-output 1024 > vllm_log14
16
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 6000 --random-input 256 --random-output 512 > vllm_log15
17
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name sharegpt --num-prompt 2000 > vllm_log21
18
+
19
+ # online
20
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 300 --request-rate 1 --random-input 1024 --random-output 1024 > vllm_log31
21
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 600 --request-rate 2 --random-input 1024 --random-output 1024 > vllm_log32
22
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 1200 --request-rate 4 --random-input 1024 --random-output 1024 > vllm_log33
23
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 2400 --request-rate 8 --random-input 1024 --random-output 1024 > vllm_log34
24
+ python3 ../../python/sglang/bench_serving.py --backend vllm --dataset-name random --num-prompt 3200 --request-rate 16 --random-input 1024 --random-output 1024 > vllm_log35
sglang/benchmark/dspy/README.md ADDED
@@ -0,0 +1,51 @@
1
+ ## Install
2
+
3
+ ```
4
+ pip3 install dspy-ai
5
+ ```
6
+
7
+ Turn off the cache at https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/dsp/modules/cache_utils.py#L10:
8
+ ```
9
+ cache_turn_on = False
10
+ ```
11
+
12
+ Alternatively, set the environment variable:
13
+
14
+ ```
15
+ export DSP_CACHEBOOL=false
16
+ ```
17
+
18
+ ## Benchmark SGLang
19
+ ```
20
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
21
+ ```
22
+
23
+ ```
24
+ python3 bench_dspy_intro.py --backend sglang
25
+ ```
26
+
27
+
28
+ ## Benchmark TGI
29
+ ```
30
+ docker run --name tgi --rm -ti --gpus all --network host \
31
+ -v /home/ubuntu/model_weights/Llama-2-7b-chat-hf:/Llama-2-7b-chat-hf \
32
+ ghcr.io/huggingface/text-generation-inference:1.3.0 \
33
+ --model-id /Llama-2-7b-chat-hf --num-shard 1 --trust-remote-code \
34
+ --max-input-length 2048 --max-total-tokens 4096 \
35
+ --port 24000
36
+ ```
37
+
38
+ ```
39
+ python3 bench_dspy_intro.py --backend tgi
40
+ ```
41
+
42
+
43
+
44
+ ## Benchmark vLLM
45
+ ```
46
+ python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
47
+ ```
48
+
49
+ ```
50
+ python3 bench_dspy_intro.py --backend vllm
51
+ ```
sglang/benchmark/dspy/bench_dspy_intro.py ADDED
@@ -0,0 +1,192 @@
1
+ """
2
+ Adapted from
3
+ https://github.com/stanfordnlp/dspy/blob/34d8420383ec752037aa271825c1d3bf391e1277/intro.ipynb#L9
4
+ """
5
+
6
+ import argparse
7
+
8
+ import dspy
9
+ from dspy.datasets import HotPotQA
10
+
11
+
12
+ class BasicQA(dspy.Signature):
13
+ """Answer questions with short factoid answers."""
14
+
15
+ question = dspy.InputField()
16
+ answer = dspy.OutputField(desc="often between 1 and 5 words")
17
+
18
+
19
+ class GenerateAnswer(dspy.Signature):
20
+ """Answer questions with short factoid answers."""
21
+
22
+ context = dspy.InputField(desc="may contain relevant facts")
23
+ question = dspy.InputField()
24
+ answer = dspy.OutputField(desc="often between 1 and 5 words")
25
+
26
+
27
+ class RAG(dspy.Module):
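+ """Retrieve top-k passages for the question, then answer with chain-of-thought over the retrieved context."""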
28
+ def __init__(self, num_passages=3):
29
+ super().__init__()
30
+
31
+ self.retrieve = dspy.Retrieve(k=num_passages)
32
+ self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
33
+
34
+ def forward(self, question):
35
+ context = self.retrieve(question).passages
36
+ prediction = self.generate_answer(context=context, question=question)
37
+ return dspy.Prediction(context=context, answer=prediction.answer)
38
+
39
+
40
+ def main(args):
41
+ # lm = dspy.OpenAI(model='gpt-3.5-turbo')
42
+ if args.backend == "tgi":
43
+ lm = dspy.HFClientTGI(
44
+ model="meta-llama/Llama-2-7b-chat-hf",
45
+ port=args.port,
46
+ url="http://localhost",
47
+ )
48
+ elif args.backend == "sglang":
49
+ lm = dspy.HFClientSGLang(
50
+ model="meta-llama/Llama-2-7b-chat-hf",
51
+ port=args.port,
52
+ url="http://localhost",
53
+ )
54
+ elif args.backend == "vllm":
55
+ lm = dspy.HFClientVLLM(
56
+ model="meta-llama/Llama-2-7b-chat-hf",
57
+ port=args.port,
58
+ url="http://localhost",
59
+ )
60
+ else:
61
+ raise ValueError(f"Invalid backend: {args.backend}")
62
+
63
+ colbertv2_wiki17_abstracts = dspy.ColBERTv2(
64
+ url="http://20.102.90.50:2017/wiki17_abstracts"
65
+ )
66
+ dspy.settings.configure(lm=lm, rm=colbertv2_wiki17_abstracts)
67
+
68
+ # Load the dataset.
69
+ dataset = HotPotQA(
70
+ train_seed=1, train_size=20, eval_seed=2023, dev_size=args.dev_size, test_size=0
71
+ )
72
+
73
+ # Tell DSPy that the 'question' field is the input. Any other fields are labels and/or metadata.
74
+ trainset = [x.with_inputs("question") for x in dataset.train]
75
+ devset = [x.with_inputs("question") for x in dataset.dev]
76
+
77
+ print(len(trainset), len(devset))
78
+
79
+ train_example = trainset[0]
80
+ print(f"Question: {train_example.question}")
81
+ print(f"Answer: {train_example.answer}")
82
+
83
+ dev_example = devset[18]
84
+ print(f"Question: {dev_example.question}")
85
+ print(f"Answer: {dev_example.answer}")
86
+ print(f"Relevant Wikipedia Titles: {dev_example.gold_titles}")
87
+
88
+ print(
89
+ f"For this dataset, training examples have input keys {train_example.inputs().keys()} and label keys {train_example.labels().keys()}"
90
+ )
91
+ print(
92
+ f"For this dataset, dev examples have input keys {dev_example.inputs().keys()} and label keys {dev_example.labels().keys()}"
93
+ )
94
+
95
+ # Define the predictor.
96
+ generate_answer = dspy.Predict(BasicQA)
97
+
98
+ # Call the predictor on a particular input.
99
+ pred = generate_answer(question=dev_example.question)
100
+
101
+ # Print the input and the prediction.
102
+ print(f"Question: {dev_example.question}")
103
+ print(f"Predicted Answer: {pred.answer}")
104
+
105
+ lm.inspect_history(n=1)
106
+
107
+ # Define the predictor. Notice we're just changing the class. The signature BasicQA is unchanged.
108
+ generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)
109
+
110
+ # Call the predictor on the same input.
111
+ pred = generate_answer_with_chain_of_thought(question=dev_example.question)
112
+
113
+ # Print the input, the chain of thought, and the prediction.
114
+ print(f"Question: {dev_example.question}")
115
+ print(f"Thought: {pred.rationale.split('.', 1)[1].strip()}")
116
+ print(f"Predicted Answer: {pred.answer}")
117
+
118
+ retrieve = dspy.Retrieve(k=3)
119
+ topK_passages = retrieve(dev_example.question).passages
120
+
121
+ print(
122
+ f"Top {retrieve.k} passages for question: {dev_example.question} \n",
123
+ "-" * 30,
124
+ "\n",
125
+ )
126
+
127
+ for idx, passage in enumerate(topK_passages):
128
+ print(f"{idx+1}]", passage, "\n")
129
+
130
+ retrieve("When was the first FIFA World Cup held?").passages[0]
131
+
132
+ from dspy.teleprompt import BootstrapFewShot
133
+
134
+ # Validation logic: check that the predicted answer is correct.
135
+ # Also check that the retrieved context does actually contain that answer.
136
+ def validate_context_and_answer(example, pred, trace=None):
137
+ answer_EM = dspy.evaluate.answer_exact_match(example, pred)
138
+ answer_PM = dspy.evaluate.answer_passage_match(example, pred)
139
+ return answer_EM and answer_PM
140
+
141
+ # Set up a basic teleprompter, which will compile our RAG program.
142
+ teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
143
+
144
+ # Compile!
145
+ compiled_rag = teleprompter.compile(RAG(), trainset=trainset)
146
+
147
+ # Ask any question you like to this simple RAG program.
148
+ my_question = "What castle did David Gregory inherit?"
149
+
150
+ # Get the prediction. This contains `pred.context` and `pred.answer`.
151
+ pred = compiled_rag(my_question)
152
+
153
+ # Print the contexts and the answer.
154
+ print(f"Question: {my_question}")
155
+ print(f"Predicted Answer: {pred.answer}")
156
+ print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")
157
+
158
+ from dspy.evaluate.evaluate import Evaluate
159
+
160
+ # Set up the `evaluate_on_hotpotqa` function. We'll use this many times below.
161
+ evaluate_on_hotpotqa = Evaluate(
162
+ devset=devset,
163
+ num_threads=args.num_threads,
164
+ display_progress=True,
165
+ display_table=5,
166
+ )
167
+
168
+ # Evaluate the `compiled_rag` program with the `answer_exact_match` metric.
169
+ metric = dspy.evaluate.answer_exact_match
170
+ evaluate_on_hotpotqa(compiled_rag, metric=metric)
171
+
172
+
173
+ if __name__ == "__main__":
174
+ parser = argparse.ArgumentParser()
175
+ parser.add_argument("--port", type=int)
176
+ parser.add_argument("--num-threads", type=int, default=32)
177
+ parser.add_argument("--dev-size", type=int, default=150)
178
+ parser.add_argument(
179
+ "--backend", type=str, choices=["sglang", "tgi", "vllm"], default="sglang"
180
+ )
181
+ args = parser.parse_args()
182
+
183
+ if args.port is None:
184
+ default_port = {
185
+ "vllm": 21000,
186
+ "lightllm": 22000,
187
+ "tgi": 24000,
188
+ "sglang": 30000,
189
+ }
190
+ args.port = default_port.get(args.backend, None)
191
+
192
+ main(args)
sglang/benchmark/gsm8k/README.md ADDED
@@ -0,0 +1,47 @@
1
+ ## Run benchmark
2
+
3
+ ### Benchmark sglang
4
+ ```
5
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
6
+ ```
7
+
8
+ ```
9
+ python3 bench_sglang.py --num-questions 200
10
+ ```
11
+
12
+
13
+ ### Benchmark vllm
14
+ ```
15
+ python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
16
+ ```
17
+
18
+ ```
19
+ python3 bench_other.py --num-questions 200 --backend vllm
20
+ ```
21
+
22
+
23
+ ### Benchmark lightllm
24
+ ```
25
+ # A10G
26
+ python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
27
+ ```
28
+
29
+ ```
30
+ python3 bench_other.py --num-questions 200 --backend lightllm
31
+ ```
32
+
33
+
34
+ ### Benchmark guidance
35
+ ```
36
+ python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
37
+ ```
38
+
39
+
40
+ ### Benchmark lmql
41
+ ```
42
+ CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
43
+ ```
44
+
45
+ ```
46
+ python3 bench_other.py --num-questions 100 --backend lmql --parallel 2
47
+ ```
sglang/benchmark/gsm8k/bench_other.py ADDED
@@ -0,0 +1,151 @@
1
+ import argparse
2
+ import ast
3
+ import asyncio
4
+ import json
5
+ import re
6
+ import time
7
+ from concurrent.futures import ThreadPoolExecutor
8
+
9
+ import numpy as np
10
+ from tqdm import tqdm
11
+
12
+ from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
13
+ from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
14
+
15
+ INVALID = -9999999
16
+
17
+
18
+ def get_one_example(lines, i, include_answer):
19
+ ret = "Question: " + lines[i]["question"] + "\nAnswer:"
20
+ if include_answer:
21
+ ret += " " + lines[i]["answer"]
22
+ return ret
23
+
24
+
25
+ def get_few_shot_examples(lines, k):
26
+ ret = ""
27
+ for i in range(k):
28
+ ret += get_one_example(lines, i, True) + "\n\n"
29
+ return ret
30
+
31
+
32
+ def get_answer_value(answer_str):
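+ # Extract the last number in the answer string as the predicted value; return INVALID if none is found.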
33
+ answer_str = answer_str.replace(",", "")
34
+ numbers = re.findall(r"\d+", answer_str)
35
+ if len(numbers) < 1:
36
+ return INVALID
37
+ try:
38
+ return ast.literal_eval(numbers[-1])
39
+ except SyntaxError:
40
+ return INVALID
41
+
42
+
43
+ def main(args):
44
+ # Select backend
45
+ call_generate = get_call_generate(args)
46
+
47
+ # Read data
48
+ url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
49
+ filename = download_and_cache_file(url)
50
+ lines = list(read_jsonl(filename))
51
+
52
+ # Construct prompts
53
+ num_questions = args.num_questions
54
+ num_shots = args.num_shots
55
+ few_shot_examples = get_few_shot_examples(lines, num_shots)
56
+
57
+ questions = []
58
+ labels = []
59
+ for i in range(len(lines[:num_questions])):
60
+ questions.append(get_one_example(lines, i, False))
61
+ labels.append(get_answer_value(lines[i]["answer"]))
62
+ assert all(l != INVALID for l in labels)
63
+
64
+ states = [None] * len(labels)
65
+
66
+ # Run requests
67
+ if args.backend != "lmql":
68
+ # Use thread pool
69
+ def get_one_answer(i):
70
+ answer = call_generate(
71
+ prompt=few_shot_examples + questions[i],
72
+ temperature=0,
73
+ max_tokens=256,
74
+ stop=["Question", "Assistant:", "<|separator|>"],
75
+ )
76
+ states[i] = answer
77
+
78
+ tic = time.time()
79
+ if args.parallel == 1:
80
+ for i in tqdm(range(len(questions))):
81
+ get_one_answer(i)
82
+ else:
83
+ with ThreadPoolExecutor(args.parallel) as executor:
84
+ list(
85
+ tqdm(
86
+ executor.map(get_one_answer, list(range(len(questions)))),
87
+ total=len(questions),
88
+ )
89
+ )
90
+
91
+ else:
92
+ # Use asyncio
93
+ async def batched_call(batch_size):
94
+ for i in range(0, len(questions), batch_size):
95
+ tasks = []
96
+ for q in questions[i : i + batch_size]:
97
+ tasks.append(
98
+ call_generate(
99
+ few_shot_examples + q,
100
+ temperature=0,
101
+ max_tokens=256,
102
+ stop="Question",
103
+ )
104
+ )
105
+ rets = await asyncio.gather(*tasks)
106
+ for j in range(len(rets)):
107
+ states[i + j] = rets[j]
108
+
109
+ tic = time.time()
110
+ asyncio.run(batched_call(batch_size=args.parallel))
111
+ latency = time.time() - tic
112
+
113
+ preds = []
114
+ for i in range(len(states)):
115
+ preds.append(get_answer_value(states[i]))
116
+
117
+ # Compute accuracy
118
+ acc = np.mean(np.array(preds) == np.array(labels))
119
+ invalid = np.mean(np.array(preds) == INVALID)
120
+
121
+ # Print results
122
+ print(f"Accuracy: {acc:.3f}")
123
+ print(f"Invalid: {invalid:.3f}")
124
+ print(f"Latency: {latency:.3f} s")
125
+
126
+ # Dump results
127
+ dump_state_text(f"tmp_output_{args.backend}.txt", states)
128
+
129
+ with open(args.result_file, "a") as fout:
130
+ value = {
131
+ "task": "gsm8k",
132
+ "backend": args.backend,
133
+ "num_gpus": 1,
134
+ "latency": round(latency, 3),
135
+ "accuracy": round(acc, 3),
136
+ "num_requests": args.num_questions,
137
+ "other": {
138
+ "num_questions": args.num_questions,
139
+ "parallel": args.parallel,
140
+ },
141
+ }
142
+ fout.write(json.dumps(value) + "\n")
143
+
144
+
145
+ if __name__ == "__main__":
146
+ parser = argparse.ArgumentParser()
147
+ parser.add_argument("--num-shots", type=int, default=5)
148
+ parser.add_argument("--data-path", type=str, default="test.jsonl")
149
+ parser.add_argument("--num-questions", type=int, default=200)
150
+ args = add_common_other_args_and_parse(parser)
151
+ main(args)
sglang/benchmark/gsm8k/bench_sglang.py ADDED
@@ -0,0 +1,141 @@
1
+ import argparse
2
+ import ast
3
+ import json
4
+ import re
5
+ import time
6
+
7
+ import numpy as np
8
+
9
+ from sglang.api import set_default_backend
10
+ from sglang.test.test_utils import (
11
+ add_common_sglang_args_and_parse,
12
+ select_sglang_backend,
13
+ )
14
+ from sglang.utils import download_and_cache_file, dump_state_text, read_jsonl
15
+
16
+ INVALID = -9999999
17
+
18
+
19
+ def get_one_example(lines, i, include_answer):
20
+ ret = "Question: " + lines[i]["question"] + "\nAnswer:"
21
+ if include_answer:
22
+ ret += " " + lines[i]["answer"]
23
+ return ret
24
+
25
+
26
+ def get_few_shot_examples(lines, k):
27
+ ret = ""
28
+ for i in range(k):
29
+ ret += get_one_example(lines, i, True) + "\n\n"
30
+ return ret
31
+
32
+
33
+ def get_answer_value(answer_str):
34
+ answer_str = answer_str.replace(",", "")
35
+ numbers = re.findall(r"\d+", answer_str)
36
+ if len(numbers) < 1:
37
+ return INVALID
38
+ try:
39
+ return ast.literal_eval(numbers[-1])
40
+ except SyntaxError:
41
+ return INVALID
42
+
43
+
44
+ def main(args):
45
+ # Select backend
46
+ set_default_backend(select_sglang_backend(args))
47
+
48
+ # Read data
49
+ url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
50
+ filename = download_and_cache_file(url)
51
+ lines = list(read_jsonl(filename))
52
+
53
+ # Construct prompts
54
+ num_questions = args.num_questions
55
+ num_shots = args.num_shots
56
+ few_shot_examples = get_few_shot_examples(lines, num_shots)
57
+
58
+ questions = []
59
+ labels = []
60
+ for i in range(len(lines[:num_questions])):
61
+ questions.append(get_one_example(lines, i, False))
62
+ labels.append(get_answer_value(lines[i]["answer"]))
63
+ assert all(l != INVALID for l in labels)
64
+ arguments = [{"question": q} for q in questions]
65
+
66
+ #####################################
67
+ ######### SGL Program Begin #########
68
+ #####################################
69
+
70
+ import sglang as sgl
71
+
72
+ @sgl.function
73
+ def few_shot_gsm8k(s, question):
74
+ s += few_shot_examples + question
75
+ s += sgl.gen(
76
+ "answer", max_tokens=512, stop=["Question", "Assistant:", "<|separator|>"]
77
+ )
78
+
79
+ #####################################
80
+ ########## SGL Program End ##########
81
+ #####################################
82
+
83
+ # Run requests
84
+ tic = time.time()
85
+ states = few_shot_gsm8k.run_batch(
86
+ arguments,
87
+ temperature=0,
88
+ num_threads=args.parallel,
89
+ progress_bar=True,
90
+ )
91
+ latency = time.time() - tic
92
+
93
+ preds = []
94
+ for i in range(len(states)):
95
+ preds.append(get_answer_value(states[i]["answer"]))
96
+
97
+ # print(f"{preds=}")
98
+ # print(f"{labels=}")
99
+
100
+ # Compute accuracy
101
+ acc = np.mean(np.array(preds) == np.array(labels))
102
+ invalid = np.mean(np.array(preds) == INVALID)
103
+
104
+ # Compute speed
105
+ num_output_tokens = sum(
106
+ s.get_meta_info("answer")["completion_tokens"] for s in states
107
+ )
108
+ output_throughput = num_output_tokens / latency
109
+
110
+ # Print results
111
+ print(f"Accuracy: {acc:.3f}")
112
+ print(f"Invalid: {invalid:.3f}")
113
+ print(f"Latency: {latency:.3f} s")
114
+ print(f"Output throughput: {output_throughput:.3f} token/s")
115
+
116
+ # Dump results
117
+ dump_state_text(f"tmp_output_{args.backend}.txt", states)
118
+
119
+ with open(args.result_file, "a") as fout:
120
+ value = {
121
+ "task": "gsm8k",
122
+ "backend": args.backend,
123
+ "num_gpus": 1,
124
+ "latency": round(latency, 3),
125
+ "accuracy": round(acc, 3),
126
+ "num_requests": args.num_questions,
127
+ "other": {
128
+ "num_questions": args.num_questions,
129
+ "parallel": args.parallel,
130
+ },
131
+ }
132
+ fout.write(json.dumps(value) + "\n")
133
+
134
+
135
+ if __name__ == "__main__":
136
+ parser = argparse.ArgumentParser()
137
+ parser.add_argument("--num-shots", type=int, default=5)
138
+ parser.add_argument("--data-path", type=str, default="test.jsonl")
139
+ parser.add_argument("--num-questions", type=int, default=200)
140
+ args = add_common_sglang_args_and_parse(parser)
141
+ main(args)
sglang/benchmark/hellaswag/README.md ADDED
@@ -0,0 +1,47 @@
1
+ ## Run benchmark
2
+
3
+ ### Benchmark sglang
4
+ ```
5
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
6
+ ```
7
+
8
+ ```
9
+ python3 bench_sglang.py --num-questions 200
10
+ ```
11
+
12
+
13
+ ### Benchmark vllm
14
+ ```
15
+ python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
16
+ ```
17
+
18
+ ```
19
+ python3 bench_other.py --num-questions 200 --backend vllm
20
+ ```
21
+
22
+
23
+ ### Benchmark lightllm
24
+ ```
25
+ # A10G
26
+ python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
27
+ ```
28
+
29
+ ```
30
+ python3 bench_other.py --num-questions 200 --backend lightllm
31
+ ```
32
+
33
+
34
+ ### Benchmark guidance
35
+ ```
36
+ CUDA_VISIBLE_DEVICES=0,1 python3 bench_other.py --num-questions 200 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
37
+ ```
38
+
39
+
40
+ ### Benchmark lmql
41
+ ```
42
+ lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
43
+ ```
44
+
45
+ ```
46
+ python3 bench_other.py --num-questions 200 --backend lmql --port 23000 --parallel 1
47
+ ```
sglang/benchmark/hellaswag/bench_other.py ADDED
@@ -0,0 +1,118 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ import time
5
+ from concurrent.futures import ThreadPoolExecutor
6
+
7
+ import numpy as np
8
+ from tqdm import tqdm
9
+
10
+ from sglang.test.test_utils import add_common_other_args_and_parse, get_call_select
11
+ from sglang.utils import download_and_cache_file, read_jsonl
12
+
13
+
14
+ def get_one_example(lines, i, include_answer):
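+ # Format one HellaSwag example as "activity_label: context"; the gold ending is appended only for few-shot examples.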
15
+ ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
16
+ if include_answer:
17
+ ret += lines[i]["endings"][lines[i]["label"]]
18
+ return ret
19
+
20
+
21
+ def get_few_shot_examples(lines, k):
22
+ ret = ""
23
+ for i in range(k):
24
+ ret += get_one_example(lines, i, True) + "\n\n"
25
+ return ret
26
+
27
+
28
+ def main(args):
29
+ # Select backend
30
+ call_select = get_call_select(args)
31
+
32
+ # Read data
33
+ url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
34
+ filename = download_and_cache_file(url)
35
+ lines = list(read_jsonl(filename))
36
+
37
+ # Construct prompts
38
+ num_questions = args.num_questions
39
+ num_shots = args.num_shots
40
+ few_shot_examples = get_few_shot_examples(lines, num_shots)
41
+
42
+ questions = []
43
+ choices = []
44
+ labels = []
45
+ for i in range(len(lines[:num_questions])):
46
+ questions.append(get_one_example(lines, i, False))
47
+ choices.append(lines[i]["endings"])
48
+ labels.append(lines[i]["label"])
49
+
50
+ preds = [None] * len(labels)
51
+
52
+ # Run requests
53
+ if args.backend != "lmql":
54
+ # Use thread pool
55
+ def get_one_answer(i):
56
+ preds[i] = call_select(
57
+ context=few_shot_examples + questions[i], choices=choices[i]
58
+ )
59
+
60
+ tic = time.time()
61
+ if args.parallel == 1:
62
+ for i in tqdm(range(len(questions))):
63
+ get_one_answer(i)
64
+ else:
65
+ with ThreadPoolExecutor(args.parallel) as executor:
66
+ list(
67
+ tqdm(
68
+ executor.map(get_one_answer, list(range(len(questions)))),
69
+ total=len(questions),
70
+ )
71
+ )
72
+ else:
73
+ # Use asyncio
74
+ async def batched_call(batch_size):
75
+ for i in range(0, len(questions), batch_size):
76
+ tasks = []
77
+ for q, c in zip(
78
+ questions[i : i + batch_size], choices[i : i + batch_size]
79
+ ):
80
+ tasks.append(call_select(context=few_shot_examples + q, choices=c))
81
+ rets = await asyncio.gather(*tasks)
82
+ for j in range(len(rets)):
83
+ preds[i + j] = rets[j]
84
+
85
+ tic = time.time()
86
+ asyncio.run(batched_call(batch_size=args.parallel))
87
+
88
+ latency = time.time() - tic
89
+
90
+ # Compute accuracy
91
+ acc = np.mean(np.array(preds) == np.array(labels))
92
+ print(f"Latency: {latency:.3f}")
93
+ print(f"Accuracy: {acc:.3f}")
94
+
95
+ # Write results
96
+ with open(args.result_file, "a") as fout:
97
+ value = {
98
+ "task": "hellaswag",
99
+ "backend": args.backend,
100
+ "num_gpus": 1,
101
+ "latency": round(latency, 3),
102
+ "accuracy": round(acc, 3),
103
+ "num_requests": args.num_questions,
104
+ "other": {
105
+ "num_questions": args.num_questions,
106
+ "parallel": args.parallel,
107
+ },
108
+ }
109
+ fout.write(json.dumps(value) + "\n")
110
+
111
+
112
+ if __name__ == "__main__":
113
+ parser = argparse.ArgumentParser()
114
+ parser.add_argument("--num-shots", type=int, default=20)
115
+ parser.add_argument("--data-path", type=str, default="hellaswag_val.jsonl")
116
+ parser.add_argument("--num-questions", type=int, default=200)
117
+ args = add_common_other_args_and_parse(parser)
118
+ main(args)
sglang/benchmark/lora/launch_server.py ADDED
@@ -0,0 +1,47 @@
1
+ import argparse
2
+ import os
3
+
4
+ NUM_LORAS = 8
5
+ LORA_PATH = {
6
+ "base": "mistralai/Mistral-7B-Instruct-v0.3",
7
+ "lora": "/home/ying/test_lora",
8
+ }
9
+
10
+
11
+ def launch_server(args):
12
+ base_path = LORA_PATH["base"]
13
+ lora_path = LORA_PATH["lora"]
14
+
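+ # Either serve only the base model, or additionally register NUM_LORAS copies of the same
+ # adapter under the names lora0, lora1, ... so that benchmark requests can target any of them.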
15
+ if args.base_only:
16
+ cmd = f"python3 -m sglang.launch_server --model {base_path} "
17
+ else:
18
+ cmd = f"python3 -m sglang.launch_server --model {base_path} --lora-paths "
19
+ for i in range(NUM_LORAS):
20
+ lora_name = f"lora{i}"
21
+ cmd += f"{lora_name}={lora_path} "
22
+ cmd += f"--disable-radix --disable-cuda-graph "
23
+ cmd += f"--max-loras-per-batch {args.max_loras_per_batch} "
24
+ cmd += f"--max-running-requests {args.max_running_requests}"
25
+ print(cmd)
26
+ os.system(cmd)
27
+
28
+
29
+ if __name__ == "__main__":
30
+ parser = argparse.ArgumentParser()
31
+ parser.add_argument(
32
+ "--base-only",
33
+ action="store_true",
34
+ )
35
+ parser.add_argument(
36
+ "--max-loras-per-batch",
37
+ type=int,
38
+ default=8,
39
+ )
40
+ parser.add_argument(
41
+ "--max-running-requests",
42
+ type=int,
43
+ default=8,
44
+ )
45
+ args = parser.parse_args()
46
+
47
+ launch_server(args)
sglang/benchmark/lora/lora_bench.py ADDED
@@ -0,0 +1,484 @@
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
14
+
15
+ import argparse
16
+ import asyncio
17
+ import json
18
+ import os
19
+ import random
20
+ import resource
21
+ import sys
22
+ import time
23
+ import traceback
24
+ import warnings
25
+ from argparse import ArgumentParser
26
+ from dataclasses import dataclass, field
27
+ from datetime import datetime
28
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
29
+
30
+ import aiohttp
31
+ import numpy as np
32
+ import requests
33
+ from launch_server import LORA_PATH, NUM_LORAS
34
+ from tqdm.asyncio import tqdm
35
+ from transformers import (
36
+ AutoTokenizer,
37
+ PreTrainedTokenizer,
38
+ PreTrainedTokenizerBase,
39
+ PreTrainedTokenizerFast,
40
+ )
41
+
42
+ from sglang.bench_serving import (
43
+ AIOHTTP_TIMEOUT,
44
+ SHAREGPT_URL,
45
+ BenchmarkMetrics,
46
+ RequestFuncInput,
47
+ RequestFuncOutput,
48
+ calculate_metrics,
49
+ check_chat_template,
50
+ get_model,
51
+ get_request,
52
+ get_tokenizer,
53
+ parse_request_rate_range,
54
+ remove_prefix,
55
+ sample_random_requests,
56
+ )
57
+
58
+ global args
59
+
60
+
61
+ # set ignore_eos True by default
62
+ async def async_request_openai_completions(
63
+ request_func_input: RequestFuncInput,
64
+ pbar: Optional[tqdm] = None,
65
+ ) -> RequestFuncOutput:
66
+ api_url = request_func_input.api_url
67
+ # assert api_url.endswith(
68
+ # "completions"
69
+ # ), "OpenAI Completions API URL must end with 'completions'."
70
+
71
+ prompt = request_func_input.prompt
72
+
73
+ async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
74
+ # payload = {
75
+ # "model": request_func_input.model,
76
+ # "prompt": prompt,
77
+ # "temperature": 0.0,
78
+ # "best_of": 1,
79
+ # "max_tokens": request_func_input.output_len,
80
+ # "stream": not args.disable_stream,
81
+ # "ignore_eos": not args.disable_ignore_eos,
82
+ # **request_func_input.extra_request_body,
83
+ # }
84
+ # headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
85
+ if args.base_only:
86
+ payload = {
87
+ "text": prompt,
88
+ "sampling_params": {"max_new_tokens": request_func_input.output_len},
89
+ }
90
+ else:
91
+ payload = {
92
+ "text": prompt,
93
+ "sampling_params": {"max_new_tokens": request_func_input.output_len},
94
+ "lora_path": f"lora{random.randint(0, NUM_LORAS - 1)}",
95
+ }
96
+ headers = {"Authorization": ""}
97
+
98
+ output = RequestFuncOutput()
99
+ output.prompt_len = request_func_input.prompt_len
100
+
101
+ generated_text = ""
102
+ ttft = 0.0
103
+ st = time.perf_counter()
104
+ most_recent_timestamp = st
105
+ try:
106
+ async with session.post(
107
+ url=api_url, json=payload, headers=headers
108
+ ) as response:
109
+ if response.status == 200:
110
+ async for chunk_bytes in response.content:
111
+ chunk_bytes = chunk_bytes.strip()
112
+ if not chunk_bytes:
113
+ continue
114
+
115
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
116
+ latency = time.perf_counter() - st
117
+ if chunk == "[DONE]":
118
+ pass
119
+ else:
120
+ data = json.loads(chunk)
121
+
122
+ # NOTE: Some completion API might have a last
123
+ # usage summary response without a token so we
124
+ # want to check a token was generated
125
+ if data["text"]:
126
+ # if data["choices"][0]["text"]:
127
+ timestamp = time.perf_counter()
128
+ # First token
129
+ if ttft == 0.0:
130
+ ttft = time.perf_counter() - st
131
+ output.ttft = ttft
132
+
133
+ # Decoding phase
134
+ else:
135
+ output.itl.append(timestamp - most_recent_timestamp)
136
+
137
+ most_recent_timestamp = timestamp
138
+ # generated_text += data["choices"][0]["text"]
139
+ generated_text += data["text"]
140
+
141
+ output.generated_text = generated_text
142
+ output.success = True
143
+ output.latency = latency
144
+ output.output_len = request_func_input.output_len
145
+ else:
146
+ output.error = response.reason or ""
147
+ output.success = False
148
+ except Exception:
149
+ output.success = False
150
+ exc_info = sys.exc_info()
151
+ output.error = "".join(traceback.format_exception(*exc_info))
152
+
153
+ if pbar:
154
+ pbar.update(1)
155
+ return output
156
+
157
+
158
+ ASYNC_REQUEST_FUNCS = {
159
+ "sglang": async_request_openai_completions,
160
+ }
161
+
162
+
163
+ async def benchmark(
164
+ backend: str,
165
+ api_url: str,
166
+ model_id: str,
167
+ tokenizer: PreTrainedTokenizerBase,
168
+ input_requests: List[Tuple[str, int, int]],
169
+ request_rate: float,
170
+ disable_tqdm: bool,
171
+ extra_request_body: Dict[str, Any],
172
+ ):
173
+ if backend in ASYNC_REQUEST_FUNCS:
174
+ request_func = ASYNC_REQUEST_FUNCS[backend]
175
+ else:
176
+ raise ValueError(f"Unknown backend: {backend}")
177
+
178
+ print("Starting initial single prompt test run...")
179
+ test_prompt, test_prompt_len, test_output_len = input_requests[0]
180
+ test_input = RequestFuncInput(
181
+ model=model_id,
182
+ prompt=test_prompt,
183
+ api_url=api_url,
184
+ prompt_len=test_prompt_len,
185
+ output_len=test_output_len,
186
+ extra_request_body=extra_request_body,
187
+ )
188
+ test_output = await request_func(request_func_input=test_input)
189
+ if not test_output.success:
190
+ raise ValueError(
191
+ "Initial test run failed - Please make sure benchmark arguments "
192
+ f"are correctly specified. Error: {test_output.error}"
193
+ )
194
+ else:
195
+ print("Initial test run completed. Starting main benchmark run...")
196
+
197
+ pbar = None if disable_tqdm else tqdm(total=len(input_requests))
198
+
199
+ benchmark_start_time = time.perf_counter()
200
+ tasks: List[asyncio.Task] = []
201
+ async for request in get_request(input_requests, request_rate):
202
+ prompt, prompt_len, output_len = request
203
+ request_func_input = RequestFuncInput(
204
+ model=model_id,
205
+ prompt=prompt,
206
+ api_url=api_url,
207
+ prompt_len=prompt_len,
208
+ output_len=output_len,
209
+ extra_request_body=extra_request_body,
210
+ )
211
+ tasks.append(
212
+ asyncio.create_task(
213
+ request_func(request_func_input=request_func_input, pbar=pbar)
214
+ )
215
+ )
216
+ outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
217
+
218
+ if pbar is not None:
219
+ pbar.close()
220
+
221
+ benchmark_duration = time.perf_counter() - benchmark_start_time
222
+
223
+ metrics, output_lens = calculate_metrics(
224
+ input_requests=input_requests,
225
+ outputs=outputs,
226
+ dur_s=benchmark_duration,
227
+ tokenizer=tokenizer,
228
+ backend=backend,
229
+ )
230
+
231
+ print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
232
+ print("{:<40} {:<10}".format("Backend:", backend))
233
+ print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
234
+ print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
235
+ print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
236
+ print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
237
+ print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
238
+ print(
239
+ "{:<40} {:<10}".format(
240
+ "Total generated tokens (retokenized):", metrics.total_output_retokenized
241
+ )
242
+ )
243
+ print(
244
+ "{:<40} {:<10.2f}".format(
245
+ "Request throughput (req/s):", metrics.request_throughput
246
+ )
247
+ )
248
+ print(
249
+ "{:<40} {:<10.2f}".format(
250
+ "Input token throughput (tok/s):", metrics.input_throughput
251
+ )
252
+ )
253
+ print(
254
+ "{:<40} {:<10.2f}".format(
255
+ "Output token throughput (tok/s):", metrics.output_throughput
256
+ )
257
+ )
258
+ print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
259
+ print(
260
+ "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
261
+ )
262
+ print(
263
+ "{:<40} {:<10.2f}".format(
264
+ "Median E2E Latency (ms):", metrics.median_e2e_latency_ms
265
+ )
266
+ )
267
+ print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
268
+ print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
269
+ print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
270
+ print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
271
+ print(
272
+ "{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-")
273
+ )
274
+ print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
275
+ print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
276
+ print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
277
+ print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
278
+ print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
279
+ print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
280
+ print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
281
+ print("=" * 50)
282
+
283
+ if (
284
+ metrics.median_ttft_ms is not None
285
+ and metrics.mean_itl_ms is not None
286
+ and metrics.output_throughput is not None
287
+ ):
288
+ result = {
289
+ "backend": args.backend,
290
+ "request_rate": request_rate,
291
+ "total_input_tokens": metrics.total_input,
292
+ "total_output_tokens": metrics.total_output,
293
+ "total_output_tokens_retokenized": metrics.total_output_retokenized,
294
+ "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
295
+ "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
296
+ "median_ttft_ms": metrics.median_ttft_ms,
297
+ "median_itl_ms": metrics.median_itl_ms,
298
+ "output_throughput": metrics.output_throughput,
299
+ "random_input_len": args.random_input_len,
300
+ "random_output_len": args.random_output_len,
301
+ "random_range_ratio": args.random_range_ratio,
302
+ "duration": benchmark_duration,
303
+ "completed": metrics.completed,
304
+ }
305
+ else:
306
+ print(f"Error running benchmark for request rate: {request_rate}")
307
+ print("-" * 30)
308
+
309
+ # Determine output file name
310
+ if args.output_file:
311
+ output_file_name = args.output_file
312
+ else:
313
+ now = datetime.now().strftime("%m%d")
314
+ output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
315
+
316
+ # Append results to a JSONL file
317
+ with open(output_file_name, "a") as file:
318
+ file.write(json.dumps(result) + "\n")
319
+
320
+ result = {
321
+ "duration": benchmark_duration,
322
+ "completed": metrics.completed,
323
+ "total_input_tokens": metrics.total_input,
324
+ "total_output_tokens": metrics.total_output,
325
+ "total_output_tokens_retokenized": metrics.total_output_retokenized,
326
+ "request_throughput": metrics.request_throughput,
327
+ "input_throughput": metrics.input_throughput,
328
+ "output_throughput": metrics.output_throughput,
329
+ "mean_ttft_ms": metrics.mean_ttft_ms,
330
+ "median_ttft_ms": metrics.median_ttft_ms,
331
+ "std_ttft_ms": metrics.std_ttft_ms,
332
+ "p99_ttft_ms": metrics.p99_ttft_ms,
333
+ "mean_tpot_ms": metrics.mean_tpot_ms,
334
+ "median_tpot_ms": metrics.median_tpot_ms,
335
+ "std_tpot_ms": metrics.std_tpot_ms,
336
+ "p99_tpot_ms": metrics.p99_tpot_ms,
337
+ "mean_itl_ms": metrics.mean_itl_ms,
338
+ "median_itl_ms": metrics.median_itl_ms,
339
+ "std_itl_ms": metrics.std_itl_ms,
340
+ "p99_itl_ms": metrics.p99_itl_ms,
341
+ "input_lens": [output.prompt_len for output in outputs],
342
+ "output_lens": output_lens,
343
+ "ttfts": [output.ttft for output in outputs],
344
+ "itls": [output.itl for output in outputs],
345
+ "generated_texts": [output.generated_text for output in outputs],
346
+ "errors": [output.error for output in outputs],
347
+ "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
348
+ "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
349
+ }
350
+ return result
351
+
352
+
353
+ def run_benchmark(args_: argparse.Namespace):
354
+ global args
355
+ args = args_
356
+
357
+ # Set global environments
358
+ set_ulimit()
359
+ random.seed(args.seed)
360
+ np.random.seed(args.seed)
361
+
362
+ # Set url
363
+ if args.port is None:
364
+ args.port = {
365
+ "sglang": 30000,
366
+ }.get(args.backend, 30000)
367
+
368
+ # api_url = (
369
+ # f"{args.base_url}/v1/completions"
370
+ # if args.base_url
371
+ # else f"http://{args.host}:{args.port}/v1/completions"
372
+ # )
373
+ api_url = (
374
+ f"{args.base_url}/generate"
375
+ if args.base_url
376
+ else f"http://{args.host}:{args.port}/generate"
377
+ )
378
+
379
+ print(f"{args}\n")
380
+
381
+ # Read dataset
382
+ backend = args.backend
383
+ model_id = args.model = LORA_PATH["base"]
384
+ tokenizer_id = args.model
385
+
386
+ tokenizer = get_tokenizer(tokenizer_id)
387
+
388
+ input_requests = sample_random_requests(
389
+ input_len=args.random_input_len,
390
+ output_len=args.random_output_len,
391
+ num_prompts=args.num_prompts,
392
+ range_ratio=args.random_range_ratio,
393
+ tokenizer=tokenizer,
394
+ dataset_path="",
395
+ )
396
+
397
+ return asyncio.run(
398
+ benchmark(
399
+ backend=backend,
400
+ api_url=api_url,
401
+ model_id=model_id,
402
+ tokenizer=tokenizer,
403
+ input_requests=input_requests,
404
+ request_rate=args.request_rate,
405
+ disable_tqdm=False,
406
+ extra_request_body={},
407
+ )
408
+ )
409
+
410
+
411
+ def set_ulimit(target_soft_limit=65535):
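+ # Raise the soft limit on open file descriptors so that many concurrent benchmark connections do not fail.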
412
+ resource_type = resource.RLIMIT_NOFILE
413
+ current_soft, current_hard = resource.getrlimit(resource_type)
414
+
415
+ if current_soft < target_soft_limit:
416
+ try:
417
+ resource.setrlimit(resource_type, (target_soft_limit, current_hard))
418
+ except ValueError as e:
419
+ print(f"Fail to set RLIMIT_NOFILE: {e}")
420
+
421
+
422
+ if __name__ == "__main__":
423
+ parser = ArgumentParser(description="Benchmark the online lora serving throughput.")
424
+ parser.add_argument(
425
+ "--backend",
426
+ type=str,
427
+ choices=list(ASYNC_REQUEST_FUNCS.keys()),
428
+ default="sglang",
429
+ help="Which LLM inference engine backend to benchmark against.",
430
+ )
431
+ parser.add_argument(
432
+ "--base-url",
433
+ type=str,
434
+ default=None,
435
+ help="Server or API base url if not using http host and port.",
436
+ )
437
+ parser.add_argument(
438
+ "--host", type=str, default="0.0.0.0", help="Default host is 0.0.0.0."
439
+ )
440
+ parser.add_argument(
441
+ "--port",
442
+ type=int,
443
+ help="If not set, the default port for the selected backend is used.",
444
+ )
445
+ parser.add_argument(
446
+ "--num-prompts",
447
+ type=int,
448
+ default=50,
449
+ help="Number of prompts to process. Default is 50.",
450
+ )
451
+ parser.add_argument(
452
+ "--random-input-len",
453
+ type=int,
454
+ default=1024,
455
+ help="Number of input tokens per request, used only for random dataset.",
456
+ )
457
+ parser.add_argument(
458
+ "--random-output-len",
459
+ type=int,
460
+ default=128,
461
+ help="Number of output tokens per request, used only for random dataset.",
462
+ )
463
+ parser.add_argument(
464
+ "--random-range-ratio",
465
+ type=float,
466
+ default=0.0,
467
+ help="Range of sampled ratio of input/output length, "
468
+ "used only for random dataset.",
469
+ )
470
+ parser.add_argument(
471
+ "--request-rate",
472
+ type=float,
473
+ default=float("inf"),
474
+ help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
475
+ "Otherwise, we use a Poisson process to synthesize the request arrival times. Default is inf.",
476
+ )
477
+ parser.add_argument(
478
+ "--base-only",
479
+ action="store_true",
480
+ )
481
+ parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
482
+ parser.add_argument("--seed", type=int, default=1, help="The random seed.")
483
+ args = parser.parse_args()
484
+ run_benchmark(args)
sglang/benchmark/mmlu/README.md ADDED
@@ -0,0 +1,59 @@
1
+ ## Download data
2
+ ```
3
+ bash download_data.sh
4
+ ```
5
+
6
+ ## Run benchmark
7
+
8
+ ### Benchmark sglang
9
+ ```
10
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
11
+ ```
12
+
13
+ ```
14
+ python3 bench_sglang.py --nsub 10
15
+ ```
16
+
17
+ ```
18
+ # OpenAI models
19
+ python3 bench_sglang.py --backend gpt-3.5-turbo --parallel 8
20
+ ```
21
+
22
+ ### Benchmark vllm
23
+ ```
24
+ python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
25
+ ```
26
+
27
+ ```
28
+ python3 bench_other.py --nsub 10 --backend vllm
29
+ ```
30
+
31
+
32
+ ### Benchmark lightllm
33
+ ```
34
+ # A10G
35
+ python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
36
+
37
+ # V100
38
+ python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 4500 --port 22000
39
+ ```
40
+
41
+ ```
42
+ python3 bench_other.py --nsub 10 --backend lightllm
43
+ ```
44
+
45
+
46
+ ### Benchmark guidance
47
+ ```
48
+ python3 bench_other.py --nsub 10 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
49
+ ```
50
+
51
+
52
+ ### Benchmark lmql
53
+ ```
54
+ CUDA_VISIBLE_DEVICES=0,1 lmql serve-model meta-llama/Llama-2-7b-chat-hf --cuda --port 23000
55
+ ```
56
+
57
+ ```
58
+ python3 bench_other.py --nsub 10 --backend lmql --parallel 2
59
+ ```
sglang/benchmark/mtbench/README.md ADDED
@@ -0,0 +1,37 @@
1
+ ## Download Dataset
2
+
3
+ ```sh
4
+ wget -O question.jsonl https://raw.githubusercontent.com/lm-sys/FastChat/main/fastchat/llm_judge/data/mt_bench/question.jsonl
5
+ ```
6
+
7
+ ## Run benchmark
8
+
9
+ ### Benchmark sglang
10
+ ```
11
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
12
+ ```
13
+
14
+ ```
15
+ python3 bench_sglang.py --num-questions 80
16
+ ```
17
+
18
+
19
+ ### Benchmark vllm
20
+ ```
21
+ python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
22
+ ```
23
+
24
+ ```
25
+ python3 bench_other.py --num-questions 80 --backend vllm
26
+ ```
27
+
28
+
29
+ ### Benchmark lightllm
30
+ ```
31
+ # A10G
32
+ python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
33
+ ```
34
+
35
+ ```
36
+ python3 bench_other.py --num-questions 80 --backend lightllm
37
+ ```
sglang/benchmark/mtbench/bench_other.py ADDED
@@ -0,0 +1,111 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import time
5
+ import uuid
6
+ from concurrent.futures import ThreadPoolExecutor
7
+
8
+ from fastchat.model import get_conversation_template
9
+ from tqdm import tqdm
10
+
11
+ from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
12
+
13
+
14
+ def load_questions(filename):
15
+ questions = []
16
+ with open(filename, "r") as fin:
17
+ for line in fin:
18
+ obj = json.loads(line)
19
+ questions.append(obj)
20
+ return questions
21
+
22
+
23
+ def write_answers(filename, model_id, questions, answers):
24
+ with open(os.path.expanduser(filename), "w") as fout:
25
+ for i in range(len(answers)):
26
+ ans_json = {
27
+ "question_id": questions[i]["question_id"],
28
+ "answer_id": uuid.uuid4().hex,
29
+ "model_id": model_id,
30
+ "choices": {
31
+ "index": 0,
32
+ "turns": [answers[i][0], answers[i][1]],
33
+ },
34
+ "tstamp": time.time(),
35
+ }
36
+ fout.write(json.dumps(ans_json) + "\n")
37
+
38
+
39
+ def main(args):
40
+ questions = load_questions(args.question_file)
41
+ questions = (questions * 10)[: args.num_questions]
42
+ max_tokens = 256
43
+ model_id = "llama-2-chat"
44
+
45
+ conv_main = get_conversation_template(model_id)
46
+
47
+ # Select backend
48
+ call_generate = get_call_generate(args)
49
+
50
+ answers = [None] * len(questions)
51
+
52
+ def get_answer(i):
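+ # Answer both MT-Bench turns sequentially, feeding each generated answer back into the conversation template.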
53
+ conv = conv_main.copy()
54
+ cur_answers = []
55
+ for j in range(2):
56
+ q = questions[i]["turns"][j]
57
+ conv.append_message(conv.roles[0], q)
58
+ conv.append_message(conv.roles[1], None)
59
+
60
+ prompt = conv.get_prompt()
61
+ output = call_generate(prompt, temperature=0, max_tokens=max_tokens).strip()
62
+
63
+ cur_answers.append(output)
64
+ conv.update_last_message(output)
65
+
66
+ answers[i] = cur_answers
67
+
68
+ # Run requests
69
+ tic = time.time()
70
+ if args.parallel == 1:
71
+ for i in tqdm(range(len(questions))):
72
+ get_answer(i)
73
+ else:
74
+ with ThreadPoolExecutor(args.parallel) as executor:
75
+ list(
76
+ tqdm(
77
+ executor.map(get_answer, list(range(len(questions)))),
78
+ total=len(questions),
79
+ )
80
+ )
81
+
82
+ latency = time.time() - tic
83
+
84
+ print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
85
+
86
+ # Write results
87
+ answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
88
+ write_answers(answer_file, model_id, questions, answers)
89
+
90
+ with open(args.result_file, "a") as fout:
91
+ value = {
92
+ "task": "mtbench",
93
+ "backend": args.backend,
94
+ "num_gpus": 1,
95
+ "latency": round(latency, 3),
96
+ "num_requests": args.num_questions,
97
+ "other": {
98
+ "num_questions": args.num_questions,
99
+ "parallel": args.parallel,
100
+ },
101
+ }
102
+ fout.write(json.dumps(value) + "\n")
103
+
104
+
105
+ if __name__ == "__main__":
106
+ parser = argparse.ArgumentParser()
107
+ parser.add_argument("--question-file", type=str, default="question.jsonl")
108
+ parser.add_argument("--answer-file", type=str, default=None)
109
+ parser.add_argument("--num-questions", type=int, default=80)
110
+ args = add_common_other_args_and_parse(parser)
111
+ main(args)
sglang/benchmark/mtbench/bench_sglang.py ADDED
@@ -0,0 +1,99 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import time
5
+ import uuid
6
+
7
+ import sglang as sgl
8
+ from sglang.test.test_utils import (
9
+ add_common_sglang_args_and_parse,
10
+ select_sglang_backend,
11
+ )
12
+
13
+
14
+ def load_questions(filename):
15
+ questions = []
16
+ with open(filename, "r") as fin:
17
+ for line in fin:
18
+ obj = json.loads(line)
19
+ questions.append(obj)
20
+ return questions
21
+
22
+
23
+ def write_answers(filename, model_id, questions, answers):
24
+ with open(os.path.expanduser(filename), "w") as fout:
25
+ for i in range(len(answers)):
26
+ ans_json = {
27
+ "question_id": questions[i]["question_id"],
28
+ "answer_id": uuid.uuid4().hex,
29
+ "model_id": model_id,
30
+ "choices": {
31
+ "index": 0,
32
+ "turns": [answers[i][0], answers[i][1]],
33
+ },
34
+ "tstamp": time.time(),
35
+ }
36
+ fout.write(json.dumps(ans_json) + "\n")
37
+
38
+
39
+ @sgl.function
40
+ def answer_mt_bench(s, question_1, question_2):
41
+ s += sgl.system()
42
+ s += sgl.user(question_1)
43
+ s += sgl.assistant(sgl.gen("answer_1"))
44
+ s += sgl.user(question_2)
45
+ s += sgl.assistant(sgl.gen("answer_2"))
46
+
47
+
48
+ def main(args):
49
+ # Construct prompts
50
+ questions = load_questions(args.question_file)[: args.num_questions]
51
+ arguments = [
52
+ {"question_1": q["turns"][0], "question_2": q["turns"][1]} for q in questions
53
+ ]
54
+
55
+ # Select backend
56
+ backend = select_sglang_backend(args)
57
+ sgl.set_default_backend(backend)
58
+
59
+ # Run requests
60
+ tic = time.time()
61
+ rets = answer_mt_bench.run_batch(
62
+ arguments,
63
+ temperature=0,
64
+ max_new_tokens=256,
65
+ num_threads=args.parallel,
66
+ progress_bar=True,
67
+ )
68
+ answers = [[s["answer_1"], s["answer_2"]] for s in rets]
69
+ latency = time.time() - tic
70
+
71
+ print(f"#questions: {len(questions)}, Latency: {latency:.2f}")
72
+
73
+ # Write results
74
+ model_id = backend.model_info["model_path"]
75
+ answer_file = args.answer_file or f"tmp_output_{args.backend}.txt"
76
+ write_answers(answer_file, model_id, questions, answers)
77
+
78
+ with open(args.result_file, "a") as fout:
79
+ value = {
80
+ "task": "mtbench",
81
+ "backend": args.backend,
82
+ "num_gpus": 1,
83
+ "latency": round(latency, 3),
84
+ "num_requests": args.num_questions,
85
+ "other": {
86
+ "num_questions": args.num_questions,
87
+ "parallel": args.parallel,
88
+ },
89
+ }
90
+ fout.write(json.dumps(value) + "\n")
91
+
92
+
93
+ if __name__ == "__main__":
94
+ parser = argparse.ArgumentParser()
95
+ parser.add_argument("--question-file", type=str, default="question.jsonl")
96
+ parser.add_argument("--answer-file", type=str, default=None)
97
+ parser.add_argument("--num-questions", type=int, default=80)
98
+ args = add_common_sglang_args_and_parse(parser)
99
+ main(args)
sglang/benchmark/multi_chain_reasoning/README.md ADDED
@@ -0,0 +1,49 @@
1
+ ## Download data
2
+ ```
3
+ wget https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
4
+ ```
5
+
6
+ ## Run benchmark
7
+
8
+ ### Benchmark sglang
9
+ ```
10
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --schedule-conservativeness 1.3
11
+ ```
12
+
13
+ ```
14
+ python3 bench_sglang.py --num-questions 64
15
+ python3 bench_sglang.py --num-questions 32 --parallel 1
16
+ ```
17
+
18
+
19
+ ### Benchmark vllm
20
+ ```
21
+ python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
22
+ ```
23
+
24
+ ```
25
+ python3 bench_other.py --num-questions 64 --backend vllm
26
+ ```
27
+
28
+
29
+ ### Benchmark lightllm
30
+ ```
31
+ # A10G
32
+ python -m lightllm.server.api_server --tokenizer_mode auto --model_dir ~/model_weights/llama-2-7b-chat-hf --max_total_token_num 16000 --port 22000
33
+ ```
34
+
35
+ ```
36
+ python3 bench_other.py --num-questions 64 --backend lightllm
37
+ ```
38
+
39
+
40
+ ### Benchmark guidance
41
+ ```
42
+ python3 bench_other.py --num-questions 8 --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
43
+ ```
44
+
45
+ ### Benchmark lmql
46
+
47
+ ```
48
+ python3 bench_other.py --num-questions 64 --backend lmql --parallel 1
49
+ ```
sglang/benchmark/multi_chain_reasoning/bench_sglang.py ADDED
@@ -0,0 +1,140 @@
+ import argparse
+ import ast
+ import json
+ import re
+ import time
+
+ import numpy as np
+
+ from sglang.test.test_utils import (
+     add_common_sglang_args_and_parse,
+     select_sglang_backend,
+ )
+ from sglang.utils import dump_state_text, read_jsonl
+
+ INVALID = -9999999
+
+
+ def get_answer_value(answer_str):
+     answer_str = answer_str.replace(",", "")
+     numbers = re.findall(r"\d+", answer_str)
+     if len(numbers) < 1:
+         return INVALID
+     try:
+         return ast.literal_eval(numbers[-1])
+     except SyntaxError:
+         return INVALID
+
+
+ prompt_lib = [
+     "Let us think step by step.",
+     "Approach this methodically. Let's dissect the problem into smaller, more manageable parts.",
+     "It's important to proceed step by step, ensuring accuracy at each stage.",
+     "Take a deep breath and break this down.",
+     "A little bit of arithmetic and a logical approach will help us quickly arrive at the solution to this problem.",
+     "I am extremely good at math.",
+ ]
+
+
+ def main(args):
+     lines = read_jsonl(args.data_path)
+
+     # Construct prompts
+     # k = args.num_shot
+     # few_shot_examples = get_few_shot_examples(lines, k)
+
+     questions = []
+     labels = []
+     for i in range(len(lines[: args.num_questions])):
+         questions.append(lines[i]["question"])
+         labels.append(get_answer_value(lines[i]["answer"]))
+     assert all(l != INVALID for l in labels)
+     arguments = [{"question": q} for q in questions]
+
+     num_chains = args.num_chains
+
+     #####################################
+     ######### SGL Program Begin #########
+     #####################################
+
+     import sglang as sgl
+
+     @sgl.function
+     def multi_chain_gsm8k(s, question):
+         s += "Question: " + question + "\n"
+         # s += "Answer: " + prompt_lib[0] + sgl.gen("answer", max_tokens=256, stop="Question",
+         #     temperature=0)
+         # return
+
+         forks = s.fork(num_chains)
+         for i in range(num_chains):
+             forks[i] += (
+                 "Answer: "
+                 + prompt_lib[i % num_chains]
+                 + sgl.gen("chain", max_tokens=256, temperature=0.3, stop="Question")
+             )
+         forks.join()
+
+         s += "Answer: To answer this question, here are some possible solutions. "
+         s += "After considering all of them, I will do a majority vote.\n\n"
+         for i in range(num_chains):
+             s += f"Solution {i+1}: " + forks[i]["chain"].strip() + "\n\n"
+         s += "\nBy considering the above solutions and doing a majority vote, I think the final answer (a single integer number) is "
+         s += sgl.gen("answer", max_tokens=16)
+
+     #####################################
+     ########## SGL Program End ##########
+     #####################################
+
+     # Select backend
+     backend = select_sglang_backend(args)
+
+     # Run requests
+     tic = time.time()
+     states = multi_chain_gsm8k.run_batch(
+         arguments,
+         temperature=0,
+         backend=backend,
+         num_threads=args.parallel,
+         progress_bar=True,
+     )
+     latency = time.time() - tic
+
+     preds = []
+     for i in range(len(states)):
+         preds.append(get_answer_value(states[i]["answer"]))
+
+     # Compute accuracy
+     acc = np.mean(np.array(preds) == np.array(labels))
+     invalid = np.mean(np.array(preds) == INVALID)
+     print(f"Latency: {latency:.3f}")
+     print(f"Invalid: {invalid:.3f}")
+     print(f"Accuracy: {acc:.3f}")
+
+     # Write results
+     dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+     with open(args.result_file, "a") as fout:
+         value = {
+             "task": "multi_chain_gsm8k",
+             "backend": args.backend,
+             "num_gpus": 1,
+             "latency": round(latency, 3),
+             "accuracy": round(acc, 3),
+             "num_requests": args.num_questions,
+             "other": {
+                 "num_questions": args.num_questions,
+                 "parallel": args.parallel,
+             },
+         }
+         fout.write(json.dumps(value) + "\n")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--num-shot", type=int, default=0)
+     parser.add_argument("--num-chains", type=int, default=5)
+     parser.add_argument("--data-path", type=str, default="test.jsonl")
+     parser.add_argument("--num-questions", type=int, default=50)
+     args = add_common_sglang_args_and_parse(parser)
+     main(args)
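The SGL program above delegates the final majority vote to the model itself. As a sanity check, the vote can also be done programmatically. The sketch below is a standalone illustration of that aggregation step, assuming you have collected the per-chain completions as plain strings (for example by parsing the `Solution i:` blocks out of the dumped `tmp_output_<backend>.txt`); it is not part of the benchmark script, and the `chains` list is dummy data:

```python
# A minimal sketch of an explicit majority vote over reasoning chains.
import re
from collections import Counter

INVALID = -9999999


def last_int(text: str) -> int:
    # Same heuristic as get_answer_value: take the last integer in the string.
    nums = re.findall(r"\d+", text.replace(",", ""))
    return int(nums[-1]) if nums else INVALID


chains = [
    "Each box holds 6 eggs, so 3 boxes hold 18 eggs. The answer is 18.",
    "3 * 6 = 18, so the answer is 18.",
    "I think the answer is 16.",
]

votes = Counter(last_int(c) for c in chains if last_int(c) != INVALID)
answer, count = votes.most_common(1)[0]
print(f"majority answer: {answer} ({count}/{len(chains)} chains agree)")
```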
sglang/benchmark/multi_turn_chat/README.md ADDED
@@ -0,0 +1,66 @@
+ ### Benchmark sglang
+
+ Run Llama-7B
+
+ ```
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000
+ ```
+
+ Run Mixtral-8x7B
+ (If you hit a CUDA out-of-memory error, try reducing `--mem-fraction-static`.)
+
+ ```
+ python3 -m sglang.launch_server --model-path mistralai/Mixtral-8x7B-Instruct-v0.1 --port 30000 --tp-size 8
+ ```
+
+ Benchmark (short output)
+
+ ```
+ python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf
+ ```
+
+ Benchmark (long output)
+
+ ```
+ python3 bench_sglang.py --tokenizer meta-llama/Llama-2-7b-chat-hf --long
+ ```
+
+ ### Benchmark vLLM
+
+ Run Llama-7B
+
+ ```
+ python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model meta-llama/Llama-2-7b-chat-hf --disable-log-requests --port 21000
+ ```
+
+ Run Mixtral-8x7B
+
+ ```
+ python3 -m vllm.entrypoints.api_server --tokenizer-mode auto --model mistralai/Mixtral-8x7B-Instruct-v0.1 --disable-log-requests --port 21000 --tensor-parallel-size 8
+ ```
+
+ Benchmark (short output)
+
+ ```
+ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm
+ ```
+
+ Benchmark (long output)
+
+ ```
+ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend vllm --long
+ ```
+
+ ### Benchmark guidance
+
+ Benchmark Llama-7B (short output)
+
+ ```
+ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf
+ ```
+
+ Benchmark Llama-7B (long output)
+
+ ```
+ python3 bench_other.py --tokenizer meta-llama/Llama-2-7b-chat-hf --backend guidance --parallel 1 --n-ctx 4096 --model-path path/to/gguf --long
+ ```
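Before kicking off a long run, it can help to confirm the launched sglang server actually answers requests. A minimal smoke-test sketch using the `sglang` client pointed at the endpoint started above; the port and prompt are examples, and this is not part of the benchmark scripts:

```python
# Quick smoke test for the server launched with `sglang.launch_server` above.
import sglang as sgl

# Point the client at the local server on port 30000 (adjust if you changed --port).
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))


@sgl.function
def ping(s):
    s += "Say hello in one short sentence.\n"
    s += sgl.gen("reply", max_tokens=16)


state = ping.run()
print(state["reply"])
```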
sglang/benchmark/multi_turn_chat/bench_other.py ADDED
@@ -0,0 +1,93 @@
+ import json
+ import time
+ from argparse import ArgumentParser
+ from concurrent.futures import ThreadPoolExecutor
+ from functools import partial
+
+ from data_gen import gen_arguments
+ from tqdm import tqdm
+ from vllm.transformers_utils.tokenizer import get_tokenizer
+
+ from sglang.test.test_utils import add_common_other_args_and_parse, get_call_generate
+ from sglang.utils import dump_state_text
+
+
+ def multi_turns(generate, qas):
+     s = ""
+     for qa in qas:
+         s += qa["prompt"]
+         s += generate(s, max_tokens=qa["new_tokens"])
+
+     return s
+
+
+ def main(args):
+     print(args)
+
+     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
+
+     multi_qas = gen_arguments(args, tokenizer)
+
+     states = [None] * args.num_qa
+
+     call_generate = partial(get_call_generate(args), temperature=0)
+
+     def get_one_answer(i):
+         states[i] = multi_turns(generate=call_generate, **multi_qas[i])
+
+     tic = time.time()
+     if args.parallel == 1:
+         for i in tqdm(range(len(multi_qas))):
+             get_one_answer(i)
+     else:
+         with ThreadPoolExecutor(args.parallel) as executor:
+             rets = list(
+                 tqdm(
+                     executor.map(get_one_answer, list(range(len(multi_qas)))),
+                     total=len(multi_qas),
+                 )
+             )
+             for _ in rets:
+                 pass
+
+     latency = time.time() - tic
+
+     # Compute accuracy
+     print(f"Latency: {latency:.3f}")
+
+     dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+     with open(args.result_file, "a") as fout:
+         value = {
+             "task": "multi_turn_chat",
+             "backend": args.backend,
+             "num_gpus": 1,
+             "latency": round(latency, 3),
+             "num_requests": args.num_qa,
+             "num_turns": args.turns,
+             "other": {
+                 "parallel": args.parallel,
+                 "output_mode": "long" if args.long else "short",
+             },
+         }
+         fout.write(json.dumps(value) + "\n")
+
+
+ if __name__ == "__main__":
+     parser = ArgumentParser()
+     parser.add_argument("--turns", type=int, default=4)
+     parser.add_argument("--num-qa", type=int, default=20)
+     parser.add_argument("--min-len-q", type=int, default=256)
+     parser.add_argument("--max-len-q", type=int, default=512)
+     parser.add_argument("--min-len-a", type=int, default=4)
+     parser.add_argument("--max-len-a", type=int, default=8)
+     parser.add_argument("--tokenizer", type=str, required=True)
+     parser.add_argument("--trust-remote-code", action="store_true")
+     parser.add_argument("--long", action="store_true")
+     args = add_common_other_args_and_parse(parser)
+
+     if args.long:
+         args.min_len_a = 256
+         args.max_len_a = 512
+         args.num_qa = 20
+     main(args)
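`get_call_generate(args)` returns a backend-specific completion function; all `multi_turns` needs is a callable that takes the running prompt plus `max_tokens` (with `temperature` bound via `partial`) and returns the generated text. A minimal sketch of that contract with a dummy stand-in backend, useful for dry-running the loop without any server; the dummy function and sample turns are illustrative only:

```python
# A minimal sketch of the interface expected by multi_turns().
def dummy_generate(prompt: str, max_tokens: int = 16, temperature: float = 0.0) -> str:
    # A real implementation would call the serving backend here and return its completion.
    return f" [assistant reply of up to {max_tokens} tokens]\n"


def multi_turns(generate, qas):
    # Same accumulation loop as in bench_other.py above.
    s = ""
    for qa in qas:
        s += qa["prompt"]
        s += generate(s, max_tokens=qa["new_tokens"])
    return s


qas = [
    {"prompt": "User: What is the capital of France?\nAssistant:", "new_tokens": 8},
    {"prompt": "User: And of Italy?\nAssistant:", "new_tokens": 8},
]
print(multi_turns(dummy_generate, qas))
```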
sglang/benchmark/multi_turn_chat/bench_sglang.py ADDED
@@ -0,0 +1,79 @@
+ import json
+ import time
+ from argparse import ArgumentParser
+
+ from data_gen import gen_arguments
+ from vllm.transformers_utils.tokenizer import get_tokenizer
+
+ import sglang as sgl
+ from sglang.test.test_utils import (
+     add_common_sglang_args_and_parse,
+     select_sglang_backend,
+ )
+ from sglang.utils import dump_state_text
+
+
+ @sgl.function
+ def multi_turns(s, qas):
+     for qa in qas:
+         s += qa["prompt"]
+         s += sgl.gen(max_tokens=qa["new_tokens"], ignore_eos=True)
+
+
+ def main(args):
+     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
+
+     multi_qas = gen_arguments(args, tokenizer)
+
+     backend = select_sglang_backend(args)
+
+     tic = time.time()
+     states = multi_turns.run_batch(
+         multi_qas,
+         temperature=0,
+         backend=backend,
+         num_threads=args.parallel,
+         progress_bar=True,
+     )
+     latency = time.time() - tic
+
+     print(f"Latency: {latency:.3f}")
+
+     dump_state_text(f"tmp_output_{args.backend}.txt", states)
+
+     with open(args.result_file, "a") as fout:
+         value = {
+             "task": "multi_turn_chat",
+             "backend": args.backend,
+             "num_gpus": 1,
+             "latency": round(latency, 3),
+             "num_requests": args.num_qa,
+             "num_turns": args.turns,
+             "other": {
+                 "parallel": args.parallel,
+                 "output_mode": "long" if args.long else "short",
+             },
+         }
+         fout.write(json.dumps(value) + "\n")
+
+
+ if __name__ == "__main__":
+     parser = ArgumentParser()
+     parser.add_argument("--turns", type=int, default=4)
+     parser.add_argument("--num-qa", type=int, default=20)
+     parser.add_argument("--min-len-q", type=int, default=256)
+     parser.add_argument("--max-len-q", type=int, default=512)
+     parser.add_argument("--min-len-a", type=int, default=4)
+     parser.add_argument("--max-len-a", type=int, default=8)
+     parser.add_argument("--tokenizer", type=str, required=True)
+     parser.add_argument("--trust-remote-code", action="store_true")
+     parser.add_argument("--long", action="store_true")
+     args = add_common_sglang_args_and_parse(parser)
+
+     if args.long:
+         args.min_len_a = 256
+         args.max_len_a = 512
+         args.num_qa = 20
+
+     print(args)
+     main(args)
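Both multi-turn scripts consume `data_gen.gen_arguments(args, tokenizer)`, which is not part of this diff. From how `multi_turns` unpacks each element (`**multi_qas[i]` and the `qa["prompt"]` / `qa["new_tokens"]` fields), each element is expected to be a dict with a `qas` list of turns. A hypothetical stub with that shape, handy for exercising the harness without the real generator; everything beyond those field names is an assumption:

```python
# A hypothetical stand-in for data_gen.gen_arguments(); the real module is not
# included in this diff, so only the "qas", "prompt", and "new_tokens" fields
# (which the scripts above rely on) are grounded -- the rest is illustrative.
import random


def fake_gen_arguments(num_qa: int = 4, turns: int = 2,
                       min_len_a: int = 4, max_len_a: int = 8):
    multi_qas = []
    for i in range(num_qa):
        qas = []
        for t in range(turns):
            qas.append({
                "prompt": f"User {i}, turn {t}: please continue.\nAssistant:",
                "new_tokens": random.randint(min_len_a, max_len_a),
            })
        multi_qas.append({"qas": qas})
    return multi_qas


print(fake_gen_arguments()[0])
```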