Update README.md
Browse files
README.md
CHANGED
@@ -130,7 +130,7 @@ We can use the following code to get a sense of peak memory usage during inferen
|
|
130 |
| Benchmark | | |
|
131 |
|------------------|----------------|--------------------------------|
|
132 |
| | Phi-4 mini-Ins | Phi-4-mini-instruct-float8dq |
|
133 |
-
| Peak Memory (GB) | 8.91 | 5.70
|
134 |
|
135 |
|
136 |
## Benchmark Peak Memory
|
@@ -186,8 +186,8 @@ pip install vllm --pre --extra-index-url https://wheels.vllm.ai/nightly
|
|
186 |
| Benchmark | | |
|
187 |
|----------------------------------|----------------|--------------------------|
|
188 |
| | Phi-4 mini-Ins | phi4-mini-float8dq |
|
189 |
-
| latency (batch_size=1) | 1.64s
|
190 |
-
| latency (batch_size=128) | 3.1s
|
191 |
| serving (num_prompts=1) | 1.35 req/s | 1.57 req/s (16% speedup) |
|
192 |
| serving (num_prompts=1000) | 66.68 req/s | 80.53 req/s (21% speedup) |
|
193 |
|
|
|
130 |
| Benchmark | | |
|
131 |
|------------------|----------------|--------------------------------|
|
132 |
| | Phi-4 mini-Ins | Phi-4-mini-instruct-float8dq |
|
133 |
+
| Peak Memory (GB) | 8.91 | 5.70 (36% reduction) |
|
134 |
|
135 |
|
136 |
## Benchmark Peak Memory
|
|
|
186 |
| Benchmark | | |
|
187 |
|----------------------------------|----------------|--------------------------|
|
188 |
| | Phi-4 mini-Ins | phi4-mini-float8dq |
|
189 |
+
| latency (batch_size=1) | 1.64s | 1.41s (16% speedup) |
|
190 |
+
| latency (batch_size=128) | 3.1s | 2.72s (14% speedup) |
|
191 |
| serving (num_prompts=1) | 1.35 req/s | 1.57 req/s (16% speedup) |
|
192 |
| serving (num_prompts=1000) | 66.68 req/s | 80.53 req/s (21% speedup) |
|
193 |
|