mashirong committed
Commit 31c5617 · 1 parent: 730833f
Update README.md

README.md CHANGED
```diff
@@ -217,6 +217,8 @@ We also provide OpenAI-Compatible API at DeepSeek Platform: [platform.deepseek.c
 
 ## 8. How to run locally
 **To utilize DeepSeek-V2 in BF16 format for inference, 80GB*8 GPUs are required.**
+
+**To utilize DeepSeek-V2-Lite in BF16 format for inference, 40GB*1 GPU is required.**
 ### Inference with Huggingface's Transformers
 You can directly employ [Huggingface's Transformers](https://github.com/huggingface/transformers) for model inference.
 
```
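For the single-GPU requirement on DeepSeek-V2-Lite added above, a quick capacity check can catch an undersized card before the BF16 weights are downloaded. This is a minimal sketch, assuming PyTorch with CUDA is available; the 40GB threshold is simply the figure stated in the README, not a measured value:

```python
import torch

# Illustrative capacity check for single-GPU BF16 inference with DeepSeek-V2-Lite.
props = torch.cuda.get_device_properties(0)
total_gib = props.total_memory / 1024**3
print(f"{props.name}: {total_gib:.1f} GiB total")
assert total_gib >= 40, "DeepSeek-V2-Lite in BF16 expects roughly a 40GB GPU"
```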
```diff
@@ -225,12 +227,9 @@ You can directly employ [Huggingface's Transformers](https://github.com/huggingf
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 
-model_name = "deepseek-ai/DeepSeek-V2"
+model_name = "deepseek-ai/DeepSeek-V2-Lite"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-max_memory = {i: "75GB" for i in range(8)}
-# `device_map` cannot be set to `auto`
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 
```
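The simplified loading above (a plain `.cuda()` call instead of the multi-GPU `max_memory`/`device_map="sequential"` setup) feeds into the same completion flow as before. Below is a minimal sketch of using the loaded model and tokenizer, assuming only the standard Transformers `generate`/`decode` API; the prompt string is illustrative and not taken from the README:

```python
# Illustrative completion call with the model and tokenizer loaded above.
text = "An attention function can be described as mapping a query and a set of key-value pairs to an output."
inputs = tokenizer(text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```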
```diff
@@ -247,12 +246,9 @@ print(result)
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 
-model_name = "deepseek-ai/DeepSeek-V2-Chat"
+model_name = "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-max_memory = {i: "75GB" for i in range(8)}
-# `device_map` cannot be set to `auto`
-model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).cuda()
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 
```
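The chat variant gets the same treatment: only the checkpoint name and the loading call change, so the rest of the chat example still applies. A minimal sketch of one chat turn, assuming the tokenizer ships a chat template and the standard `apply_chat_template` API; the user message is illustrative:

```python
# Illustrative chat turn with the Lite-Chat model and tokenizer loaded above.
messages = [{"role": "user", "content": "Write a piece of quicksort code in C++"}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
outputs = model.generate(input_ids.to(model.device), max_new_tokens=100)
print(tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True))
```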
```diff
@@ -297,8 +293,8 @@ To utilize [vLLM](https://github.com/vllm-project/vllm) for model inference, ple
 from transformers import AutoTokenizer
 from vllm import LLM, SamplingParams
 
-max_model_len, tp_size = 8192, 8
-model_name = "deepseek-ai/DeepSeek-V2-Chat"
+max_model_len, tp_size = 8192, 1
+model_name = "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 llm = LLM(model=model_name, tensor_parallel_size=tp_size, max_model_len=max_model_len, trust_remote_code=True, enforce_eager=True)
 sampling_params = SamplingParams(temperature=0.3, max_tokens=256, stop_token_ids=[tokenizer.eos_token_id])
```
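With `tensor_parallel_size` dropped to 1, the vLLM example above runs on a single GPU. The sketch below shows one chat completion using the `llm`, `tokenizer`, and `sampling_params` defined in the hunk; it assumes the vLLM build referenced by the README still accepts pre-tokenized input via `prompt_token_ids`, and the message content is illustrative:

```python
# Illustrative single-request chat completion with the objects defined above.
messages_list = [
    [{"role": "user", "content": "Who are you?"}],
]
prompt_token_ids = [
    tokenizer.apply_chat_template(messages, add_generation_prompt=True)
    for messages in messages_list
]
outputs = llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```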