miaoyibo committed on
Commit
a7f820f
·
1 Parent(s): 2689cfa
Files changed (3) hide show
  1. app.py +1 -1
  2. kimi_dev/serve/inference.py +3 -0
  3. requirements.txt +1 -0
app.py CHANGED
@@ -257,7 +257,7 @@ def predict(
257
  repair_time = repair_end_time - repair_start_time
258
 
259
  encoded_answer = tokenizer(partial_output_repair, padding=True, truncation=True, return_tensors='pt')
260
- print("loc token/s:",len(encoded_answer['input_ids'][0])/repair_time)
261
 
262
  # yield response, "null test", "Generate: Success"
263
  yield [[prompt,highlight_response],[repair_prompt,highlight_response_repair]], [["null test","null test2"]], "Generate: Success"
 
257
  repair_time = repair_end_time - repair_start_time
258
 
259
  encoded_answer = tokenizer(partial_output_repair, padding=True, truncation=True, return_tensors='pt')
260
+ print("repair token/s:",len(encoded_answer['input_ids'][0])/repair_time)
261
 
262
  # yield response, "null test", "Generate: Success"
263
  yield [[prompt,highlight_response],[repair_prompt,highlight_response_repair]], [["null test","null test2"]], "Generate: Success"
kimi_dev/serve/inference.py CHANGED
@@ -5,6 +5,7 @@ from transformers import (
5
  AutoConfig,
6
  AutoTokenizer
7
  )
 
8
 
9
  logger = logging.getLogger(__name__)
10
 
@@ -20,6 +21,8 @@ def load_model(model_path: str = "moonshotai/Kimi-Dev-72B"):
20
  device_map="auto",
21
  trust_remote_code=True,
22
  )
 
 
23
 
24
  tokenizer = AutoTokenizer.from_pretrained(model_path)
25
 
 
5
  AutoConfig,
6
  AutoTokenizer
7
  )
8
+ import tensor_parallel as tp
9
 
10
  logger = logging.getLogger(__name__)
11
 
 
21
  device_map="auto",
22
  trust_remote_code=True,
23
  )
24
+ model = tp.tensor_parallel(model, ["cuda:0", "cuda:1", "cuda:2", "cuda:3"]) # <- each GPU holds a quarter of the weights
25
+
26
 
27
  tokenizer = AutoTokenizer.from_pretrained(model_path)
28
 
requirements.txt CHANGED
@@ -18,3 +18,4 @@ colorama
18
  Pygments
19
  markdown
20
  SentencePiece
 
 
18
  Pygments
19
  markdown
20
  SentencePiece
21
+ tensor_parallel