Space: Running on L40S
miaoyibo committed · Commit 5ce5804 · 1 parent: 1d5f555

- app.py +40 -114
- kimi_dev/serve/inference.py +0 -26
- requirements.txt +3 -2
- start.sh +42 -0
app.py
CHANGED
@@ -8,7 +8,8 @@ import json
 import subprocess
 import ast
 import pdb
-
+
+import openai
 
 import threading
 
@@ -22,9 +23,8 @@ from kimi_dev.serve.gradio_utils import (
     transfer_input,
     wrap_gen_fn,
 )
-from kimi_dev.serve.inference import load_model
 from kimi_dev.serve.examples import get_examples
-from kimi_dev.serve.templates import post_process,get_loc_prompt, clone_github_repo, build_repo_structure, show_project_structure,get_repair_prompt
+from kimi_dev.serve.templates import post_process,get_loc_prompt, clone_github_repo, build_repo_structure, show_project_structure,get_repair_prompt
 
 TITLE = """<h1 align="left" style="min-width:200px; margin-top:0;">Chat with Kimi-Dev-72B🔥 </h1>"""
 DESCRIPTION_TOP = """<a href="https://github.com/MoonshotAI/Kimi-Dev" target="_blank">Kimi-Dev-72B</a> is a strong and open-source coding LLM for software engineering tasks."""
@@ -33,6 +33,12 @@ ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
 DEPLOY_MODELS = dict()
 logger = configure_logger()
 
+
+client = openai.OpenAI(
+    base_url="http://localhost:8080/v1",  # vLLM server address
+    api_key="EMPTY"  # not validated; it just must not be None
+)
+
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, default="Kimi-Dev-72B")
@@ -47,26 +53,6 @@ def parse_args():
     return parser.parse_args()
 
 
-def fetch_model(model_name: str):
-    global args, DEPLOY_MODELS
-
-    if args.local_path:
-        model_path = args.local_path
-    else:
-        model_path = f"moonshotai/{args.model}"
-
-    if model_name in DEPLOY_MODELS:
-        model_info = DEPLOY_MODELS[model_name]
-        print(f"{model_name} has been loaded.")
-    else:
-        print(f"{model_name} is loading...")
-        DEPLOY_MODELS[model_name] = load_model(model_path)
-        print(f"Load {model_name} successfully...")
-        model_info = DEPLOY_MODELS[model_name]
-
-    return model_info
-
-
 def get_prompt(conversation) -> str:
     """
     Get the prompt for the conversation.
@@ -111,20 +97,12 @@ def predict(
     """
     print("running the prediction function")
 
-
-
-
-    if text == "":
-        yield chatbot, history, "Empty context."
-        return
-    except KeyError:
-        yield [[text, "No Model Found"]], [], "No Model Found"
-        return
-
+    openai.api_key = "EMPTY"
+    openai.base_url = "http://localhost:8080/v1"
     prompt = text
     repo_name = url.split("/")[-1]
     print(url)
-    print(commit_hash)
+    # print(commit_hash)
 
     repo_path = './local_path/'+repo_name # Local clone path
 
@@ -141,50 +119,22 @@ def predict(
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": loc_prompt}
     ]
-    text_for_model = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
+
+    response = client.chat.completions.create(
+        model="kimi-dev",  # must match the served model name used when launching vLLM
+        messages=messages,
+        stream=True,
+        temperature=temperature,
+        max_tokens=max_length_tokens,
     )
-    model_inputs = tokenizer([text_for_model], return_tensors="pt").to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    # print("start generating")
-
-    loc_start_time = time.time()
-    if temperature > 0:
-        generation_kwargs = dict(
-            **model_inputs,
-            do_sample=True,
-            temperature=temperature,
-            top_p=top_p,
-            max_new_tokens=max_length_tokens,
-            streamer=streamer
-        )
-    else:
-        generation_kwargs = dict(
-            **model_inputs,
-            do_sample=False,
-            max_new_tokens=max_length_tokens,
-            streamer=streamer
-        )
-    gen_thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-    gen_thread.start()
-
 
     partial_output = "Start Locating...\n"
-
-
-
-
-
-
-    gen_thread.join()
-    loc_end_time = time.time()
-    loc_time = loc_end_time - loc_start_time
-
-    encoded_answer = tokenizer(partial_output, padding=True, truncation=True, return_tensors='pt')
-    print("loc token/s:",len(encoded_answer['input_ids'][0])/loc_time)
-
+    for chunk in response:
+        delta = chunk.choices[0].delta
+        if delta and delta.content:
+            partial_output += delta.content
+            highlight_response = highlight_thinking(partial_output)
+            yield [[prompt, highlight_response]], [["null test", "null test2"]], "Generating file locations..."
     response = partial_output
 
     raw_answer=post_process(response)
@@ -213,53 +163,29 @@ def predict(
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": repair_prompt}
     ]
-    text = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
 
     subprocess.run(["rm", "-rf", repo_path], check=True)
-    repair_start_time = time.time()
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    if temperature > 0:
-        generation_kwargs = dict(
-            **model_inputs,
-            do_sample=True,
-            temperature=temperature,
-            top_p=top_p,
-            max_new_tokens=max_length_tokens,
-            streamer=streamer
-        )
-    else:
-        generation_kwargs = dict(
-            **model_inputs,
-            do_sample=False,
-            max_new_tokens=max_length_tokens,
-            streamer=streamer
-        )
-    gen_thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
-    gen_thread.start()
 
-
-    yield [[prompt,highlight_response],[repair_prompt,partial_output_repair]], [["null test","null test2"]], "Generate: Success"
+
     time.sleep(5)
-    for new_text in streamer:
-        partial_output_repair += new_text
-        highlight_response = highlight_thinking(partial_output)
-        highlight_response_repair = highlight_thinking(partial_output_repair)
-        yield [[prompt, highlight_response], [repair_prompt, highlight_response_repair]], [["null test", "null test2"]], "Generating repair suggestion..."
 
-    gen_thread.join()
-    repair_end_time = time.time()
 
-
+    response = client.chat.completions.create(
+        model="kimi-dev",  # must match the served model name used when launching vLLM
+        messages=messages,
+        stream=True,
+        temperature=temperature,
+        max_tokens=max_length_tokens,
+    )
 
-
-
+    partial_output_repair = "Start Repairing...\n"
+    for chunk in response:
+        delta = chunk.choices[0].delta
+        if delta and delta.content:
+            partial_output_repair += delta.content
+            highlight_response_repair = highlight_thinking(partial_output_repair)
+            yield [[prompt,highlight_response],[repair_prompt,highlight_response_repair]], [["null test","null test2"]], "Generating file repairing..."
 
-    # yield response, "null test", "Generate: Success"
     yield [[prompt,highlight_response],[repair_prompt,highlight_response_repair]], [["null test","null test2"]], "Generate: Success"
 
 
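Taken together, the app.py changes replace in-process generation (transformers plus a TextIteratorStreamer running in a background thread) with streaming chat completions from a local vLLM server through its OpenAI-compatible API. Below is a minimal, self-contained sketch of that pattern; the prompt text and sampling values are placeholders, not taken from the commit, and it assumes a server started as in start.sh (serving "kimi-dev" at http://localhost:8080/v1).

# Sketch of the new inference path: stream a chat completion from a local vLLM
# server via the OpenAI-compatible API. Prompt and parameters are illustrative.
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="EMPTY")

stream = client.chat.completions.create(
    model="kimi-dev",  # must match --served-model-name in start.sh
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Locate the files that need to change."},  # placeholder prompt
    ],
    stream=True,
    temperature=0.0,
    max_tokens=512,
)

partial_output = ""
for chunk in stream:
    delta = chunk.choices[0].delta
    if delta and delta.content:  # some chunks carry no content (e.g. role or finish markers)
        partial_output += delta.content
print(partial_output)

Note that the openai.api_key / openai.base_url assignments added inside predict() appear redundant: they configure the module-level default client, while the requests go through the client instance constructed at import time.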
kimi_dev/serve/inference.py
DELETED
@@ -1,26 +0,0 @@
-import logging
-
-from transformers import (
-    AutoModelForCausalLM,
-    AutoConfig,
-    AutoTokenizer
-)
-
-logger = logging.getLogger(__name__)
-
-
-def load_model(model_path: str = "moonshotai/Kimi-Dev-72B"):
-    # hotfix the model to use flash attention 2
-    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_path,
-        config=config,
-        torch_dtype="auto",
-        device_map="auto",
-        trust_remote_code=True,
-    )
-
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-    return model, tokenizer
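With load_model removed, app.py no longer loads the weights in-process via transformers; it assumes a vLLM server is already serving them (see start.sh below). A quick way to confirm that assumption holds is to list the models the server exposes — a hypothetical check, not part of the commit:

# List the models exposed by the local vLLM server; with start.sh this should
# include the served model name "kimi-dev".
import openai

client = openai.OpenAI(base_url="http://localhost:8080/v1", api_key="EMPTY")
for model in client.models.list().data:
    print(model.id)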
requirements.txt
CHANGED
@@ -1,4 +1,3 @@
-torchvision==0.20.0
 transformers==4.51.1
 accelerate
 sentencepiece
@@ -17,4 +16,6 @@ tqdm
 colorama
 Pygments
 markdown
-SentencePiece
+SentencePiece
+vllm
+openai
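requirements.txt drops the torchvision pin, presumably only needed for the old in-process torch model path, and adds vllm and openai for the new serving setup. A small sanity check that the environment provides the newly required packages (illustrative, not part of the commit):

# Verify that the packages the updated app depends on are importable.
import importlib

for name in ("vllm", "openai", "transformers"):
    module = importlib.import_module(name)
    print(name, getattr(module, "__version__", "unknown"))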
start.sh
ADDED
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+python -m vllm.entrypoints.openai.api_server \
+    --model moonshotai/Kimi-Dev-72B \
+    --tensor-parallel-size 4 \
+    --max-num-seqs 8 \
+    --max-model-len 131072 \
+    --gpu-memory-utilization 0.9 \
+    --host localhost \
+    --served-model-name kimi-dev \
+    --port 8080
+
+SERVICE_URL="http://localhost:8080/v1/models"
+TIMEOUT=300   # maximum wait in seconds
+INTERVAL=5    # polling interval in seconds
+ELAPSED=0
+
+echo "[*] Waiting for the vLLM service to start, up to ${TIMEOUT}s ..."
+
+while true; do
+    # Query the model-list endpoint and check whether it mentions the target model
+    if curl -s "$SERVICE_URL" | grep -q "moonshotai"; then
+        echo "✅ vLLM service started successfully!"
+        break
+    fi
+
+    if [ $ELAPSED -ge $TIMEOUT ]; then
+        echo "❌ Timed out; the vLLM service did not start."
+        exit 1
+    fi
+
+    echo "⏳ Service not ready yet, retrying in ${INTERVAL}s..."
+    sleep $INTERVAL
+    ELAPSED=$((ELAPSED + INTERVAL))
+done
+
+# Commands to run once the service is up
+echo "[*] Running follow-up steps..."
+
+# e.g. start the frontend, run test scripts, etc.
+# ./start_frontend.sh
+python app.py
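One caveat when reading start.sh: the vLLM server is launched in the foreground, so as written the wait loop and the final python app.py only run after the server process exits; launching the server in the background (for example with a trailing & on the launch command, or from a small supervisor) is presumably the intent. A hypothetical Python equivalent of that launch-poll-run sequence, mirroring the script's 300 s budget and 5 s polling interval:

# Hypothetical supervisor: start vLLM in the background, poll /v1/models until it
# answers, then start the Gradio app. Mirrors start.sh but does not block on the server.
import subprocess
import time
import urllib.request

server = subprocess.Popen([
    "python", "-m", "vllm.entrypoints.openai.api_server",
    "--model", "moonshotai/Kimi-Dev-72B",
    "--served-model-name", "kimi-dev",
    "--host", "localhost",
    "--port", "8080",
])

deadline = time.time() + 300  # same overall timeout as start.sh
while time.time() < deadline:
    try:
        body = urllib.request.urlopen("http://localhost:8080/v1/models", timeout=5).read()
        if b"kimi-dev" in body or b"moonshotai" in body:
            break
    except OSError:
        pass  # server not accepting connections yet
    time.sleep(5)  # same polling interval as start.sh
else:
    server.terminate()
    raise RuntimeError("vLLM server did not become ready in time")

subprocess.run(["python", "app.py"], check=True)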