Spaces:
Running
on
L40S
Running
on
L40S
miaoyibo
committed on
Commit
·
5d6758b
1
Parent(s):
f3a9564
- app.py +2 -2
- kimi_vl/serve/inference.py +7 -7
app.py
CHANGED
@@ -127,7 +127,7 @@ def predict(
|
|
127 |
"""
|
128 |
print("running the prediction function")
|
129 |
try:
|
130 |
-
model
|
131 |
|
132 |
if text == "":
|
133 |
yield chatbot, history, "Empty context."
|
@@ -157,9 +157,9 @@ def predict(
|
|
157 |
text,
|
158 |
pil_images,
|
159 |
history,
|
160 |
-
processor,
|
161 |
max_length=max_context_length_tokens,
|
162 |
)
|
|
|
163 |
all_conv, last_image = convert_conversation_to_prompts(conversation)
|
164 |
stop_words = conversation.stop_str
|
165 |
gradio_chatbot_output = to_gradio_chatbot(conversation)
|
|
|
127 |
"""
|
128 |
print("running the prediction function")
|
129 |
try:
|
130 |
+
model = fetch_model(args.model)
|
131 |
|
132 |
if text == "":
|
133 |
yield chatbot, history, "Empty context."
|
|
|
157 |
text,
|
158 |
pil_images,
|
159 |
history,
|
|
|
160 |
max_length=max_context_length_tokens,
|
161 |
)
|
162 |
+
print(conversation)
|
163 |
all_conv, last_image = convert_conversation_to_prompts(conversation)
|
164 |
stop_words = conversation.stop_str
|
165 |
gradio_chatbot_output = to_gradio_chatbot(conversation)
|
kimi_vl/serve/inference.py
CHANGED
@@ -19,13 +19,13 @@ from .chat_utils import Conversation, get_conv_template
|
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
21 |
|
22 |
-
def load_model(model_path: str = "moonshotai/Kimi-VL-A3B-Thinking"):
|
23 |
# hotfix the model to use flash attention 2
|
24 |
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
25 |
-
config._attn_implementation = "flash_attention_2"
|
26 |
-
config.vision_config._attn_implementation = "flash_attention_2"
|
27 |
-
config.text_config._attn_implementation = "flash_attention_2"
|
28 |
-
print("Successfully set the attn_implementation to flash_attention_2")
|
29 |
|
30 |
model = AutoModelForCausalLM.from_pretrained(
|
31 |
model_path,
|
@@ -34,9 +34,9 @@ def load_model(model_path: str = "moonshotai/Kimi-VL-A3B-Thinking"):
|
|
34 |
device_map="auto",
|
35 |
trust_remote_code=True,
|
36 |
)
|
37 |
-
processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True)
|
38 |
|
39 |
-
return model
|
40 |
|
41 |
|
42 |
class StoppingCriteriaSub(StoppingCriteria):
|
|
|
19 |
logger = logging.getLogger(__name__)
|
20 |
|
21 |
|
22 |
+
def load_model(model_path: str = "moonshotai/Kimi-Dev-72B"):
|
23 |
# hotfix the model to use flash attention 2
|
24 |
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
25 |
+
# config._attn_implementation = "flash_attention_2"
|
26 |
+
# config.vision_config._attn_implementation = "flash_attention_2"
|
27 |
+
# config.text_config._attn_implementation = "flash_attention_2"
|
28 |
+
# print("Successfully set the attn_implementation to flash_attention_2")
|
29 |
|
30 |
model = AutoModelForCausalLM.from_pretrained(
|
31 |
model_path,
|
|
|
34 |
device_map="auto",
|
35 |
trust_remote_code=True,
|
36 |
)
|
37 |
+
# processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True)
|
38 |
|
39 |
+
return model
|
40 |
|
41 |
|
42 |
class StoppingCriteriaSub(StoppingCriteria):
|