mjschock committed on
Commit d1da8fd · unverified · 1 Parent(s): 95d9fdc

Add serve_test.py for testing chat completion functionality with the OpenAI client. Update serve.py to use FastModel for improved performance and adjust input handling for optional image processing. Include debugging output for better error tracking.

Files changed (3)
  1. serve.py +16 -6
  2. serve_test.py +40 -0
  3. train.py +2 -0
serve.py CHANGED
@@ -9,6 +9,7 @@ from typing import Any, Dict, List
 # isort: off
 from unsloth import (
     FastLanguageModel,
+    FastModel,
     FastVisionModel,
     is_bfloat16_supported,
 )  # noqa: E402
@@ -92,17 +93,17 @@ class ModelDeployment:
     ):
         self.model_name = model_name
 
-        model, processor = FastVisionModel.from_pretrained(
+        model, processor = FastModel.from_pretrained(
             load_in_4bit=load_in_4bit,
             max_seq_length=max_seq_length,
             model_name=self.model_name,
         )
 
-        with open("chat_template.txt", "r") as f:
-            processor.chat_template = f.read()
-        processor.tokenizer.chat_template = processor.chat_template
+        # with open("chat_template.txt", "r") as f:
+        #     processor.chat_template = f.read()
+        # processor.tokenizer.chat_template = processor.chat_template
 
-        FastVisionModel.for_inference(model)  # Enable native 2x faster inference
+        FastModel.for_inference(model)  # Enable native 2x faster inference
 
         self.model = model
         self.processor = processor
@@ -166,12 +167,17 @@ class ModelDeployment:
             conversation=messages,
             # documents=documents,
             tools=tools,
+            tokenize=False,  # Return string instead of token IDs
         )
 
         print("prompt:")
         print(prompt)
 
-        inputs = self.processor(text=prompt, images=images, return_tensors="pt")
+        if images:
+            inputs = self.processor(text=prompt, images=images, return_tensors="pt")
+        else:
+            inputs = self.processor(text=prompt, return_tensors="pt")
+
         inputs = inputs.to(self.model.device)
         input_ids = inputs.input_ids
 
@@ -372,3 +378,7 @@ def build_app(cli_args: Dict[str, str]) -> serve.Application:
     return ModelDeployment.options().bind(
         cli_args.get("model_name"),
     )
+
+
+# uv run serve run serve:build_app model_name="HuggingFaceTB/SmolVLM-Instruct"
+# uv run serve run serve:build_app model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit"
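With FastModel in place of FastVisionModel, the same deployment can load both text-only and vision checkpoints, and the new `if images:` branch keeps the processor call valid for either. A minimal standalone sketch of that pattern outside Ray Serve; the `max_seq_length` value, the `add_generation_prompt` flag, and the `generate`/`decode` calls are illustrative assumptions, not taken from this diff:

# Sketch of the optional-image path introduced above; assumes a text-only
# checkpoint, so `processor` is effectively a tokenizer here.
from unsloth import FastModel

model, processor = FastModel.from_pretrained(
    model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
    max_seq_length=2048,  # illustrative value; serve.py's setting is not shown
    load_in_4bit=True,
)
FastModel.for_inference(model)  # same inference toggle as in serve.py

messages = [{"role": "user", "content": "Hello"}]
prompt = processor.apply_chat_template(
    messages,
    tokenize=False,  # return a string, as in the diff
    add_generation_prompt=True,  # assumption: the full serve.py call isn't shown
)

images = []  # populated only for vision models such as SmolVLM-Instruct
if images:
    inputs = processor(text=prompt, images=images, return_tensors="pt")
else:
    inputs = processor(text=prompt, return_tensors="pt")

outputs = model.generate(**inputs.to(model.device), max_new_tokens=50)
print(processor.decode(outputs[0], skip_special_tokens=True))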
serve_test.py ADDED
@@ -0,0 +1,40 @@
+import json
+
+from openai import OpenAI
+
+# Initialize the OpenAI client with the local server
+client = OpenAI(
+    base_url="http://localhost:8000/v1",
+    api_key="not-needed",  # API key is not needed for local server
+)
+
+
+def test_chat_completion():
+    try:
+        print("Sending chat completion request...")
+        response = client.chat.completions.create(
+            model="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
+            messages=[{"role": "user", "content": "Hello"}],
+            temperature=0.7,
+            max_tokens=50,
+        )
+
+        # Print the response
+        print("\nResponse:")
+        print(response.choices[0].message.content)
+
+        # Print full response object for debugging
+        print("\nFull response object:")
+        print(json.dumps(response.model_dump(), indent=2))
+
+    except Exception as e:
+        print(f"Error occurred: {str(e)}")
+        import traceback
+
+        print("\nFull traceback:")
+        print(traceback.format_exc())
+
+
+if __name__ == "__main__":
+    print("Testing chat completions endpoint...")
+    test_chat_completion()
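With the server running (e.g. `uv run serve run serve:build_app model_name="unsloth/SmolLM2-135M-Instruct-bnb-4bit"`), this script exercises a single non-streaming completion. For comparison, a hypothetical streaming variant against the same endpoint is sketched below; it only works if the serve.py deployment implements the OpenAI streaming protocol, which this diff does not show:

# Hypothetical streaming check against the same local endpoint; valid only
# if the deployment supports stream=True in the OpenAI chat protocol.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

stream = client.chat.completions.create(
    model="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=50,
    stream=True,  # server must emit OpenAI-style SSE chunks for this to work
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()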
train.py CHANGED
@@ -412,3 +412,5 @@ Please format your response as a JSON object with two keys:
 
 if __name__ == "__main__":
     main()
+
+# uv run python train.py