Enhance serve.py with fine-tuning job management, including job creation, status tracking, and a training process that runs in a separate thread. Update serve_test.py to include a test for the fine-tuning endpoints. Modify .gitignore to exclude model files. This update improves model training capabilities and API integration.
- .gitignore +1 -0
- serve.py +254 -9
- serve_test.py +64 -0
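For orientation before the diffs: the new routes follow the OpenAI fine-tuning API shape, so the stock openai Python client can drive them end to end. A minimal sketch, assuming a local Ray Serve deployment (the base_url and api_key values are placeholders, not part of this commit):

# Sketch: exercising the new endpoints with the stock OpenAI client.
# base_url/api_key are assumptions for a local deployment.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

job = client.fine_tuning.jobs.create(          # POST /v1/fine_tuning/jobs
    training_file="training_data.json",
    model="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
)
client.fine_tuning.jobs.list()                 # GET  /v1/fine_tuning/jobs
client.fine_tuning.jobs.retrieve(job.id)       # GET  /v1/fine_tuning/jobs/{job_id}
client.fine_tuning.jobs.cancel(job.id)         # POST /v1/fine_tuning/jobs/{job_id}/cancel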
.gitignore CHANGED
@@ -2,6 +2,7 @@
 logs
 lora_model
 memory_snapshot.pickle
+models
 outputs
 __pycache__
 .pytest_cache
serve.py CHANGED
@@ -17,7 +17,21 @@ from unsloth.chat_templates import get_chat_template # noqa: E402
 
 # isort: on
 
-from fastapi import FastAPI, Request
+import asyncio
+import json
+import threading
+import uuid
+from datetime import datetime
+from typing import Dict, List, Optional
+
+from datasets import (
+    Dataset,
+    DatasetDict,
+    IterableDataset,
+    IterableDatasetDict,
+    load_dataset,
+)
+from fastapi import FastAPI, HTTPException, Request
 from openai.types.chat.chat_completion import ChatCompletion
 from openai.types.chat.chat_completion import Choice as ChatCompletionChoice
 from openai.types.chat.chat_completion_chunk import ChatCompletionChunk
@@ -25,12 +39,24 @@ from openai.types.chat.chat_completion_chunk import Choice as ChatCompletionChun
 from openai.types.chat.chat_completion_chunk import ChoiceDelta
 from openai.types.chat.chat_completion_message import ChatCompletionMessage
 from openai.types.chat.completion_create_params import CompletionCreateParams
+from openai.types.fine_tuning import FineTuningJob
+from peft import PeftModel
 from pydantic import TypeAdapter
 from ray import serve
+from smolagents import CodeAgent, LiteLLMModel, Model, TransformersModel, VLLMModel
+from smolagents.monitoring import LogLevel
 from sse_starlette import EventSourceResponse
 from starlette.responses import JSONResponse
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    Trainer,
+    TrainingArguments,
+)
 from transformers.generation.streamers import AsyncTextIteratorStreamer
 from transformers.image_utils import load_image
+from trl import SFTTrainer
 
 dtype = (
     None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
@@ -92,21 +118,30 @@ class ModelDeployment:
         model_name: str,
     ):
         self.model_name = model_name
+        self.fine_tuning_jobs: Dict[str, FineTuningJob] = {}
+        self.training_threads: Dict[str, threading.Thread] = {}
 
-        model, processor = FastModel.from_pretrained(
+        # Load base model and processor
+        self.model, self.processor = FastModel.from_pretrained(
             load_in_4bit=load_in_4bit,
             max_seq_length=max_seq_length,
             model_name=self.model_name,
         )
 
-        #
-
-
-
-
+        # Configure LoRA for fine-tuning
+        self.model = FastModel.get_peft_model(
+            self.model,
+            r=16, # LoRA rank
+            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
+            lora_alpha=32,
+            lora_dropout=0.05,
+            bias="none",
+            use_gradient_checkpointing=True,
+            random_state=42,
+            use_rslora=False,
+        )
 
-        self.model = model
-        self.processor = processor
+        FastModel.for_inference(self.model) # Enable native 2x faster inference
 
     def reconfigure(self, config: Dict[str, Any]):
         print("=== reconfigure ===")
@@ -114,6 +149,216 @@ class ModelDeployment:
         print(config)
         # https://docs.ray.io/en/latest/serve/production-guide/config.html#dynamically-change-parameters-without-restarting-replicas-user-config
 
+    def _run_training(self, job_id: str, training_file: str, model_name: str):
+        """Run the training process in a separate thread."""
+        try:
+            # Update job status to queued
+            self.fine_tuning_jobs[job_id].status = "queued"
+
+            # Simulate file validation
+            time.sleep(2)
+
+            # Update job status to running
+            self.fine_tuning_jobs[job_id].status = "running"
+            self.fine_tuning_jobs[job_id].started_at = int(datetime.now().timestamp())
+
+            # Load and prepare dataset
+            dataset = load_dataset("json", data_files=training_file)
+
+            # Configure chat template
+            tokenizer = get_chat_template(
+                self.processor,
+                chat_template="chatml",
+                mapping={
+                    "role": "from",
+                    "content": "value",
+                    "user": "human",
+                    "assistant": "gpt",
+                },
+                map_eos_token=True,
+            )
+
+            # Format dataset
+            def formatting_prompts_func(examples):
+                convos = examples["conversations"]
+                texts = [
+                    tokenizer.apply_chat_template(
+                        convo, tokenize=False, add_generation_prompt=False
+                    )
+                    for convo in convos
+                ]
+                return {"text": texts}
+
+            dataset = dataset.map(formatting_prompts_func, batched=True)
+
+            # Configure training arguments
+            training_args = TrainingArguments(
+                output_dir=f"models/{job_id}",
+                num_train_epochs=3,
+                per_device_train_batch_size=4,
+                gradient_accumulation_steps=4,
+                learning_rate=2e-4,
+                fp16=True,
+                logging_steps=10,
+                save_strategy="epoch",
+                optim="adamw_torch",
+                warmup_ratio=0.1,
+                lr_scheduler_type="cosine",
+                weight_decay=0.01,
+            )
+
+            # Create data collator
+            data_collator = DataCollatorForLanguageModeling(
+                tokenizer=tokenizer,
+                mlm=False,
+            )
+
+            # Create trainer
+            trainer = SFTTrainer(
+                model=self.model,
+                tokenizer=tokenizer,
+                train_dataset=dataset["train"],
+                args=training_args,
+                data_collator=data_collator,
+                max_seq_length=max_seq_length,
+                packing=False,
+            )
+
+            # Train
+            trainer.train()
+
+            # Save model and adapter
+            output_dir = f"models/{job_id}"
+            os.makedirs(output_dir, exist_ok=True)
+
+            # Save the base model config and tokenizer
+            self.model.config.save_pretrained(output_dir)
+            tokenizer.save_pretrained(output_dir)
+
+            # Save the adapter weights
+            self.model.save_pretrained(output_dir)
+
+            # Save the merged model in 16-bit format
+            try:
+                # First try to merge and save in 16-bit
+                self.model.save_pretrained_merged(
+                    output_dir,
+                    tokenizer,
+                    save_method="merged_16bit",
+                )
+            except Exception as merge_error:
+                print(f"Failed to merge weights: {str(merge_error)}")
+                # If merging fails, just save the adapter weights
+                self.model.save_pretrained(output_dir)
+
+            # Update job status to succeeded
+            self.fine_tuning_jobs[job_id].status = "succeeded"
+            self.fine_tuning_jobs[job_id].finished_at = int(datetime.now().timestamp())
+            self.fine_tuning_jobs[job_id].trained_tokens = (
+                trainer.state.global_step * training_args.per_device_train_batch_size
+            )
+
+            # Add result files
+            result_files = [
+                f"{output_dir}/config.json",
+                f"{output_dir}/tokenizer.json",
+                f"{output_dir}/adapter_config.json",
+                f"{output_dir}/adapter_model.bin",
+            ]
+
+            # Add merged model files if they exist
+            if os.path.exists(f"{output_dir}/pytorch_model.bin"):
+                result_files.append(f"{output_dir}/pytorch_model.bin")
+
+            self.fine_tuning_jobs[job_id].result_files = result_files
+
+        except Exception as e:
+            # Update job status to failed
+            self.fine_tuning_jobs[job_id].status = "failed"
+            self.fine_tuning_jobs[job_id].finished_at = int(datetime.now().timestamp())
+            self.fine_tuning_jobs[job_id].error = str(e)
+            print(f"Training failed: {str(e)}")
+            import traceback
+
+            print(traceback.format_exc())
+
+    @app.post("/v1/fine_tuning/jobs")
+    async def create_fine_tuning_job(self, body: dict):
+        """Create a fine-tuning job."""
+        try:
+            # Validate required fields
+            if "training_file" not in body:
+                raise HTTPException(status_code=400, detail="training_file is required")
+            if "model" not in body:
+                raise HTTPException(status_code=400, detail="model is required")
+
+            # Generate job ID
+            job_id = f"ftjob-{uuid.uuid4().hex[:8]}"
+
+            # Create job object
+            job = FineTuningJob(
+                id=job_id,
+                object="fine_tuning.job",
+                created_at=int(datetime.now().timestamp()),
+                finished_at=None,
+                model=body["model"],
+                fine_tuned_model=None,
+                organization_id="org-123",
+                status="validating_files", # Start with validating_files
+                hyperparameters=body.get("hyperparameters", {}),
+                training_file=body["training_file"],
+                trained_tokens=None,
+                error=None,
+                result_files=[], # Required field
+                seed=42, # Required field
+            )
+
+            # Store job
+            self.fine_tuning_jobs[job_id] = job
+
+            # Start training in background thread
+            thread = threading.Thread(
+                target=self._run_training,
+                args=(job_id, body["training_file"], body["model"]),
+            )
+            thread.start()
+            self.training_threads[job_id] = thread
+
+            return job.model_dump()
+
+        except Exception as e:
+            raise HTTPException(status_code=500, detail=str(e))
+
+    @app.get("/v1/fine_tuning/jobs")
+    async def list_fine_tuning_jobs(self):
+        """List all fine-tuning jobs."""
+        return {
+            "object": "list",
+            "data": [job.model_dump() for job in self.fine_tuning_jobs.values()],
+        }
+
+    @app.get("/v1/fine_tuning/jobs/{job_id}")
+    async def get_fine_tuning_job(self, job_id: str):
+        """Get a specific fine-tuning job."""
+        if job_id not in self.fine_tuning_jobs:
+            raise HTTPException(status_code=404, detail="Job not found")
+        return self.fine_tuning_jobs[job_id].model_dump()
+
+    @app.post("/v1/fine_tuning/jobs/{job_id}/cancel")
+    async def cancel_fine_tuning_job(self, job_id: str):
+        """Cancel a fine-tuning job."""
+        if job_id not in self.fine_tuning_jobs:
+            raise HTTPException(status_code=404, detail="Job not found")
+
+        job = self.fine_tuning_jobs[job_id]
+        if job.status not in ["created", "running"]:
+            raise HTTPException(status_code=400, detail="Job cannot be cancelled")
+
+        job.status = "cancelled"
+        job.finished_at = int(datetime.now().timestamp())
+
+        return job.model_dump()
+
     @app.post("/v1/chat/completions")
     async def create_chat_completion(self, body: dict, raw_request: Request):
         """Creates a model response for the given chat conversation. Learn more in the [text generation](/docs/guides/text-generation), [vision](/docs/guides/vision), and [audio](/docs/guides/audio) guides. Parameter support can differ depending on the model used to generate the response, particularly for newer reasoning models. Parameters that are only supported for reasoning models are noted below. For the current state of unsupported parameters in reasoning models, [refer to the reasoning guide](/docs/guides/reasoning).
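Two reading notes on _run_training: it calls time.sleep and os.makedirs, so serve.py must already import time and os above this diff's first hunk (not shown here); and load_dataset("json", data_files=...) must yield rows with a "conversations" column whose value is a full list of ShareGPT-style turns, to match the chatml mapping. A sketch of one plausible JSON-lines record (filename and content are illustrative, not from this commit):

# Illustrative training file: one conversation per JSON line, each a
# list of {"from", "value"} turns matching the chat-template mapping.
import json

record = {
    "conversations": [
        {"from": "human", "value": "What is the capital of France?"},
        {"from": "gpt", "value": "The capital of France is Paris."},
    ]
}
with open("training_data.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")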
serve_test.py CHANGED
@@ -1,4 +1,6 @@
 import json
+import os
+import time
 
 from openai import OpenAI
 
@@ -35,6 +37,68 @@ def test_chat_completion():
         print(traceback.format_exc())
 
 
+def test_fine_tuning():
+    try:
+        # Create a sample training file
+        training_data = {
+            "conversations": [
+                {
+                    "from": "human",
+                    "value": "What is the capital of France?",
+                },
+                {
+                    "from": "gpt",
+                    "value": "The capital of France is Paris.",
+                },
+            ]
+        }
+
+        training_file = "training_data.json"
+        with open(training_file, "w") as f:
+            json.dump(training_data, f)
+
+        print("\nCreating fine-tuning job...")
+        job = client.fine_tuning.jobs.create(
+            training_file=training_file,
+            model="unsloth/SmolLM2-135M-Instruct-bnb-4bit",
+        )
+        print(f"Created job: {job.id}")
+
+        # Wait for job to start
+        print("\nWaiting for job to start...")
+        time.sleep(2)
+
+        # List jobs
+        print("\nListing fine-tuning jobs...")
+        jobs = client.fine_tuning.jobs.list()
+        print(f"Found {len(jobs.data)} jobs")
+
+        # Get job status
+        print("\nGetting job status...")
+        job = client.fine_tuning.jobs.retrieve(job.id)
+        print(f"Job status: {job.status}")
+
+        # Wait for job to complete or fail
+        print("\nWaiting for job to complete...")
+        while job.status in ["created", "running"]:
+            time.sleep(5)
+            job = client.fine_tuning.jobs.retrieve(job.id)
+            print(f"Job status: {job.status}")
+
+        # Clean up
+        os.remove(training_file)
+
+    except Exception as e:
+        print(f"Error occurred: {str(e)}")
+        import traceback
+
+        print("\nFull traceback:")
+        print(traceback.format_exc())
+
+
 if __name__ == "__main__":
     print("Testing chat completions endpoint...")
     test_chat_completion()
+
+    print("\nTesting fine-tuning endpoints...")
+    test_fine_tuning()
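One caveat in the completion-wait loop above: serve.py moves jobs through "validating_files" → "queued" → "running", but the loop only continues while the status is "created" or "running", so it can return while a job is still validating or queued. A more robust variant polls until a terminal status; a minimal sketch (wait_for_job is a hypothetical helper, not part of this commit):

# Hypothetical helper: poll until the job reaches a terminal status,
# using the lifecycle states that serve.py actually sets.
def wait_for_job(client, job_id, interval=5):
    terminal = {"succeeded", "failed", "cancelled"}
    while True:
        job = client.fine_tuning.jobs.retrieve(job_id)
        print(f"Job status: {job.status}")
        if job.status in terminal:
            return job
        time.sleep(interval)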