import os
import atexit
import torch

# Quick GPU diagnostics, printed at startup so they appear in the Space logs
print("CUDA Available:", torch.cuda.is_available())
print("GPU Count:", torch.cuda.device_count())
print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

# System prompt intended for clients querying the compliance assistant
SYSTEM_PROMPT = "You are a compliance assistant. Use the provided risk data to answer user questions. If a single risk object is given, provide a direct answer. If a list of risks is provided, summarize, compare, or analyze the collection as needed. Always base your response on the data provided."

# Fail fast if the Hugging Face token is missing; huggingface_hub/vLLM read
# HF_TOKEN from the environment when downloading gated or private weights.
hf_token = os.environ["HF_TOKEN"]

class VllmApiServer:
    def __init__(
        self,
        model_path="casperhansen/llama-3.3-70b-instruct-awq",
        adapter_path="artemisiaai/fine-tuned-adapter",
        port=7860,  # Default HuggingFace Spaces port
        host="0.0.0.0",
    ):
        self.model_path = model_path
        self.adapter_path = adapter_path
        self.port = port
        self.host = host
        self.server_process = None

        # Register cleanup on exit
        atexit.register(self._cleanup_server)
    def _start_vllm_server(self):
        """Start the vLLM OpenAI-compatible API server."""
        cmd = [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", self.model_path,
            "--host", self.host,
            "--port", str(self.port),
            "--enable-lora",
            "--lora-modules", f"adapter={self.adapter_path}",
            "--max-lora-rank", "64",
            "--tensor-parallel-size", "4",
        ]

        print(f"Starting vLLM server with command: {' '.join(cmd)}")
        print(f"API will be available at: http://{self.host}:{self.port}/v1")

        # Replace the current process with the vLLM server (no subprocess,
        # which suits HuggingFace Spaces where this script is the main process)
        os.execvp("python", cmd)
    def _cleanup_server(self):
        """Clean up the vLLM server process. Only relevant if the server were
        launched as a subprocess; os.execvp above replaces this process, so
        server_process normally stays None and this is a no-op."""
        if self.server_process:
            self.server_process.terminate()
            self.server_process.wait()
    def run(self):
        """Start the vLLM API server."""
        self._start_vllm_server()


if __name__ == "__main__":
    server = VllmApiServer()
    server.run()
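
# ---------------------------------------------------------------------------
# Example client usage (reference only; never executed here, since os.execvp
# replaces this process before returning). The LoRA module registered via
# --lora-modules is exposed as a model named "adapter" on the OpenAI-compatible
# API. A minimal sketch, assuming the `openai` Python package and a client
# pointed at this server's /v1 endpoint:
#
#   from openai import OpenAI
#
#   client = OpenAI(base_url="http://localhost:7860/v1", api_key="EMPTY")
#   response = client.chat.completions.create(
#       model="adapter",
#       messages=[
#           {"role": "system", "content": SYSTEM_PROMPT},
#           {"role": "user", "content": "Summarize the provided risk data."},
#       ],
#   )
#   print(response.choices[0].message.content)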