"""Launch a vLLM OpenAI-compatible API server with a LoRA adapter attached.

Running this file as a script prints basic CUDA diagnostics and then replaces
the current Python process with ``python -m vllm.entrypoints.openai.api_server``.
"""

import atexit
import os
import sys

import torch

# Report the visible GPU environment up front so it shows in the startup logs.
print("CUDA Available:", torch.cuda.is_available())
print("GPU Count:", torch.cuda.device_count())
print("Device Name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

SYSTEM_PROMPT = (
    "You are a compliance assistant. Use the provided risk data to answer user "
    "questions. If a single risk object is given, provide a direct answer. If a "
    "list of risks is provided, summarize, compare, or analyze the collection as "
    "needed. Always base your response on the data provided."
)

# Import-time snapshot of the Hugging Face token (None when unset). Nothing in
# this file passes it on explicitly; the exec'ed server inherits the process
# environment. Presence is enforced when the server is launched, not at import,
# so the module can be imported without the secret being configured.
hf_token = os.environ.get("HF_TOKEN")


class VllmApiServer:
    """Configure and launch the vLLM OpenAI API server for one base model + adapter.

    Args:
        model_path: Model identifier passed to ``--model``.
        adapter_path: LoRA adapter identifier, registered under the name
            ``adapter`` via ``--lora-modules``.
        port: Port the API server binds to.
        host: Interface the API server binds to.
        max_lora_rank: Value passed to ``--max-lora-rank``.
        tensor_parallel_size: Value passed to ``--tensor-parallel-size``.
    """

    def __init__(
        self,
        model_path="casperhansen/llama-3.3-70b-instruct-awq",
        adapter_path="artemisiaai/fine-tuned-adapter",
        port=7860,  # Default HuggingFace Spaces port
        host="0.0.0.0",
        *,
        max_lora_rank=64,
        tensor_parallel_size=4,
    ):
        self.model_path = model_path
        self.adapter_path = adapter_path
        self.port = port
        self.host = host
        self.max_lora_rank = max_lora_rank
        self.tensor_parallel_size = tensor_parallel_size
        # Never assigned by this class; only relevant if a caller attaches a
        # child process handle here.
        self.server_process = None

        # Register cleanup on exit. NOTE: atexit handlers do not run once
        # os.execvp has replaced the process, so on the normal launch path
        # this hook never fires.
        atexit.register(self._cleanup_server)

    def _build_command(self):
        """Return the argv list used to launch the vLLM OpenAI API server."""
        return [
            # Use the running interpreter so the server starts in the same
            # environment as this script; fall back to a PATH lookup only when
            # the interpreter path is unknown.
            sys.executable or "python",
            "-m", "vllm.entrypoints.openai.api_server",
            "--model", self.model_path,
            "--host", self.host,
            "--port", str(self.port),
            "--enable-lora",
            "--lora-modules", f"adapter={self.adapter_path}",
            "--max-lora-rank", str(self.max_lora_rank),
            "--tensor-parallel-size", str(self.tensor_parallel_size),
        ]

    def _start_vllm_server(self):
        """Start vLLM OpenAI API server by replacing the current process.

        Does not return on success.

        Raises:
            KeyError: If the ``HF_TOKEN`` environment variable is not set.
            OSError: If the interpreter cannot be executed.
        """
        # Same requirement the module used to enforce at import time.
        if "HF_TOKEN" not in os.environ:
            raise KeyError("HF_TOKEN")

        cmd = self._build_command()

        print(f"Starting vLLM server with command: {' '.join(cmd)}")
        print(f"API will be available at: http://{self.host}:{self.port}/v1")

        # exec replaces the process image without flushing Python's buffered
        # streams; flush explicitly or the messages above can be lost when
        # stdout is a pipe.
        sys.stdout.flush()
        sys.stderr.flush()

        # Run as main process (not subprocess for HuggingFace Spaces)
        os.execvp(cmd[0], cmd)

    def _cleanup_server(self):
        """Terminate and reap ``server_process`` if one has been attached.

        A no-op while ``server_process`` is ``None``. Assumes the attribute,
        when set, is a ``subprocess.Popen``-like object — TODO confirm with
        any caller that assigns it.
        """
        if self.server_process:
            self.server_process.terminate()
            self.server_process.wait()

    def run(self):
        """Start the vLLM API server"""
        self._start_vllm_server()


if __name__ == "__main__":
    server = VllmApiServer()
    server.run()