# HuggingFace Spaces app: launches a vLLM OpenAI-compatible API server.
# (Scrape artifacts from the Spaces file viewer removed.)
import atexit
import json
import os
import signal
import subprocess
import sys
import time
# System prompt for the compliance assistant served by this app.
# NOTE(review): not referenced anywhere in this file — presumably consumed
# by a downstream client or injected at inference time; verify before removing.
SYSTEM_PROMPT = "You are a compliance assistant. Use the provided risk data to answer user questions. If a single risk object is given, provide a direct answer. If a list of risks is provided, summarize, compare, or analyze the collection as needed. Always base your response on the data provided."
# Fails fast with KeyError when HF_TOKEN is unset. The variable itself is
# unused below; presumably the env var is read by huggingface_hub/vLLM when
# downloading gated models — TODO confirm.
hf_token = os.environ["HF_TOKEN"]
class VllmApiServer:
    """Launcher for a vLLM OpenAI-compatible API server with a LoRA adapter.

    Designed for HuggingFace Spaces: ``run()`` replaces the current process
    with the vLLM server via ``os.execvp`` rather than spawning a child
    process, so the server becomes the Space's main process.
    """

    def __init__(
        self,
        model_path="casperhansen/llama-3.3-70b-instruct-awq",
        adapter_path="artemisiaai/fine-tuned-adapter",
        port=7860,  # Default HuggingFace Spaces port
        host="0.0.0.0",
    ):
        self.model_path = model_path
        self.adapter_path = adapter_path
        self.port = port
        self.host = host
        # Only ever set if a subprocess-based launch is used; with the
        # os.execvp path below this stays None and cleanup is a no-op.
        self.server_process = None
        # Register cleanup on interpreter exit. NOTE: atexit handlers do
        # NOT run after a successful exec — this is purely defensive for
        # a future subprocess-based launch.
        atexit.register(self._cleanup_server)

    def _start_vllm_server(self):
        """Replace the current process with the vLLM OpenAI API server.

        Uses ``sys.executable`` (not a bare ``"python"`` resolved via PATH)
        so the server runs under the same interpreter/virtualenv as this
        script. On success this call does not return.
        """
        cmd = [
            sys.executable, "-m", "vllm.entrypoints.openai.api_server",
            "--model", self.model_path,
            "--host", self.host,
            "--port", str(self.port),
            "--enable-lora",
            # Expose the fine-tuned LoRA adapter under the model name "adapter".
            "--lora-modules", f"adapter={self.adapter_path}",
            "--max-lora-rank", "64",
            "--tensor-parallel-size", "1",
        ]
        print(f"Starting vLLM server with command: {' '.join(cmd)}")
        print(f"API will be available at: http://{self.host}:{self.port}/v1")
        # Run as the main process (not a subprocess) for HuggingFace Spaces.
        os.execvp(cmd[0], cmd)

    def _cleanup_server(self):
        """Terminate and reap the vLLM child process, if one was spawned."""
        if self.server_process:
            self.server_process.terminate()
            self.server_process.wait()

    def run(self):
        """Start the vLLM API server (does not return on success)."""
        self._start_vllm_server()
if __name__ == "__main__":
    # Script entry point: hand control to the vLLM launcher. The call
    # replaces the current process and does not return on success.
    VllmApiServer().run()