File size: 1,998 Bytes
807c87c
99b4862
2217b4e
 
 
 
d1d6df8
d9e2d70
981bdb7
d9e2d70
2217b4e
d9e2d70
 
 
 
2217b4e
 
d9e2d70
2217b4e
d9e2d70
2217b4e
 
 
1dc5913
2217b4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1dc5913
2217b4e
 
b44d8f4
2217b4e
 
6be943f
2217b4e
 
 
 
 
6be943f
2217b4e
 
 
6be943f
a7f8386
2217b4e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import json
import subprocess
import time
import signal
import atexit

# System prompt for the compliance-assistant chat flow.
# NOTE(review): neither SYSTEM_PROMPT nor hf_token is referenced anywhere
# else in this file — presumably consumed by another module; verify before
# removing.
SYSTEM_PROMPT = "You are a compliance assistant. Use the provided risk data to answer user questions. If a single risk object is given, provide a direct answer. If a list of risks is provided, summarize, compare, or analyze the collection as needed. Always base your response on the data provided."
# Hugging Face access token; raises KeyError at import time if HF_TOKEN is unset.
hf_token = os.environ["HF_TOKEN"]

class VllmApiServer:
    """Launcher for a vLLM OpenAI-compatible API server with one LoRA adapter.

    ``run()`` replaces the current process via ``os.execvp`` (required for
    HuggingFace Spaces, where the server must be the main process) and
    therefore does not return on success.

    NOTE(review): because ``execvp`` replaces the process image, the
    atexit-registered cleanup below can never run in the exec path, and
    ``server_process`` is never assigned by this class.  Both are retained
    for backward compatibility with callers that manage a subprocess and
    set ``server_process`` themselves.
    """

    def __init__(
        self,
        model_path="casperhansen/llama-3.3-70b-instruct-awq",
        adapter_path="artemisiaai/fine-tuned-adapter",
        port=7860,  # Default HuggingFace Spaces port
        host="0.0.0.0"
    ):
        self.model_path = model_path      # base model to serve
        self.adapter_path = adapter_path  # LoRA adapter, exposed as "adapter"
        self.port = port
        self.host = host
        self.server_process = None        # only set if a caller manages a subprocess

        # Register cleanup on exit (a no-op unless server_process was set).
        atexit.register(self._cleanup_server)

    def _start_vllm_server(self):
        """Exec the vLLM OpenAI API server, replacing the current process."""
        cmd = [
            "python", "-m", "vllm.entrypoints.openai.api_server",
            "--model", self.model_path,
            "--host", self.host,
            "--port", str(self.port),
            "--enable-lora",
            "--lora-modules", f"adapter={self.adapter_path}",
            "--max-lora-rank", "64",
            "--tensor-parallel-size", "1"
        ]

        # flush=True: os.execvp replaces the process image without flushing
        # Python's stdio buffers, so unflushed diagnostics would be lost.
        print(f"Starting vLLM server with command: {' '.join(cmd)}", flush=True)
        print(f"API will be available at: http://{self.host}:{self.port}/v1", flush=True)

        # Run as main process (not subprocess for HuggingFace Spaces)
        os.execvp("python", cmd)

    def _cleanup_server(self):
        """Terminate and reap the vLLM subprocess, if one was ever started."""
        if self.server_process:
            self.server_process.terminate()
            self.server_process.wait()

    def run(self):
        """Start the vLLM API server; does not return on success."""
        self._start_vllm_server()

if __name__ == "__main__":
    # Launch the API server; run() replaces this process and never returns.
    VllmApiServer().run()