import os
import time
import ctypes
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import llama_cpp
import uvicorn

# === Suppress llama_cpp logs ===
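# llama.cpp expects a C-style log callback taking (level, message, user_data);
# registering a no-op callback keeps model-load chatter out of the server logs.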
@ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)
def silent_log_callback(level, message, user_data):
    pass

llama_cpp.llama_log_set(silent_log_callback, None)

# === Model configuration ===
os.environ.setdefault("HF_HOME", "/root/.cache/huggingface")
CACHE_DIR = os.environ["HF_HOME"]
MODEL_REPO = "Qwen/Qwen1.5-1.8B-Chat-GGUF"
MODEL_FILE = "qwen1_5-1_8b-chat-q4_k_m.gguf"

# === Download model if not present ===
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    cache_dir=CACHE_DIR,
    local_dir=CACHE_DIR,
    local_dir_use_symlinks=False
)

# === Load LLM ===
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_batch=512,
    n_threads=os.cpu_count(),
    n_gpu_layers=0,            # CPU-only
    use_mmap=True,
    use_mlock=False,
    verbose=False
)
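
# Optional smoke test (kept commented out so it does not run on startup).
# A minimal sketch using the same create_chat_completion API the server
# relies on below; useful for verifying the model loads and responds.
#
#   reply = llm.create_chat_completion(
#       messages=[{"role": "user", "content": "Say hello in one sentence."}],
#       max_tokens=32,
#   )
#   print(reply["choices"][0]["message"]["content"])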

# === Initialize FastAPI ===
app = FastAPI()

# === Streaming generator ===
def stream_chat_response(user_query: str):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful AI assistant that reads research papers, "
                "highlights the key points, and explains the paper in simple terms."
            )
        },
        {
            "role": "user",
            "content": user_query
        }
    ]

    def generate():
        try:
            for chunk in llm.create_chat_completion(
                messages=messages,
                temperature=0.2,
                top_k=50,
                top_p=0.9,
                repeat_penalty=1.1,
                max_tokens=512,
                stream=True,
            ):
                if "choices" in chunk:
                    token = chunk["choices"][0]["delta"].get("content", "")
                    yield f"data: {token}\n\n"
        except Exception as e:
            yield f"data: [ERROR] {str(e)}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")

# === Root ===
@app.get("/")
def read_root():
    return {"message": "LLM is ready. Send POST to /chat"}

# === POST endpoint to chat with paper ===
@app.post("/chat")
async def chat_with_llm(request: Request):
    data = await request.json()
    user_query = data.get("query", "")
    return stream_chat_response(user_query)
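
# Example request (a sketch; assumes the server is reachable on localhost:7860
# and that the client keeps the connection open to read the SSE stream):
#
#   curl -N -X POST http://localhost:7860/chat \
#        -H "Content-Type: application/json" \
#        -d '{"query": "Summarize the key points of this paper."}'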

# === Run the app (required for HF Spaces) ===
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)