import os
import ctypes

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import llama_cpp
import uvicorn
# === Suppress llama_cpp logs ===
def silent_log_callback(level, message, user_data):
    pass

# llama_log_set expects a C callback, so wrap the Python function with ctypes
# and keep a module-level reference to it so it is not garbage-collected.
_log_callback = ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)(silent_log_callback)
llama_cpp.llama_log_set(_log_callback, ctypes.c_void_p(0))
# === Model configuration ===
os.environ.setdefault("HF_HOME", "/root/.cache/huggingface")
CACHE_DIR = os.environ["HF_HOME"]
MODEL_REPO = "Qwen/Qwen1.5-1.8B-Chat-GGUF"
MODEL_FILE = "qwen1_5-1_8b-chat-q4_k_m.gguf"
# === Download model if not already cached ===
# hf_hub_download skips the download when the file is already in the cache.
# (local_dir_use_symlinks is deprecated in huggingface_hub and is omitted here.)
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    cache_dir=CACHE_DIR,
)
# === Load LLM ===
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_batch=512,
    n_threads=os.cpu_count(),
    n_gpu_layers=0,   # CPU-only
    use_mmap=True,    # memory-map the model file instead of loading it fully into RAM
    use_mlock=False,
    verbose=False,
)
# Note: low_cpu_mem_usage is a transformers argument, not a llama-cpp-python one,
# so it has been removed.
# === Initialize FastAPI ===
app = FastAPI()
# === Streaming generator ===
def stream_chat_response(user_query: str):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful AI assistant that reads research papers, "
                "highlights the key points, and explains the paper in simple terms."
            ),
        },
        {
            "role": "user",
            "content": user_query,
        },
    ]

    def generate():
        try:
            for chunk in llm.create_chat_completion(
                messages=messages,
                temperature=0.2,
                top_k=50,
                top_p=0.9,
                repeat_penalty=1.1,
                max_tokens=512,
                stream=True,
            ):
                if "choices" in chunk:
                    token = chunk["choices"][0]["delta"].get("content", "")
                    if token:  # skip empty deltas (e.g. the initial role-only chunk)
                        yield f"data: {token}\n\n"
        except Exception as e:
            yield f"data: [ERROR] {str(e)}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
# === Root ===
@app.get("/")
def read_root():
    return {"message": "LLM is ready. Send POST to /chat"}
# === POST endpoint to chat with the paper ===
@app.post("/chat")
async def chat_with_llm(request: Request):
    data = await request.json()
    user_query = data.get("query", "")
    return stream_chat_response(user_query)
# === Run the app (required for HF Spaces) ===
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
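
# Example client (a minimal sketch, not part of the app): /chat streams
# Server-Sent Events, so a client can read the response line by line as the
# tokens arrive. The `requests` dependency and the localhost URL below are
# assumptions for local testing; on Spaces, substitute the Space's public URL.
#
#   import requests
#
#   with requests.post(
#       "http://localhost:7860/chat",
#       json={"query": "Summarize the attention mechanism in simple terms."},
#       stream=True,
#   ) as resp:
#       for line in resp.iter_lines(decode_unicode=True):
#           if line.startswith("data: "):
#               print(line[len("data: "):], end="", flush=True)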