import os
import ctypes

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import llama_cpp
import uvicorn

# === Suppress llama_cpp logs ===
@ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)
def silent_log_callback(level, message, user_data):
    pass

llama_cpp.llama_log_set(silent_log_callback, None)

# === Model configuration ===
os.environ.setdefault("HF_HOME", "/root/.cache/huggingface")
CACHE_DIR = os.environ["HF_HOME"]
MODEL_REPO = "Qwen/Qwen1.5-1.8B-Chat-GGUF"
MODEL_FILE = "qwen1_5-1_8b-chat-q4_k_m.gguf"

# === Download model if not already present ===
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    cache_dir=CACHE_DIR,
    local_dir=CACHE_DIR,
    local_dir_use_symlinks=False,
)

# === Load LLM ===
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_batch=512,
    n_threads=os.cpu_count(),
    n_gpu_layers=0,  # CPU-only
    use_mmap=True,
    use_mlock=False,
    verbose=False,
)

# === Initialize FastAPI ===
app = FastAPI()

# === Streaming generator ===
def stream_chat_response(user_query: str):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful AI assistant that reads research papers, "
                "highlights the key points, and explains the paper in simple terms."
            ),
        },
        {"role": "user", "content": user_query},
    ]

    def generate():
        try:
            for chunk in llm.create_chat_completion(
                messages=messages,
                temperature=0.2,
                top_k=50,
                top_p=0.9,
                repeat_penalty=1.1,
                max_tokens=512,
                stream=True,
            ):
                if "choices" in chunk:
                    # Each streamed chunk carries a delta; the first one holds
                    # the role, later ones carry the generated text.
                    token = chunk["choices"][0]["delta"].get("content", "")
                    yield f"data: {token}\n\n"
        except Exception as e:
            yield f"data: [ERROR] {str(e)}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")

# === Root ===
@app.get("/")
def read_root():
    return {"message": "LLM is ready. Send POST to /chat"}

# === POST endpoint to chat with paper ===
@app.post("/chat")
async def chat_with_llm(request: Request):
    data = await request.json()
    user_query = data.get("query", "")
    return stream_chat_response(user_query)

# === Run the app (required for HF Spaces) ===
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
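
# === Example client usage (illustrative sketch, not part of the app) ===
# The /chat endpoint streams Server-Sent Events. A minimal consumer, assuming
# the `requests` package is installed and the server is listening on
# localhost:7860, might look like this:
#
#   import requests
#
#   with requests.post(
#       "http://localhost:7860/chat",
#       json={"query": "Summarize the key points of this paper."},
#       stream=True,
#   ) as resp:
#       for line in resp.iter_lines(decode_unicode=True):
#           if line and line.startswith("data: "):
#               print(line[len("data: "):], end="", flush=True)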