# Qwen_API / app.py
import os
import time
import ctypes
from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import llama_cpp
import uvicorn
# === Suppress llama_cpp logs ===
@ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)
def silent_log_callback(level, message, user_data):
    pass
llama_cpp.llama_log_set(silent_log_callback, None)
# === Model configuration ===
os.environ.setdefault("HF_HOME", "/root/.cache/huggingface")
CACHE_DIR = os.environ["HF_HOME"]
MODEL_REPO = "Qwen/Qwen1.5-1.8B-Chat-GGUF"
MODEL_FILE = "qwen1_5-1_8b-chat-q4_k_m.gguf"
# === Download model if not present ===
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    cache_dir=CACHE_DIR,
    local_dir=CACHE_DIR,
    local_dir_use_symlinks=False,  # deprecated and ignored by recent huggingface_hub releases
)
# === Load LLM ===
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_batch=512,
    n_threads=os.cpu_count(),
    n_gpu_layers=0,  # CPU-only
    use_mmap=True,   # memory-map the model file instead of copying it into RAM
    use_mlock=False,
    verbose=False,
)
# === Initialize FastAPI ===
app = FastAPI()
# === Streaming generator ===
def stream_chat_response(user_query: str):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful AI assistant that reads research papers, "
                "highlights the key points, and explains the paper in simple terms."
            ),
        },
        {
            "role": "user",
            "content": user_query,
        },
    ]
    def generate():
        try:
            for chunk in llm.create_chat_completion(
                messages=messages,
                temperature=0.2,
                top_k=50,
                top_p=0.9,
                repeat_penalty=1.1,
                max_tokens=512,
                stream=True,
            ):
                if "choices" in chunk:
                    token = chunk["choices"][0]["delta"].get("content", "")
                    if token:
                        # SSE events are newline-delimited, so a token that contains
                        # newlines must be split across multiple "data:" lines.
                        payload = "\n".join(f"data: {line}" for line in token.split("\n"))
                        yield f"{payload}\n\n"
        except Exception as e:
            yield f"data: [ERROR] {e}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
# === Root ===
@app.get("/")
def read_root():
    return {"message": "LLM is ready. Send POST to /chat"}
# === POST endpoint to chat with paper ===
@app.post("/chat")
async def chat_with_llm(request: Request):
    data = await request.json()
    user_query = data.get("query", "")
    return stream_chat_response(user_query)
# === Run the app (required for HF Spaces) ===
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
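
# --- Illustrative client (not part of the app) ---
# A minimal sketch of how a client might consume the /chat SSE stream once the
# server is running, assuming it listens on localhost:7860 as configured above.
# It uses the third-party `requests` package; the query text is only an example.
#
#   import requests
#
#   with requests.post(
#       "http://localhost:7860/chat",
#       json={"query": "Summarize the key points of this paper."},
#       stream=True,
#   ) as resp:
#       for line in resp.iter_lines(decode_unicode=True):
#           if line and line.startswith("data: "):
#               print(line[len("data: "):], end="", flush=True)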