import os
import ctypes

from fastapi import FastAPI, Request
from fastapi.responses import StreamingResponse
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
import llama_cpp
import uvicorn
# === Suppress llama_cpp logs ===
def silent_log_callback(level, message, user_data):
    pass

# llama_log_set expects a C callback, so wrap the Python function with ctypes
# and keep a module-level reference to it so it is not garbage-collected.
_log_callback = ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)(silent_log_callback)
llama_cpp.llama_log_set(_log_callback, ctypes.c_void_p(0))
# === Model configuration ===
os.environ.setdefault("HF_HOME", "/root/.cache/huggingface")
CACHE_DIR = os.environ["HF_HOME"]
MODEL_REPO = "Qwen/Qwen1.5-1.8B-Chat-GGUF"
MODEL_FILE = "qwen1_5-1_8b-chat-q4_k_m.gguf"
# === Download model if not already cached ===
# hf_hub_download skips the download when the file is already in the cache.
# (local_dir_use_symlinks is deprecated in huggingface_hub and is omitted here.)
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename=MODEL_FILE,
    cache_dir=CACHE_DIR,
)
# === Load LLM ===
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_batch=512,
    n_threads=os.cpu_count(),
    n_gpu_layers=0,   # CPU-only
    use_mmap=True,    # memory-map the model file instead of loading it fully into RAM
    use_mlock=False,
    verbose=False,
)
# Note: low_cpu_mem_usage is a transformers argument, not a llama-cpp-python one,
# so it has been removed.
# === Initialize FastAPI ===
app = FastAPI()
# === Streaming generator ===
def stream_chat_response(user_query: str):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a helpful AI assistant that reads research papers, "
                "highlights the key points, and explains the paper in simple terms."
            ),
        },
        {
            "role": "user",
            "content": user_query,
        },
    ]

    def generate():
        try:
            for chunk in llm.create_chat_completion(
                messages=messages,
                temperature=0.2,
                top_k=50,
                top_p=0.9,
                repeat_penalty=1.1,
                max_tokens=512,
                stream=True,
            ):
                if "choices" in chunk:
                    token = chunk["choices"][0]["delta"].get("content", "")
                    if token:  # skip empty deltas (e.g. the initial role-only chunk)
                        yield f"data: {token}\n\n"
        except Exception as e:
            yield f"data: [ERROR] {str(e)}\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
# === Root ===
@app.get("/")
def read_root():
    return {"message": "LLM is ready. Send POST to /chat"}
# === POST endpoint to chat with the paper ===
@app.post("/chat")
async def chat_with_llm(request: Request):
    data = await request.json()
    user_query = data.get("query", "")
    return stream_chat_response(user_query)
# === Run the app (required for HF Spaces) ===
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)
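
# Example client (a minimal sketch, not part of the app): /chat streams
# Server-Sent Events, so a client can read the response line by line as the
# tokens arrive. The `requests` dependency and the localhost URL below are
# assumptions for local testing; on Spaces, substitute the Space's public URL.
#
#   import requests
#
#   with requests.post(
#       "http://localhost:7860/chat",
#       json={"query": "Summarize the attention mechanism in simple terms."},
#       stream=True,
#   ) as resp:
#       for line in resp.iter_lines(decode_unicode=True):
#           if line.startswith("data: "):
#               print(line[len("data: "):], end="", flush=True)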