Spaces:

Prince-2025
/

Qwen_API

Sleeping

App Files Files Community

Prince-2025 committed on Aug 1

Commit

3ec3a80

verified ·

1 Parent(s): 7b9595c

Upload 3 files

Browse files

Files changed (3) hide show

Dockerfile +34 -0
app.py +97 -0
requirements.txt +3 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,34 @@

+# Use the official slim Python 3.10 image
+FROM python:3.10-slim
+# Install system dependencies (compiler toolchain for building llama-cpp-python from source)
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    cmake \
+    libopenblas-dev \
+    git \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+# Set environment variables
+ENV PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    HF_HUB_DISABLE_SYMLINKS_WARNING=1
+# Create app directory
+WORKDIR /app
+# Copy only the dependency manifest first: the install layer below is then
+# reused from the build cache when just app.py changes, instead of rebuilding
+# llama-cpp-python on every code edit.
+COPY requirements.txt /app/requirements.txt
+# Install Python dependencies (including llama-cpp-python from PyPI — NOT broken wheel links)
+RUN pip install --upgrade pip && \
+    pip install llama-cpp-python==0.2.72 && \
+    pip install -r requirements.txt
+# Copy the application code last so it does not invalidate the dependency layer
+COPY app.py /app/app.py
+# Expose FastAPI port
+EXPOSE 7860
+# Start the FastAPI app
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import os
+import time
+import ctypes
+from fastapi import FastAPI, Request
+from fastapi.responses import StreamingResponse
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+import llama_cpp
+import uvicorn
+# === Suppress llama_cpp logs ===
+@ctypes.CFUNCTYPE(None, ctypes.c_int, ctypes.c_char_p, ctypes.c_void_p)
+def silent_log_callback(level, message, user_data):
+    pass
+llama_cpp.llama_log_set(silent_log_callback, None)
+# === Model configuration ===
+MODEL_REPO = "Qwen/Qwen1.5-1.8B-Chat-GGUF"
+MODEL_FILE = "qwen1_5-1_8b-chat-q4_k_m.gguf"
+CACHE_DIR = "/data/models"  # Use /data for Hugging Face Spaces
+# === Download model if not present ===
+model_path = hf_hub_download(
+    repo_id=MODEL_REPO,
+    filename=MODEL_FILE,
+    cache_dir=CACHE_DIR,
+    local_dir=CACHE_DIR,
+    local_dir_use_symlinks=False
+)
+# === Load LLM ===
+llm = Llama(
+    model_path=model_path,
+    n_ctx=4096,
+    n_batch=512,
+    n_threads=os.cpu_count(),
+    n_gpu_layers=0,            # CPU-only
+    use_mmap=True,
+    use_mlock=False,
+    low_cpu_mem_usage=True,
+    verbose=False
+)
+# === Initialize FastAPI ===
+app = FastAPI()
+# === Streaming generator ===
+def stream_chat_response(user_query: str):
+    messages = [
+        {
+            "role": "system",
+            "content": (
+                "You are a helpful AI assistant that reads research papers, "
+                "highlights the key points, and explains the paper in simple terms."
+            )
+        },
+        {
+            "role": "user",
+            "content": user_query
+        }
+    ]
+    def generate():
+        try:
+            for chunk in llm.create_chat_completion(
+                messages=messages,
+                temperature=0.2,
+                top_k=50,
+                top_p=0.9,
+                repeat_penalty=1.1,
+                max_tokens=512,
+                stream=True,
+            ):
+                if "choices" in chunk:
+                    token = chunk["choices"][0]["delta"].get("content", "")
+                    yield f"data: {token}\n\n"
+        except Exception as e:
+            yield f"data: [ERROR] {str(e)}\n\n"
+    return StreamingResponse(generate(), media_type="text/event-stream")
+# === Root ===
+@app.get("/")
+def read_root():
+    return {"message": "LLM is ready. Send POST to /chat"}
+# === POST endpoint to chat with paper ===
+@app.post("/chat")
+async def chat_with_llm(request: Request):
+    data = await request.json()
+    user_query = data.get("query", "")
+    return stream_chat_response(user_query)
+# === Run the app (required for HF Spaces) ===
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+fastapi
+uvicorn
+huggingface_hub