Aleksandr Maiorov committed on
Commit faf7233 · 1 Parent(s): fa63c41
Files changed (2)
  1. Dockerfile +14 -0
  2. app.py +72 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./app/requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+ RUN pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
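
Note: the requirements.txt referenced by the Dockerfile is not part of this commit. A minimal sketch consistent with the imports in app.py might look like the listing below (package names are assumptions; llama-cpp-python itself is installed by the separate RUN step with the CPU wheel index):

    fastapi
    uvicorn[standard]
    llama-index-llms-llama-cpp
    transformers
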
app.py ADDED
@@ -0,0 +1,72 @@
+ import logging
+
+ from fastapi import FastAPI
+ from llama_index.llms.llama_cpp import LlamaCPP
+ from transformers import AutoTokenizer
+
+ logging.basicConfig(
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     level=logging.INFO
+ )
+ logger = logging.getLogger(__name__)
+
+ logger.info("Starting up... 🥳🥳🥳")
+
+ app = FastAPI()
+ model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"
+
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
+
+
+ def messages_to_prompt(messages):
+     # Convert llama_index ChatMessage objects into the Qwen chat template
+     messages = [{"role": m.role.value, "content": m.content} for m in messages]
+     prompt = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     return prompt
+
+
+ def completion_to_prompt(completion):
+     # Wrap a plain completion string as a single user turn
+     messages = [{"role": "user", "content": completion}]
+     prompt = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     return prompt
+
+
+ llm = LlamaCPP(
+     # You can pass in the URL to a GGUF model to download it automatically
+     model_url=model_url,
+     # optionally, you can set the path to a pre-downloaded model instead of model_url
+     model_path=None,
+     temperature=0.1,
+     max_new_tokens=256,
+     # Qwen2.5 supports a long context; the window is set to 16384 tokens here
+     context_window=16384,
+     # kwargs to pass to __call__()
+     generate_kwargs={},
+     # kwargs to pass to __init__()
+     # n_gpu_layers=-1 offloads all layers to the GPU if one is available
+     model_kwargs={"n_gpu_layers": -1},
+     # transform inputs into the Qwen chat format
+     messages_to_prompt=messages_to_prompt,
+     completion_to_prompt=completion_to_prompt,
+     verbose=True,
+ )
+
+ @app.get("/")
+ def greet_json():
+     return {"Hello": "World!"}
+
+ @app.put("/system-prompt")
+ async def set_system_prompt(text: str):
+     logger.info('put/system-prompt')
+     # Stub: the system prompt is not stored yet
+     # global SYSTEM_PROMPT
+     # SYSTEM_PROMPT = text
+
+ @app.post("/predict")
+ async def predict(text: str):
+     # Generate a response with the model
+     logger.info('post/predict')
+     response = llm.complete(text)
+     return {"response": response.text}
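
For reference, a minimal client sketch for the /predict endpoint, assuming the container is reachable on port 7860 (as set in the Dockerfile's CMD) and that the `requests` package is available on the client side. Because `text` is declared as a plain str parameter, FastAPI expects it as a query parameter:

    # Hypothetical client, not part of this commit
    import requests

    BASE_URL = "http://localhost:7860"  # replace with the deployed Space URL

    resp = requests.post(f"{BASE_URL}/predict", params={"text": "Hello, who are you?"})
    print(resp.json()["response"])
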