Aleksandr Maiorov committed on
Commit faf7233 · 1 Parent(s): fa63c41
Files changed (2)
  1. Dockerfile +14 -0
  2. app.py +72 -0
Dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.9
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV PATH="/home/user/.local/bin:$PATH"
+
+ WORKDIR /app
+
+ COPY --chown=user ./app/requirements.txt requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
+ RUN pip install llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+
+ COPY --chown=user . /app
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
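
Note: the requirements.txt referenced by the Dockerfile is not part of this commit. A minimal sketch consistent with the imports in app.py might look like the listing below (package names are assumptions; llama-cpp-python itself is installed by the separate RUN step with the CPU wheel index):

    fastapi
    uvicorn[standard]
    llama-index-llms-llama-cpp
    transformers
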
app.py ADDED
@@ -0,0 +1,72 @@
+ import logging
+
+ from fastapi import FastAPI
+ from llama_index.llms.llama_cpp import LlamaCPP
+ from transformers import AutoTokenizer
+
+ logging.basicConfig(
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+     level=logging.INFO
+ )
+ logger = logging.getLogger(__name__)
+
+ logger.info("Starting up... 🥳🥳🥳")
+
+ app = FastAPI()
+ model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"
+
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
+
+
+ def messages_to_prompt(messages):
+     # Convert llama_index ChatMessage objects into the Qwen chat template
+     messages = [{"role": m.role.value, "content": m.content} for m in messages]
+     prompt = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     return prompt
+
+
+ def completion_to_prompt(completion):
+     # Wrap a plain completion string as a single user turn
+     messages = [{"role": "user", "content": completion}]
+     prompt = tokenizer.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     return prompt
+
+
+ llm = LlamaCPP(
+     # You can pass in the URL to a GGUF model to download it automatically
+     model_url=model_url,
+     # optionally, you can set the path to a pre-downloaded model instead of model_url
+     model_path=None,
+     temperature=0.1,
+     max_new_tokens=256,
+     # Qwen2.5 supports a long context; the window is set to 16384 tokens here
+     context_window=16384,
+     # kwargs to pass to __call__()
+     generate_kwargs={},
+     # kwargs to pass to __init__()
+     # n_gpu_layers=-1 offloads all layers to the GPU if one is available
+     model_kwargs={"n_gpu_layers": -1},
+     # transform inputs into the Qwen chat format
+     messages_to_prompt=messages_to_prompt,
+     completion_to_prompt=completion_to_prompt,
+     verbose=True,
+ )
+
+ @app.get("/")
+ def greet_json():
+     return {"Hello": "World!"}
+
+ @app.put("/system-prompt")
+ async def set_system_prompt(text: str):
+     logger.info('put/system-prompt')
+     # Stub: the system prompt is not stored yet
+     # global SYSTEM_PROMPT
+     # SYSTEM_PROMPT = text
+
+ @app.post("/predict")
+ async def predict(text: str):
+     # Generate a response with the model
+     logger.info('post/predict')
+     response = llm.complete(text)
+     return {"response": response.text}
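
For reference, a minimal client sketch for the /predict endpoint, assuming the container is reachable on port 7860 (as set in the Dockerfile's CMD) and that the `requests` package is available on the client side. Because `text` is declared as a plain str parameter, FastAPI expects it as a query parameter:

    # Hypothetical client, not part of this commit
    import requests

    BASE_URL = "http://localhost:7860"  # replace with the deployed Space URL

    resp = requests.post(f"{BASE_URL}/predict", params={"text": "Hello, who are you?"})
    print(resp.json()["response"])
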