import logging
from fastapi import FastAPI
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.INFO
)
logger = logging.getLogger(__name__)
logger.info("Запускаемся... 🥳🥳🥳")
app = FastAPI()
model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
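# The HF tokenizer is used only to render prompts via its chat template;
# it should match the chat format of the GGUF model loaded below.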
def messages_to_prompt(messages):
    # Convert llama_index ChatMessage objects into plain dicts and render
    # them with the tokenizer's chat template.
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
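# For a single user message, the rendered prompt should look roughly like the
# ChatML format below (assuming Qwen2.5's template; the tokenizer may also
# inject a default system message):
#   <|im_start|>user
#   Hello<|im_end|>
#   <|im_start|>assistant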
SYSTEM_PROMPT = ""

def completion_to_prompt(completion):
    # Wrap a bare completion string as a user message, prepending the
    # system prompt (settable via PUT /system-prompt) when one is set.
    messages = []
    if SYSTEM_PROMPT:
        messages.append({"role": "system", "content": SYSTEM_PROMPT})
    messages.append({"role": "user", "content": completion})
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
llm = LlamaCPP(
    # you can pass a URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=64,
    # Qwen2.5 supports a much larger context window; we keep it small here
    # to save memory
    context_window=1638,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); n_gpu_layers=-1 offloads all layers
    # to the GPU (set to 0 for CPU-only)
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Qwen chat format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
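# To serve the app locally (assuming this file is saved as app.py; port 7860
# is the Hugging Face Spaces convention):
#   uvicorn app:app --host 0.0.0.0 --port 7860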
@app.get("/")
def greet_json():
return {"Hello": "World!"}
@app.put("/system-prompt")
async def set_system_prompt(text: str):
logger.info('post/system-prompt')
# global SYSTEM_PROMPT
# SYSTEM_PROMPT = text
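# Example (text is passed as a query parameter, as declared above;
# hypothetical local URL):
#   curl -X PUT "http://localhost:7860/system-prompt?text=You%20are%20helpful"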
@app.post("/predict")
async def predict(text: str):
# Генерация ответа с помощью модели
logger.info('post/predict')
response = llm.complete(text)
return {"response": response}