Spaces:
Runtime error
Runtime error
File size: 2,167 Bytes
faf7233 c2f54d4 faf7233 9f503a4 faf7233 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import logging
from fastapi import FastAPI
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.INFO
)
logger = logging.getLogger(__name__)
logger.info("Запускаемся... 🥳🥳🥳")
app = FastAPI()
model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
def messages_to_prompt(messages):
messages = [{"role": m.role.value, "content": m.content} for m in messages]
prompt = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return prompt
def completion_to_prompt(completion):
messages = [{"role": "user", "content": completion}]
prompt = tokenizer.apply_chat_template(
messages, tokenize=False, add_generation_prompt=True
)
return prompt
llm = LlamaCPP(
# You can pass in the URL to a GGML model to download it automatically
model_url=model_url,
# optionally, you can set the path to a pre-downloaded model instead of model_url
model_path=None,
temperature=0.1,
max_new_tokens=64,
# llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
context_window=1638,
# kwargs to pass to __call__()
generate_kwargs={},
# kwargs to pass to __init__()
# set to at least 1 to use GPU
model_kwargs={"n_gpu_layers": -1},
# transform inputs into Llama2 format
messages_to_prompt=messages_to_prompt,
completion_to_prompt=completion_to_prompt,
verbose=True,
)
@app.get("/")
def greet_json():
return {"Hello": "World!"}
@app.put("/system-prompt")
async def set_system_prompt(text: str):
logger.info('post/system-prompt')
# global SYSTEM_PROMPT
# SYSTEM_PROMPT = text
@app.post("/predict")
async def predict(text: str):
# Генерация ответа с помощью модели
logger.info('post/predict')
response = llm.complete(text)
return {"response": response} |