import logging

from fastapi import FastAPI
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)
logger.info("Запускаемся... 🥳🥳🥳") | |

app = FastAPI()

model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"

# The tokenizer is only used to render prompts with the Qwen chat template
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")


def messages_to_prompt(messages):
    # Convert llama_index ChatMessage objects into the plain dict format
    # expected by the tokenizer's chat template
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt


def completion_to_prompt(completion):
    # Wrap a bare completion string as a single user message
    messages = [{"role": "user", "content": completion}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
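
# For reference: with add_generation_prompt=True, the Qwen2.5-Instruct chat
# template renders a single user message roughly like this (a sketch, not
# guaranteed to be byte-exact for every tokenizer version):
#
#   <|im_start|>system
#   You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
#   <|im_start|>user
#   {content}<|im_end|>
#   <|im_start|>assistant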


llm = LlamaCPP(
    # You can pass in the URL of a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=64,
    # keep the context window well below the model's maximum to leave headroom
    # for the generated tokens
    context_window=1638,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # n_gpu_layers=-1 offloads all layers to the GPU; use 0 for CPU-only
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Qwen chat format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
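
# Optional sanity check after the model loads (kept commented out so it does not
# run at import time; llm.complete() returns a CompletionResponse whose .text
# attribute holds the generated string):
#   logger.info(llm.complete("Say hello in one short sentence.").text)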


@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.post("/system-prompt")
async def set_system_prompt(text: str):
    logger.info('post/system-prompt')
    # global SYSTEM_PROMPT
    # SYSTEM_PROMPT = text


@app.post("/predict")
async def predict(text: str):
    # Generate a response with the model
    logger.info('post/predict')
    response = llm.complete(text)
    return {"response": response}