import logging

from fastapi import FastAPI
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)

logger.info("Запускаемся... 🥳🥳🥳")

app = FastAPI()
model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"

# The Hugging Face tokenizer is loaded only for its chat template, to build
# Qwen-formatted prompts; llama.cpp handles the actual tokenization from the GGUF file.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

# System prompt prepended to /predict requests; updated via PUT /system-prompt.
SYSTEM_PROMPT = ""


def messages_to_prompt(messages):
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt


def completion_to_prompt(completion):
    # Prepend the current system prompt (if one has been set) before the user message.
    messages = []
    if SYSTEM_PROMPT:
        messages.append({"role": "system", "content": SYSTEM_PROMPT})
    messages.append({"role": "user", "content": completion})
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
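
# For reference: with Qwen2.5, apply_chat_template produces a ChatML-style prompt,
# roughly "<|im_start|>user\n...<|im_end|>\n<|im_start|>assistant\n".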


llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=64,
    # Qwen2.5 supports a much larger context window, but we keep it small to save memory
    context_window=1638,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # -1 offloads all layers to the GPU (requires a GPU-enabled llama-cpp-python build)
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Qwen chat format via the tokenizer's chat template
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
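# Note: with model_path=None, the GGUF file is downloaded from model_url on first
# startup and cached locally, so the initial launch can take a while.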

@app.get("/")
def greet_json():
    return {"Hello": "World!"}

@app.put("/system-prompt")
async def set_system_prompt(text: str):
    logger.info('put/system-prompt')
    global SYSTEM_PROMPT
    SYSTEM_PROMPT = text
    return {"system_prompt": SYSTEM_PROMPT}

@app.post("/predict")
async def predict(text: str):
    # Generate a response with the model
    logger.info('post/predict')
    response = llm.complete(text)
    return {"response": response.text}