import logging

from fastapi import FastAPI
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer

logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO
)
logger = logging.getLogger(__name__)

logger.info("Starting up... 🥳🥳🥳")

app = FastAPI()

model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")


def messages_to_prompt(messages):
    # Convert LlamaIndex ChatMessage objects into the dict format expected by the tokenizer
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt


def completion_to_prompt(completion):
    # Wrap a plain completion string as a single user message
    messages = [{"role": "user", "content": completion}]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt


llm = LlamaCPP(
    # You can pass in the URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, you can set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=64,
    # Qwen2.5 supports a much larger context window, but we set it lower to allow for some wiggle room
    context_window=1638,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU (-1 offloads all layers)
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Qwen chat format via the tokenizer's chat template
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)


@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.put("/system-prompt")
async def set_system_prompt(text: str):
    logger.info('put/system-prompt')
    # global SYSTEM_PROMPT
    # SYSTEM_PROMPT = text


@app.post("/predict")
async def predict(text: str):
    # Generate a response with the model
    logger.info('post/predict')
    response = llm.complete(text)
    return {"response": response.text}
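A quick way to sanity-check the service is a small client script. The sketch below is an assumption rather than part of the original code: it presumes the app is launched with uvicorn on localhost:8000 and uses the requests library. Note that because predict declares a bare text: str parameter, FastAPI expects it as a query parameter, not in the request body.

import requests  # hypothetical client-side check, not part of the server above

# Assumes the server was started with something like: uvicorn main:app --host 0.0.0.0 --port 8000
BASE_URL = "http://localhost:8000"

# The bare `text: str` parameter is treated as a query parameter by FastAPI,
# so it is passed via `params` rather than a JSON body.
resp = requests.post(f"{BASE_URL}/predict", params={"text": "Hello! Who are you?"})
resp.raise_for_status()
print(resp.json()["response"])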