import logging
from fastapi import FastAPI
from llama_index.llms.llama_cpp import LlamaCPP
from transformers import AutoTokenizer
logging.basicConfig(
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
level=logging.INFO
)
logger = logging.getLogger(__name__)
logger.info("Запускаемся... 🥳🥳🥳")
app = FastAPI()
model_url = "https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q3_k_m.gguf"
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
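# The HF tokenizer is used only to render prompts via its chat template;
# it should match the chat format of the GGUF model loaded below.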
def messages_to_prompt(messages):
    # Convert llama_index ChatMessage objects into plain dicts and render
    # them with the tokenizer's chat template.
    messages = [{"role": m.role.value, "content": m.content} for m in messages]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
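# For a single user message, the rendered prompt should look roughly like the
# ChatML format below (assuming Qwen2.5's template; the tokenizer may also
# inject a default system message):
#   <|im_start|>user
#   Hello<|im_end|>
#   <|im_start|>assistant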
SYSTEM_PROMPT = ""

def completion_to_prompt(completion):
    # Wrap a bare completion string as a user message, prepending the
    # system prompt (settable via PUT /system-prompt) when one is set.
    messages = []
    if SYSTEM_PROMPT:
        messages.append({"role": "system", "content": SYSTEM_PROMPT})
    messages.append({"role": "user", "content": completion})
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return prompt
llm = LlamaCPP(
    # you can pass a URL to a GGUF model to download it automatically
    model_url=model_url,
    # optionally, set the path to a pre-downloaded model instead of model_url
    model_path=None,
    temperature=0.1,
    max_new_tokens=64,
    # Qwen2.5 supports a much larger context window; we keep it small here
    # to save memory
    context_window=1638,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__(); n_gpu_layers=-1 offloads all layers
    # to the GPU (set to 0 for CPU-only)
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into the Qwen chat format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)
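# To serve the app locally (assuming this file is saved as app.py; port 7860
# is the Hugging Face Spaces convention):
#   uvicorn app:app --host 0.0.0.0 --port 7860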
@app.get("/")
def greet_json():
return {"Hello": "World!"}
@app.put("/system-prompt")
async def set_system_prompt(text: str):
logger.info('post/system-prompt')
# global SYSTEM_PROMPT
# SYSTEM_PROMPT = text
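# Example (text is passed as a query parameter, as declared above;
# hypothetical local URL):
#   curl -X PUT "http://localhost:7860/system-prompt?text=You%20are%20helpful"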
@app.post("/predict")
async def predict(text: str):
# Генерация ответа с помощью модели
logger.info('post/predict')
response = llm.complete(text)
return {"response": response}