from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Load the GGUF model once at startup so it is shared across requests
qwen3_gguf_llm = Llama.from_pretrained(
    repo_id="unsloth/Qwen3-0.6B-GGUF",
    filename="Qwen3-0.6B-UD-Q8_K_XL.gguf",
)

class PromptRequest(BaseModel):
    prompt: str

class GenerateResponse(BaseModel):
    generated_text: str

# Simple in-memory conversation memory (list of messages).
# Note: this list is global, so every client shares one conversation
# and the history is lost when the process restarts.
conversation_history = []

@app.post("/generate/qwen3-0.6b-gguf", response_model=GenerateResponse)
async def generate_qwen3_gguf_endpoint(request: PromptRequest):
    # Append the user message to the history
    conversation_history.append({"role": "user", "content": request.prompt})

    # Call the model with the full conversation history
    response = qwen3_gguf_llm.create_chat_completion(messages=conversation_history)

    # Extract the assistant's reply
    assistant_message = response["choices"][0]["message"]["content"]

    # Append the assistant reply so the next turn has context
    conversation_history.append({"role": "assistant", "content": assistant_message})

    return GenerateResponse(generated_text=assistant_message)
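To exercise the endpoint, here is a minimal client sketch. It assumes the app above is served locally on port 8000 (for example via uvicorn); the host, port, and module name are assumptions, while the route path and JSON schema come from the code above. Because the history lives server-side, the second request continues the first conversation.

# Hypothetical client sketch: assumes the app is running at
# http://localhost:8000 (e.g. `uvicorn main:app`; the module name
# "main" is illustrative, not prescribed by the snippet above).
import requests

url = "http://localhost:8000/generate/qwen3-0.6b-gguf"

# First turn
resp = requests.post(url, json={"prompt": "Hello! Who are you?"})
resp.raise_for_status()
print(resp.json()["generated_text"])

# Second turn: the server-side history carries the context forward
resp = requests.post(url, json={"prompt": "What did I just ask you?"})
resp.raise_for_status()
print(resp.json()["generated_text"])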