Spaces:
Running
Running
File size: 1,099 Bytes
80c3a84 6c0215b d5939d1 d15392d f0ba669 e6978bd d86b208 f0ba669 ad67d60 80c3a84 e9f3a9a 80c3a84 a0b62ab e6978bd 80c3a84 e6978bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
# ASGI application object that the route decorators in this module attach to.
app = FastAPI()

# Model handle created once at import time and reused by every request, so
# the weights are not reloaded per call.
qwen3_gguf_llm = Llama.from_pretrained(
    repo_id="unsloth/Qwen3-0.6B-GGUF",
    filename="Qwen3-0.6B-UD-Q8_K_XL.gguf",
)
class PromptRequest(BaseModel):
    """Request body accepted by the generation endpoint."""

    # Text of the user's message for this turn.
    prompt: str
class GenerateResponse(BaseModel):
    """Response body returned by the generation endpoint."""

    # The assistant reply produced by the model for the submitted prompt.
    generated_text: str
# Process-wide chat transcript, held only in memory and shared by every
# request that hits the endpoint. Each entry is a dict with "role" and
# "content" keys, stored in the order the turns occurred.
conversation_history: list = []
@app.post("/generate/qwen3-0.6b-gguf", response_model=GenerateResponse)
async def generate_qwen3_gguf_endpoint(request: PromptRequest):
    """Generate the assistant's next reply for the shared conversation.

    The prompt is sent to the model together with every earlier turn held in
    ``conversation_history``. The history is updated only after the model has
    answered, so a failed generation leaves it exactly as it was instead of
    keeping an unanswered user message that later requests would replay.

    Args:
        request: Request body carrying the user's ``prompt``.

    Returns:
        A ``GenerateResponse`` whose ``generated_text`` is the model's reply.

    Raises:
        Any exception raised by ``create_chat_completion`` or by reading the
        reply out of its result; ``conversation_history`` is unchanged then.
    """
    user_turn = {"role": "user", "content": request.prompt}

    # Build the payload on a copy so the shared history is not mutated until
    # the model call has succeeded.
    messages = [*conversation_history, user_turn]

    # NOTE(review): this call is not awaited, so the event loop is blocked
    # for the duration of generation and requests are served one at a time.
    # Offloading it to a thread would need a lock around the shared model.
    response = qwen3_gguf_llm.create_chat_completion(messages=messages)
    assistant_message = response["choices"][0]["message"]["content"]

    # Commit the user turn and the reply together so the transcript always
    # alternates user/assistant.
    conversation_history.extend(
        (user_turn, {"role": "assistant", "content": assistant_message})
    )
    return GenerateResponse(generated_text=assistant_message)
|