File size: 1,099 Bytes
80c3a84
6c0215b
d5939d1
d15392d
 
 
f0ba669
e6978bd
d86b208
f0ba669
ad67d60
80c3a84
 
e9f3a9a
80c3a84
 
a0b62ab
e6978bd
 
 
80c3a84
 
e6978bd
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import asyncio

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama

app = FastAPI()

# Load the quantized Qwen3 0.6B GGUF model at import time.
# NOTE(review): from_pretrained fetches the file from the Hugging Face Hub
# on first run — presumably cached locally afterwards; verify cache behavior.
# Loading at module level means server startup blocks until the model is ready.
qwen3_gguf_llm = Llama.from_pretrained(
    repo_id="unsloth/Qwen3-0.6B-GGUF",
    filename="Qwen3-0.6B-UD-Q8_K_XL.gguf"
)

class PromptRequest(BaseModel):
    """Request body for the generate endpoint: a single user prompt string."""
    prompt: str

class GenerateResponse(BaseModel):
    """Response body for the generate endpoint: the model's reply text."""
    generated_text: str

# Simple in-memory conversation memory (list of chat messages in
# {"role": ..., "content": ...} form, as consumed by create_chat_completion).
# NOTE(review): this is a single module-level list — it is shared by ALL
# clients of the server (no per-session isolation), grows without bound,
# and is lost on restart. Confirm whether per-user sessions are needed.
conversation_history = []

@app.post("/generate/qwen3-0.6b-gguf", response_model=GenerateResponse)
async def generate_qwen3_gguf_endpoint(request: PromptRequest) -> GenerateResponse:
    """Generate an assistant reply for *request.prompt*.

    The prompt is appended to the shared in-memory conversation history,
    the model is called with the full history, and the assistant's reply
    is both recorded in the history and returned to the client.
    """
    # Record the user's turn first so the model sees it in the history.
    conversation_history.append({"role": "user", "content": request.prompt})

    # llama-cpp inference is CPU-bound and blocking. Calling it directly in
    # an `async def` endpoint would freeze the event loop (and every other
    # request) for the whole generation — run it in a worker thread instead.
    response = await asyncio.to_thread(
        qwen3_gguf_llm.create_chat_completion,
        messages=conversation_history,
    )

    # Extract the assistant reply from the OpenAI-style completion payload.
    assistant_message = response['choices'][0]['message']['content']

    # Record the assistant's turn so the next request has full context.
    # NOTE(review): history is a shared global mutated across await points;
    # concurrent requests may interleave turns — confirm if that is acceptable.
    conversation_history.append({"role": "assistant", "content": assistant_message})

    return GenerateResponse(generated_text=assistant_message)