# Hugging Face Spaces page banner (status: Running) -- not part of the program.
import asyncio
import threading

from fastapi import FastAPI
from llama_cpp import Llama
from pydantic import BaseModel
# FastAPI application instance for this module.
app = FastAPI()

# Model handle created once at import time and shared by every request.
# `Llama.from_pretrained` resolves the GGUF file from the named repository
# (see the llama-cpp-python API reference), so importing this module performs
# the model load before any request can be handled.
qwen3_gguf_llm = Llama.from_pretrained(
    repo_id="unsloth/Qwen3-0.6B-GGUF",
    filename="Qwen3-0.6B-BF16.gguf",
)
# Request body schema: a single required string field carrying the user's
# prompt. (Documented with comments rather than a docstring so the generated
# schema description is left untouched.)
class PromptRequest(BaseModel):
    # Text forwarded to the model as the content of the user message.
    prompt: str
# Response body schema returned by the generation endpoint. (Documented with
# comments rather than a docstring so the generated schema description is
# left untouched.)
class GenerateResponse(BaseModel):
    # Optional; falls back to the empty string when not supplied.
    # NOTE(review): presumably the model's reasoning/"thinking" text -- confirm.
    reasoning_content: str = ""
    # Required; the text produced by the model.
    generated_text: str
# Upper bound on tokens generated per request.
_MAX_NEW_TOKENS = 256

# Markers that delimit the reasoning section inside the raw model output.
_THINK_OPEN = "<think>"
_THINK_CLOSE = "</think>"

# One model instance is shared by all requests. Inference now runs in worker
# threads, so calls are serialized explicitly to keep the one-request-at-a-time
# behaviour the original had implicitly (it blocked the event loop).
_LLM_LOCK = threading.Lock()


def _split_reasoning(text: str) -> tuple:
    """Split raw model output into ``(reasoning, answer)``.

    Text without a ``<think>`` marker is returned unchanged as the answer with
    empty reasoning. When the closing marker is missing (for example because
    generation stopped at the token limit), everything after the opening
    marker is treated as reasoning.
    """
    before, opener, after = text.partition(_THINK_OPEN)
    if not opener:
        return "", text
    reasoning, closer, tail = after.partition(_THINK_CLOSE)
    if not closer:
        return reasoning.strip(), before.strip()
    return reasoning.strip(), (before + tail).strip()


def _run_chat_completion(prompt: str) -> str:
    """Run one blocking chat completion and return the message content."""
    messages = [{"role": "user", "content": prompt}]
    with _LLM_LOCK:
        response = qwen3_gguf_llm.create_chat_completion(
            messages=messages,
            max_tokens=_MAX_NEW_TOKENS,
        )
    # Guard against a missing/None content so response validation cannot fail.
    return response["choices"][0]["message"]["content"] or ""


# NOTE(review): no route decorator (e.g. ``@app.post(...)``) is visible on this
# handler in the reviewed source -- confirm it is registered on ``app``.
async def generate_qwen3_gguf_endpoint(request: PromptRequest):
    """Generate a reply to ``request.prompt`` with the shared Qwen3 model.

    The blocking inference call is pushed to the default executor so the event
    loop stays responsive while the model generates. Any ``<think>`` section
    in the output is moved into ``reasoning_content``; the remaining text
    becomes ``generated_text``.

    Args:
        request: Request body holding the user's prompt.

    Returns:
        GenerateResponse: the reasoning text (possibly empty) and the answer.
    """
    loop = asyncio.get_running_loop()
    raw_text = await loop.run_in_executor(None, _run_chat_completion, request.prompt)
    reasoning, answer = _split_reasoning(raw_text)
    return GenerateResponse(reasoning_content=reasoning, generated_text=answer)