from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import os

app = FastAPI()

# Sanity checks: confirm the GGUF file is reachable from the working directory.
print("Current working directory:", os.getcwd())
print("Files in current directory:", os.listdir("."))
print("Model file exists:", os.path.isfile("Qwen3-0.6B-UD-IQ1_S.gguf"))

# Load the quantized Qwen3 model once at startup so every request reuses it.
qwen3_gguf_llm = Llama(model_path="Qwen3-0.6B-UD-IQ1_S.gguf")

class PromptRequest(BaseModel):
    prompt: str

class GenerateResponse(BaseModel):
    reasoning_content: str = ""
    generated_text: str

@app.post("/generate/qwen3-0.6b-gguf", response_model=GenerateResponse)
async def generate_qwen3_gguf_endpoint(request: PromptRequest):
    # Wrap the raw prompt in a chat-style message and run a chat completion.
    messages = [{"role": "user", "content": request.prompt}]
    response = qwen3_gguf_llm.create_chat_completion(messages=messages)
    generated_text = response['choices'][0]['message']['content']
    return GenerateResponse(generated_text=generated_text)
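# A minimal client sketch for exercising the endpoint above. It assumes the
# service was started with `uvicorn main:app --port 8000` (the module name
# "main" and the port are assumptions, not part of the original listing) and
# that the `requests` package is installed. The request body matches the
# PromptRequest schema defined above; the response matches GenerateResponse.
import requests

resp = requests.post(
    "http://localhost:8000/generate/qwen3-0.6b-gguf",
    json={"prompt": "Explain what a GGUF file is in one sentence."},
)
resp.raise_for_status()
print(resp.json()["generated_text"])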