Update app.py
app.py CHANGED
@@ -253,7 +253,11 @@ def generate_qwen3_gguf(prompt: str) -> (str, str):
     messages = [
         {"role": "user", "content": prompt}
     ]
-    response = qwen3_gguf_llm.create_chat_completion(messages=messages)
+    # Set max_tokens or max_new_tokens to keep total tokens <= 512
+    response = qwen3_gguf_llm.create_chat_completion(
+        messages=messages,
+        max_tokens=512  # or smaller, adjust to fit your use case
+    )
     generated_text = response['choices'][0]['message']['content']
 
     if "</think>" in generated_text:
@@ -264,6 +268,7 @@ def generate_qwen3_gguf(prompt: str) -> (str, str):
 
 
 
+
 @app.post("/generate/{model_name}", response_model=GenerateResponse)
 async def generate(
     request: PromptRequest,