Spaces:
Running
on
T4
Running
on
T4
trying streaming in new endpoint
Browse files
app.py
CHANGED
|
@@ -308,28 +308,61 @@ async def chat(query,history, method, sources,reports,subtype, client_ip=None, s
|
|
| 308 |
chat_model = inf_provider()
|
| 309 |
start_time = time.time()
|
| 310 |
async def process_stream():
|
| 311 |
-
nonlocal answer_yet
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
parsed_answer = parse_output_llm_with_sources(answer_yet)
|
| 326 |
history[-1] = (query, parsed_answer)
|
|
|
|
| 327 |
logs_data["answer"] = parsed_answer
|
| 328 |
yield [tuple(x) for x in history], docs_html, logs_data, session_id
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
|
| 330 |
-
# Stream the response updates
|
| 331 |
async for update in process_stream():
|
| 332 |
yield update
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
|
| 335 |
elif model_config.get('reader','TYPE') == 'DEDICATED':
|
|
|
|
| 308 |
chat_model = inf_provider()
|
| 309 |
start_time = time.time()
|
| 310 |
async def process_stream():
|
| 311 |
+
nonlocal answer_yet
|
| 312 |
+
try:
|
| 313 |
+
formatted_messages = [
|
| 314 |
+
{
|
| 315 |
+
"role": msg.type if hasattr(msg, 'type') else msg.role,
|
| 316 |
+
"content": msg.content
|
| 317 |
+
}
|
| 318 |
+
for msg in messages
|
| 319 |
+
]
|
| 320 |
+
|
| 321 |
+
response = chat_model.chat_completion(
|
| 322 |
+
messages=formatted_messages,
|
| 323 |
+
max_tokens=int(model_config.get('reader', 'MAX_TOKENS'))
|
| 324 |
+
)
|
| 325 |
+
|
| 326 |
+
response_text = response.choices[0].message.content
|
| 327 |
+
words = response_text.split()
|
| 328 |
+
for word in words:
|
| 329 |
+
answer_yet += word + " "
|
| 330 |
parsed_answer = parse_output_llm_with_sources(answer_yet)
|
| 331 |
history[-1] = (query, parsed_answer)
|
| 332 |
+
# Update logs_data with current answer (and get a new timestamp)
|
| 333 |
logs_data["answer"] = parsed_answer
|
| 334 |
yield [tuple(x) for x in history], docs_html, logs_data, session_id
|
| 335 |
+
await asyncio.sleep(0.05)
|
| 336 |
+
|
| 337 |
+
except Exception as e:
|
| 338 |
+
raise
|
| 339 |
|
|
|
|
| 340 |
async for update in process_stream():
|
| 341 |
yield update
|
| 342 |
+
|
| 343 |
+
# async def process_stream():
|
| 344 |
+
# nonlocal answer_yet # Use the outer scope's answer_yet variable
|
| 345 |
+
# # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
|
| 346 |
+
# # instead of modifying the one from the outer scope.
|
| 347 |
+
# # Iterate over the streaming response chunks
|
| 348 |
+
# response = chat_model.chat.completions.create(
|
| 349 |
+
# model=model_config.get("reader","INF_PROVIDER_MODEL"),
|
| 350 |
+
# messages = messages,
|
| 351 |
+
# stream= True,
|
| 352 |
+
# max_tokens=int(model_config.get('reader','MAX_TOKENS')),
|
| 353 |
+
# )
|
| 354 |
+
# for message in response:
|
| 355 |
+
# token = message.choices[0].delta.content
|
| 356 |
+
# if token:
|
| 357 |
+
# answer_yet += token
|
| 358 |
+
# parsed_answer = parse_output_llm_with_sources(answer_yet)
|
| 359 |
+
# history[-1] = (query, parsed_answer)
|
| 360 |
+
# logs_data["answer"] = parsed_answer
|
| 361 |
+
# yield [tuple(x) for x in history], docs_html, logs_data, session_id
|
| 362 |
+
#
|
| 363 |
+
# # Stream the response updates
|
| 364 |
+
# async for update in process_stream():
|
| 365 |
+
# yield update
|
| 366 |
|
| 367 |
|
| 368 |
elif model_config.get('reader','TYPE') == 'DEDICATED':
|