Spaces:

GIZ
/

audit_assistant

Running on T4

App Files Files Community

ppsingh committed on May 15

Commit

08db2a3

verified ·

1 Parent(s): cf374ac

trying streaming in new endpoint

Browse files

Files changed (1) hide show

app.py +48 -15

app.py CHANGED Viewed

@@ -308,28 +308,61 @@ async def chat(query,history, method, sources,reports,subtype, client_ip=None, s
         chat_model = inf_provider()
         start_time = time.time()
         async def process_stream():
-            nonlocal answer_yet # Use the outer scope's answer_yet variable
-            # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
-            # instead of modifying the one from the outer scope.
-            # Iterate over the streaming response chunks
-            response = chat_model.chat.completions.create(
-                                        model=model_config.get("reader","INF_PROVIDER_MODEL"),
-                                        messages = messages,
-                                        stream= True,
-                                        max_tokens=int(model_config.get('reader','MAX_TOKENS')),
-                                    )
-            for message in response:
-                token = message.choices[0].delta.content
-                if token:
-                    answer_yet += token
                     parsed_answer = parse_output_llm_with_sources(answer_yet)
                     history[-1] = (query, parsed_answer)
                     logs_data["answer"] = parsed_answer
                     yield [tuple(x) for x in history], docs_html, logs_data, session_id
-        # Stream the response updates
         async for update in process_stream():
             yield update
     elif model_config.get('reader','TYPE') == 'DEDICATED':

         chat_model = inf_provider()
         start_time = time.time()
         async def process_stream():
+            nonlocal answer_yet
+            try:
+                formatted_messages = [
+                    {
+                        "role": msg.type if hasattr(msg, 'type') else msg.role,
+                        "content": msg.content
+                    }
+                    for msg in messages
+                ]
+                response = chat_model.chat_completion(
+                    messages=formatted_messages,
+                    max_tokens=int(model_config.get('reader', 'MAX_TOKENS'))
+                )
+                response_text = response.choices[0].message.content
+                words = response_text.split()
+                for word in words:
+                    answer_yet += word + " "
                     parsed_answer = parse_output_llm_with_sources(answer_yet)
                     history[-1] = (query, parsed_answer)
+                    # Update logs_data with current answer (and get a new timestamp)
                     logs_data["answer"] = parsed_answer
                     yield [tuple(x) for x in history], docs_html, logs_data, session_id
+                    await asyncio.sleep(0.05)
+            except Exception as e:
+                raise
         async for update in process_stream():
             yield update
+#        async def process_stream():
+#            nonlocal answer_yet # Use the outer scope's answer_yet variable
+#            # Without nonlocal, Python would create a new local variable answer_yet inside process_stream(),
+#            # instead of modifying the one from the outer scope.
+#            # Iterate over the streaming response chunks
+#            response = chat_model.chat.completions.create(
+#                                        model=model_config.get("reader","INF_PROVIDER_MODEL"),
+#                                        messages = messages,
+#                                        stream= True,
+#                                        max_tokens=int(model_config.get('reader','MAX_TOKENS')),
+#                                    )
+#            for message in response:
+#                token = message.choices[0].delta.content
+#                if token:
+#                    answer_yet += token
+#                    parsed_answer = parse_output_llm_with_sources(answer_yet)
+#                    history[-1] = (query, parsed_answer)
+#                    logs_data["answer"] = parsed_answer
+#                    yield [tuple(x) for x in history], docs_html, logs_data, session_id
+#
+#        # Stream the response updates
+#        async for update in process_stream():
+#            yield update
     elif model_config.get('reader','TYPE') == 'DEDICATED':