File size: 16,763 Bytes
924e633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1c0a32
924e633
f1c0a32
924e633
 
 
 
f1c0a32
 
924e633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1c0a32
 
 
924e633
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1c0a32
 
 
924e633
 
f1c0a32
 
 
 
 
 
 
 
 
 
924e633
 
 
 
 
 
 
 
 
f1c0a32
924e633
 
 
 
 
 
 
f1c0a32
 
 
924e633
 
 
 
 
 
 
 
 
 
 
 
 
 
f1c0a32
924e633
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
import gradio as gr
import asyncio
import logging
import tempfile
import json
import re
import requests
from typing import Optional, Dict, Any, List

from services.audio_service import AudioService
from services.llm_service import LLMService
from services.screen_service import ScreenService
from config.settings import Settings
from config.prompts import get_generic_prompt, get_vision_prompt

# Configure root logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MCPRestClient:
    def __init__(self, base_url: str = "http://localhost:8000"):
        self.base_url = base_url.rstrip('/')

    async def initialize(self) -> bool:
        """Test connection to MCP server"""
        mcp_available = False
        try:
            response = requests.get(f"{self.base_url}/", timeout=5)
            if response.status_code == 200:
                logger.info("Successfully connected to MCP server")
                mcp_available = True
                return mcp_available
            else:
                raise ConnectionError(f"MCP server returned status {response.status_code}")
        except Exception as e:
            logger.error(f"Failed to connect to MCP server at {self.base_url}: {e}")
            logger.info("IRIS did not detect any MCP server. If you're running this in a HuggingFace space, please referr to the readme.md documentation.")

    async def get_available_tools(self) -> Dict[str, Dict]:
        """Get list of available tools from MCP server"""
        try:
            response = requests.get(f"{self.base_url}/tools", timeout=5)
            if response.status_code == 200:
                data = response.json()
                tools = {}
                for tool in data.get("tools", []):
                    tools[tool["name"]] = {
                        "description": tool.get("description", ""),
                        "inputSchema": tool.get("inputSchema", {})
                    }
                return tools
            else:
                logger.error(f"Failed to get tools: HTTP {response.status_code}")
                return {}
        except Exception as e:
            logger.error(f"Failed to get tools: {e}")
            return {}

    async def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Any:
        """Call a tool on the MCP server"""
        try:
            payload = {
                "name": tool_name,
                "arguments": arguments
            }
            
            response = requests.post(
                f"{self.base_url}/tools/call",
                json=payload,
                timeout=30
            )
            
            if response.status_code == 200:
                data = response.json()
                if data.get("success"):
                    return data.get("result")
                else:
                    return {"error": data.get("error", "Unknown error")}
            else:
                return {"error": f"HTTP {response.status_code}: {response.text}"}
                    
        except Exception as e:
            return {"error": str(e)}

    async def close(self):
        """Nothing to close with requests"""
        pass

class AgenticChatbot:
    def __init__(self):
        self.settings = Settings()

        # AudioService
        audio_api_key = (
            self.settings.hf_token
            if self.settings.effective_audio_provider == "huggingface"
            else self.settings.openai_api_key
        )
        self.audio_service = AudioService(
            api_key=audio_api_key,
            stt_provider="fal-ai",
            stt_model=self.settings.stt_model,
            tts_model=self.settings.tts_model,
        )

        # LLMService
        self.llm_service = LLMService(
            api_key=self.settings.llm_api_key,
            model_name=self.settings.effective_model_name,
        )

        # MCPService - Now using REST client
        mcp_server_url = getattr(self.settings, 'mcp_server_url', 'http://localhost:8000')
        self.mcp_service = MCPRestClient(mcp_server_url)

        # ScreenService
        self.screen_service = ScreenService(
            prompt=get_vision_prompt(),
            model=self.settings.NEBIUS_MODEL,
            fps=0.05,
            queue_size=2,
            monitor=1,
            compression_quality=self.settings.screen_compression_quality,
            max_width=self.settings.max_width,
            max_height=self.settings.max_height,
        )
        self.latest_screen_context: str = ""
        self.conversation_history: List[Dict[str, Any]] = []

    async def initialize(self):
        try:
            mcp_available = await self.mcp_service.initialize()
            if not mcp_available:
                return False
            tools = await self.mcp_service.get_available_tools()
            logger.info(f"Initialized with {len(tools)} MCP tools")
        except Exception as e:
            logger.error(f"MCP init failed: {e}")

    # Screen callbacks
    def _on_screen_result(self, resp: dict, latency: float, frame_b64: str):
        try:
            content = resp.choices[0].message.content
        except Exception:
            content = str(resp)
        self.latest_screen_context = content
        logger.info(f"[Screen] {latency*1000:.0f}ms β†’ {content}")

    def _get_conversation_history(self) -> List[Dict[str, str]]:
        """Return the current conversation history for the screen service"""
        return self.conversation_history.copy()

    def start_screen_sharing(self) -> str:
        self.latest_screen_context = ""
        # Pass the history getter method to screen service
        self.screen_service.start(
            self._on_screen_result,
            history_getter=self._get_conversation_history  # Use the method reference
        )
        return "βœ… Screen sharing started."

    async def stop_screen_sharing(
        self,
        history: Optional[List[Dict[str, str]]]
    ) -> (List[Dict[str, str]], str, Optional[str]):
        """Stop screen sharing and append an LLM-generated summary to the chat."""
        # Stop capture
        self.screen_service.stop()
        
        # Get the latest vision context
        vision_ctx = self.latest_screen_context
        
        if vision_ctx and history is not None:
            # Call process_message with the vision context as user input
            updated_history, audio_path = await self.process_message(
                text_input=f"VISION MODEL OUTPUT: {vision_ctx}",
                audio_input=None,
                history=history
            )
            return updated_history, "πŸ›‘ Screen sharing stopped.", audio_path
        
        # If no vision context or history, just return
        return history or [], "πŸ›‘ Screen sharing stopped.", None

    async def execute_tool_calls(self, response_text: str) -> str:
        """Parse and execute function calls from LLM response using robust regex parsing"""
        
        # Clean the response text - remove code blocks and extra formatting
        cleaned_text = re.sub(r'```[a-zA-Z]*\n?', '', response_text)  # Remove code block markers
        cleaned_text = re.sub(r'\n```', '', cleaned_text)  # Remove closing code blocks
        
        # Pattern for function calls: function_name(arg1="value1", arg2=value2, arg3=true)
        function_pattern = r'(\w+)\s*\(\s*([^)]*)\s*\)'
        
        results = []
        
        # Find all function calls in the cleaned response
        for match in re.finditer(function_pattern, cleaned_text):
            tool_name = match.group(1)
            args_str = match.group(2).strip()
            
            # Skip if this isn't actually a tool (check against available tools)
            available_tools = await self.mcp_service.get_available_tools()
            if tool_name not in available_tools:
                continue
                
            try:
                # Parse arguments using regex for key=value pairs
                args = {}
                if args_str:
                    # Pattern for key=value pairs, handling quoted strings, numbers, booleans
                    arg_pattern = r'(\w+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|(\w+))'
                    
                    for arg_match in re.finditer(arg_pattern, args_str):
                        key = arg_match.group(1)
                        # Get the value from whichever group matched (quoted or unquoted)
                        value = (arg_match.group(2) or 
                                arg_match.group(3) or 
                                arg_match.group(4))
                        
                        # Type conversion for common types
                        if value.lower() == 'true':
                            args[key] = True
                        elif value.lower() == 'false':
                            args[key] = False
                        elif value.isdigit():
                            args[key] = int(value)
                        elif value.replace('.', '').isdigit():
                            args[key] = float(value)
                        else:
                            args[key] = value
                
                # Execute the tool
                logger.info(f"Executing tool: {tool_name} with args: {args}")
                result = await self.mcp_service.call_tool(tool_name, args)
                results.append({
                    'tool': tool_name,
                    'args': args,
                    'result': result
                })
                
            except Exception as e:
                results.append({
                    'tool': tool_name,
                    'args': args if 'args' in locals() else {},
                    'error': str(e)
                })
        
        # Format results for LLM
        if not results:
            return ""
            
        formatted_results = []
        for result in results:
            if 'error' in result:
                formatted_results.append(
                    f"Tool {result['tool']} failed: {result['error']}"
                )
            else:
                formatted_results.append(
                    f"Tool {result['tool']} executed successfully:\n{json.dumps(result['result'], indent=2)}"
                )
        
        return "\n\n".join(formatted_results)

    # Chat / tool integration
    async def generate_response(
        self,
        user_input: str,
        screen_context: str = "",
        tool_result: str = ""
    ) -> str:
        # Retrieve available tools metadata
        tools = await self.mcp_service.get_available_tools()
        # Format tool list for prompt
        tool_desc = "\n".join(f"- {name}: {info.get('description','')}" for name, info in tools.items())

        # Build messages
        messages: List[Dict[str, str]] = [
            {"role": "system", "content": get_generic_prompt()},
        ]
        # Inform LLM about tools
        if tool_desc:
            messages.append({"role": "system", "content": f"Available tools:\n{tool_desc}"})
        messages.append({"role": "user",   "content": user_input})
        if tool_result:
            messages.append({"role": "assistant", "content": tool_result})

        return await self.llm_service.get_chat_completion(messages)

    async def process_message(
        self,
        text_input: str,
        audio_input: Optional[str],
        history: List[Dict[str, str]]
    ) -> (List[Dict[str, str]], Optional[str]):
        # Debug: Log the incoming state
        logger.info(f"=== PROCESS_MESSAGE START ===")
        for i, msg in enumerate(history[-3:]):
            logger.info(f"  {len(history) - 3 + i}: {msg.get('role')} - {msg.get('content', '')[:100]}...")
        
        # Update the internal conversation history to match the UI history
        self.conversation_history = history.copy()
        
        # STT
        transcript = ""
        if audio_input:
            transcript = await self.audio_service.speech_to_text(audio_input)
        user_input = (text_input + " " + transcript).strip()
        
        # If no input, return unchanged
        if not user_input:
            return history, None
        
        # Check if this is a vision model output being processed
        is_vision_output = user_input.startswith("VISION MODEL OUTPUT:")
        
        # Add user message to both histories (ALWAYS add the user input)
        user_message = {"role": "user", "content": user_input}
        history.append(user_message)
        self.conversation_history.append(user_message)

        # Handle screen context - only for regular user inputs, not vision outputs
        screen_ctx = ""
        if not is_vision_output and self.latest_screen_context:
            screen_ctx = self.latest_screen_context
            # Clear the screen context after using it to prevent reuse
            self.latest_screen_context = ""

        # Get initial LLM response (may include tool calls)
        assistant_reply = await self.generate_response(user_input, screen_ctx)
        
        # Check if response contains function calls and execute them
        tool_results = await self.execute_tool_calls(assistant_reply)
        if tool_results:
            tool_message = {"role": "assistant", "content": tool_results}
            history.append(tool_message)
            self.conversation_history.append(tool_message)
            # Get final response after tool execution
            assistant_reply = await self.generate_response(user_input, screen_ctx, tool_results)

        # ALWAYS add the final assistant response to both histories
        assistant_message = {"role": "assistant", "content": assistant_reply}
        history.append(assistant_message)
        self.conversation_history.append(assistant_message)

        # TTS - only speak the assistant reply for regular inputs
        audio_path = None
        audio_bytes = await self.audio_service.text_to_speech(assistant_reply)
        if audio_bytes:
            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            tmp.write(audio_bytes)
            tmp.close()
            audio_path = tmp.name

        logger.info(f"=== PROCESS_MESSAGE END ===")
        
        return history, audio_path

    async def cleanup(self):
        """Cleanup resources"""
        await self.mcp_service.close()

# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”
# Gradio interface setup
# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”

chatbot = AgenticChatbot()

async def setup_gradio_interface() -> gr.Blocks:
    # Try to initialize MCP; if it fails, we’ll show a static message in Gradio

    mcp_available = await chatbot.initialize()

    with gr.Blocks(title="Agentic Chatbot", theme=gr.themes.Soft()) as demo:
        # If no MCP server, show a banner
        if not mcp_available:
            gr.Markdown(
                """
                <div style="padding:10px; border:2px solid #f00; border-radius:5px; background-color:#fee; color: #000;">
                  **⚠️ No MCP detected. Please refer to the README documentation. IRIS requires a Virtual Environment to run.**
                </div>
                """
            )

        chat = gr.Chatbot(type="messages", label="Conversation")
        text_input  = gr.Textbox(lines=2, placeholder="Type your message…", label="Text")
        audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Voice")

        # Screen-sharing controls
        screen_status = gr.Textbox(label="Screen Sharing Status", interactive=False)
        start_btn     = gr.Button("Start sharing screen")
        stop_btn      = gr.Button("Stop sharing screen")

        # AI response audio player
        audio_output  = gr.Audio(label="AI Response", autoplay=True)

        # Message send
        send_btn = gr.Button("Send", variant="primary")

        # Wire up buttons
        start_btn.click(fn=chatbot.start_screen_sharing, inputs=None, outputs=screen_status)
        stop_btn.click(fn=chatbot.stop_screen_sharing,
                       inputs=[chat],
                       outputs=[chat, screen_status, audio_output])

        send_btn.click(
            chatbot.process_message,
            inputs=[text_input, audio_input, chat],
            outputs=[chat, audio_output]
        )
        text_input.submit(
            chatbot.process_message,
            inputs=[text_input, audio_input, chat],
            outputs=[chat, audio_output]
        )

    return demo


if __name__ == "__main__":
    demo = asyncio.run(setup_gradio_interface())
    demo.launch(server_name="0.0.0.0", server_port=7860)