Spaces:

Agents-MCP-Hackathon
/

IRIS

Sleeping

File size: 16,763 Bytes

import gradio as gr
import asyncio
import logging
import tempfile
import json
import re
import requests
from typing import Optional, Dict, Any, List

from services.audio_service import AudioService
from services.llm_service import LLMService
from services.screen_service import ScreenService
from config.settings import Settings
from config.prompts import get_generic_prompt, get_vision_prompt

# Configure root logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MCPRestClient:
    def __init__(self, base_url: str = "http://localhost:8000"):
        self.base_url = base_url.rstrip('/')

    async def initialize(self) -> bool:
        """Test connection to MCP server"""
        mcp_available = False
        try:
            response = requests.get(f"{self.base_url}/", timeout=5)
            if response.status_code == 200:
                logger.info("Successfully connected to MCP server")
                mcp_available = True
                return mcp_available
            else:
                raise ConnectionError(f"MCP server returned status {response.status_code}")
        except Exception as e:
            logger.error(f"Failed to connect to MCP server at {self.base_url}: {e}")
            logger.info("IRIS did not detect any MCP server. If you're running this in a HuggingFace space, please referr to the readme.md documentation.")

    async def get_available_tools(self) -> Dict[str, Dict]:
        """Get list of available tools from MCP server"""
        try:
            response = requests.get(f"{self.base_url}/tools", timeout=5)
            if response.status_code == 200:
                data = response.json()
                tools = {}
                for tool in data.get("tools", []):
                    tools[tool["name"]] = {
                        "description": tool.get("description", ""),
                        "inputSchema": tool.get("inputSchema", {})
                    }
                return tools
            else:
                logger.error(f"Failed to get tools: HTTP {response.status_code}")
                return {}
        except Exception as e:
            logger.error(f"Failed to get tools: {e}")
            return {}

    async def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Any:
        """Call a tool on the MCP server"""
        try:
            payload = {
                "name": tool_name,
                "arguments": arguments
            }
            
            response = requests.post(
                f"{self.base_url}/tools/call",
                json=payload,
                timeout=30
            )
            
            if response.status_code == 200:
                data = response.json()
                if data.get("success"):
                    return data.get("result")
                else:
                    return {"error": data.get("error", "Unknown error")}
            else:
                return {"error": f"HTTP {response.status_code}: {response.text}"}
                    
        except Exception as e:
            return {"error": str(e)}

    async def close(self):
        """Nothing to close with requests"""
        pass

class AgenticChatbot:
    def __init__(self):
        self.settings = Settings()

        # AudioService
        audio_api_key = (
            self.settings.hf_token
            if self.settings.effective_audio_provider == "huggingface"
            else self.settings.openai_api_key
        )
        self.audio_service = AudioService(
            api_key=audio_api_key,
            stt_provider="fal-ai",
            stt_model=self.settings.stt_model,
            tts_model=self.settings.tts_model,
        )

        # LLMService
        self.llm_service = LLMService(
            api_key=self.settings.llm_api_key,
            model_name=self.settings.effective_model_name,
        )

        # MCPService - Now using REST client
        mcp_server_url = getattr(self.settings, 'mcp_server_url', 'http://localhost:8000')
        self.mcp_service = MCPRestClient(mcp_server_url)

        # ScreenService
        self.screen_service = ScreenService(
            prompt=get_vision_prompt(),
            model=self.settings.NEBIUS_MODEL,
            fps=0.05,
            queue_size=2,
            monitor=1,
            compression_quality=self.settings.screen_compression_quality,
            max_width=self.settings.max_width,
            max_height=self.settings.max_height,
        )
        self.latest_screen_context: str = ""
        self.conversation_history: List[Dict[str, Any]] = []

    async def initialize(self):
        try:
            mcp_available = await self.mcp_service.initialize()
            if not mcp_available:
                return False
            tools = await self.mcp_service.get_available_tools()
            logger.info(f"Initialized with {len(tools)} MCP tools")
        except Exception as e:
            logger.error(f"MCP init failed: {e}")

    # Screen callbacks
    def _on_screen_result(self, resp: dict, latency: float, frame_b64: str):
        try:
            content = resp.choices[0].message.content
        except Exception:
            content = str(resp)
        self.latest_screen_context = content
        logger.info(f"[Screen] {latency*1000:.0f}ms → {content}")

    def _get_conversation_history(self) -> List[Dict[str, str]]:
        """Return the current conversation history for the screen service"""
        return self.conversation_history.copy()

    def start_screen_sharing(self) -> str:
        self.latest_screen_context = ""
        # Pass the history getter method to screen service
        self.screen_service.start(
            self._on_screen_result,
            history_getter=self._get_conversation_history  # Use the method reference
        )
        return "✅ Screen sharing started."

    async def stop_screen_sharing(
        self,
        history: Optional[List[Dict[str, str]]]
    ) -> (List[Dict[str, str]], str, Optional[str]):
        """Stop screen sharing and append an LLM-generated summary to the chat."""
        # Stop capture
        self.screen_service.stop()
        
        # Get the latest vision context
        vision_ctx = self.latest_screen_context
        
        if vision_ctx and history is not None:
            # Call process_message with the vision context as user input
            updated_history, audio_path = await self.process_message(
                text_input=f"VISION MODEL OUTPUT: {vision_ctx}",
                audio_input=None,
                history=history
            )
            return updated_history, "🛑 Screen sharing stopped.", audio_path
        
        # If no vision context or history, just return
        return history or [], "🛑 Screen sharing stopped.", None

    async def execute_tool_calls(self, response_text: str) -> str:
        """Parse and execute function calls from LLM response using robust regex parsing"""
        
        # Clean the response text - remove code blocks and extra formatting
        cleaned_text = re.sub(r'```[a-zA-Z]*\n?', '', response_text)  # Remove code block markers
        cleaned_text = re.sub(r'\n```', '', cleaned_text)  # Remove closing code blocks
        
        # Pattern for function calls: function_name(arg1="value1", arg2=value2, arg3=true)
        function_pattern = r'(\w+)\s*\(\s*([^)]*)\s*\)'
        
        results = []
        
        # Find all function calls in the cleaned response
        for match in re.finditer(function_pattern, cleaned_text):
            tool_name = match.group(1)
            args_str = match.group(2).strip()
            
            # Skip if this isn't actually a tool (check against available tools)
            available_tools = await self.mcp_service.get_available_tools()
            if tool_name not in available_tools:
                continue
                
            try:
                # Parse arguments using regex for key=value pairs
                args = {}
                if args_str:
                    # Pattern for key=value pairs, handling quoted strings, numbers, booleans
                    arg_pattern = r'(\w+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|(\w+))'
                    
                    for arg_match in re.finditer(arg_pattern, args_str):
                        key = arg_match.group(1)
                        # Get the value from whichever group matched (quoted or unquoted)
                        value = (arg_match.group(2) or 
                                arg_match.group(3) or 
                                arg_match.group(4))
                        
                        # Type conversion for common types
                        if value.lower() == 'true':
                            args[key] = True
                        elif value.lower() == 'false':
                            args[key] = False
                        elif value.isdigit():
                            args[key] = int(value)
                        elif value.replace('.', '').isdigit():
                            args[key] = float(value)
                        else:
                            args[key] = value
                
                # Execute the tool
                logger.info(f"Executing tool: {tool_name} with args: {args}")
                result = await self.mcp_service.call_tool(tool_name, args)
                results.append({
                    'tool': tool_name,
                    'args': args,
                    'result': result
                })
                
            except Exception as e:
                results.append({
                    'tool': tool_name,
                    'args': args if 'args' in locals() else {},
                    'error': str(e)
                })
        
        # Format results for LLM
        if not results:
            return ""
            
        formatted_results = []
        for result in results:
            if 'error' in result:
                formatted_results.append(
                    f"Tool {result['tool']} failed: {result['error']}"
                )
            else:
                formatted_results.append(
                    f"Tool {result['tool']} executed successfully:\n{json.dumps(result['result'], indent=2)}"
                )
        
        return "\n\n".join(formatted_results)

    # Chat / tool integration
    async def generate_response(
        self,
        user_input: str,
        screen_context: str = "",
        tool_result: str = ""
    ) -> str:
        # Retrieve available tools metadata
        tools = await self.mcp_service.get_available_tools()
        # Format tool list for prompt
        tool_desc = "\n".join(f"- {name}: {info.get('description','')}" for name, info in tools.items())

        # Build messages
        messages: List[Dict[str, str]] = [
            {"role": "system", "content": get_generic_prompt()},
        ]
        # Inform LLM about tools
        if tool_desc:
            messages.append({"role": "system", "content": f"Available tools:\n{tool_desc}"})
        messages.append({"role": "user",   "content": user_input})
        if tool_result:
            messages.append({"role": "assistant", "content": tool_result})

        return await self.llm_service.get_chat_completion(messages)

    async def process_message(
        self,
        text_input: str,
        audio_input: Optional[str],
        history: List[Dict[str, str]]
    ) -> (List[Dict[str, str]], Optional[str]):
        # Debug: Log the incoming state
        logger.info(f"=== PROCESS_MESSAGE START ===")
        for i, msg in enumerate(history[-3:]):
            logger.info(f"  {len(history) - 3 + i}: {msg.get('role')} - {msg.get('content', '')[:100]}...")
        
        # Update the internal conversation history to match the UI history
        self.conversation_history = history.copy()
        
        # STT
        transcript = ""
        if audio_input:
            transcript = await self.audio_service.speech_to_text(audio_input)
        user_input = (text_input + " " + transcript).strip()
        
        # If no input, return unchanged
        if not user_input:
            return history, None
        
        # Check if this is a vision model output being processed
        is_vision_output = user_input.startswith("VISION MODEL OUTPUT:")
        
        # Add user message to both histories (ALWAYS add the user input)
        user_message = {"role": "user", "content": user_input}
        history.append(user_message)
        self.conversation_history.append(user_message)

        # Handle screen context - only for regular user inputs, not vision outputs
        screen_ctx = ""
        if not is_vision_output and self.latest_screen_context:
            screen_ctx = self.latest_screen_context
            # Clear the screen context after using it to prevent reuse
            self.latest_screen_context = ""

        # Get initial LLM response (may include tool calls)
        assistant_reply = await self.generate_response(user_input, screen_ctx)
        
        # Check if response contains function calls and execute them
        tool_results = await self.execute_tool_calls(assistant_reply)
        if tool_results:
            tool_message = {"role": "assistant", "content": tool_results}
            history.append(tool_message)
            self.conversation_history.append(tool_message)
            # Get final response after tool execution
            assistant_reply = await self.generate_response(user_input, screen_ctx, tool_results)

        # ALWAYS add the final assistant response to both histories
        assistant_message = {"role": "assistant", "content": assistant_reply}
        history.append(assistant_message)
        self.conversation_history.append(assistant_message)

        # TTS - only speak the assistant reply for regular inputs
        audio_path = None
        audio_bytes = await self.audio_service.text_to_speech(assistant_reply)
        if audio_bytes:
            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            tmp.write(audio_bytes)
            tmp.close()
            audio_path = tmp.name

        logger.info(f"=== PROCESS_MESSAGE END ===")
        
        return history, audio_path

    async def cleanup(self):
        """Cleanup resources"""
        await self.mcp_service.close()

# ——————————————————————————————————
# Gradio interface setup
# ——————————————————————————————————

chatbot = AgenticChatbot()

async def setup_gradio_interface() -> gr.Blocks:
    # Try to initialize MCP; if it fails, we’ll show a static message in Gradio

    mcp_available = await chatbot.initialize()

    with gr.Blocks(title="Agentic Chatbot", theme=gr.themes.Soft()) as demo:
        # If no MCP server, show a banner
        if not mcp_available:
            gr.Markdown(
                """
                <div style="padding:10px; border:2px solid #f00; border-radius:5px; background-color:#fee; color: #000;">
                  **⚠️ No MCP detected. Please refer to the README documentation. IRIS requires a Virtual Environment to run.**
                </div>
                """
            )

        chat = gr.Chatbot(type="messages", label="Conversation")
        text_input  = gr.Textbox(lines=2, placeholder="Type your message…", label="Text")
        audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Voice")

        # Screen-sharing controls
        screen_status = gr.Textbox(label="Screen Sharing Status", interactive=False)
        start_btn     = gr.Button("Start sharing screen")
        stop_btn      = gr.Button("Stop sharing screen")

        # AI response audio player
        audio_output  = gr.Audio(label="AI Response", autoplay=True)

        # Message send
        send_btn = gr.Button("Send", variant="primary")

        # Wire up buttons
        start_btn.click(fn=chatbot.start_screen_sharing, inputs=None, outputs=screen_status)
        stop_btn.click(fn=chatbot.stop_screen_sharing,
                       inputs=[chat],
                       outputs=[chat, screen_status, audio_output])

        send_btn.click(
            chatbot.process_message,
            inputs=[text_input, audio_input, chat],
            outputs=[chat, audio_output]
        )
        text_input.submit(
            chatbot.process_message,
            inputs=[text_input, audio_input, chat],
            outputs=[chat, audio_output]
        )

    return demo


if __name__ == "__main__":
    demo = asyncio.run(setup_gradio_interface())
    demo.launch(server_name="0.0.0.0", server_port=7860)