Spaces:
Sleeping
Sleeping
File size: 16,763 Bytes
924e633 f1c0a32 924e633 f1c0a32 924e633 f1c0a32 924e633 f1c0a32 924e633 f1c0a32 924e633 f1c0a32 924e633 f1c0a32 924e633 f1c0a32 924e633 f1c0a32 924e633 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 |
import gradio as gr
import asyncio
import logging
import tempfile
import json
import re
import requests
from typing import Optional, Dict, Any, List
from services.audio_service import AudioService
from services.llm_service import LLMService
from services.screen_service import ScreenService
from config.settings import Settings
from config.prompts import get_generic_prompt, get_vision_prompt
# Configure root logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class MCPRestClient:
def __init__(self, base_url: str = "http://localhost:8000"):
self.base_url = base_url.rstrip('/')
async def initialize(self) -> bool:
"""Test connection to MCP server"""
mcp_available = False
try:
response = requests.get(f"{self.base_url}/", timeout=5)
if response.status_code == 200:
logger.info("Successfully connected to MCP server")
mcp_available = True
return mcp_available
else:
raise ConnectionError(f"MCP server returned status {response.status_code}")
except Exception as e:
logger.error(f"Failed to connect to MCP server at {self.base_url}: {e}")
logger.info("IRIS did not detect any MCP server. If you're running this in a HuggingFace space, please referr to the readme.md documentation.")
async def get_available_tools(self) -> Dict[str, Dict]:
"""Get list of available tools from MCP server"""
try:
response = requests.get(f"{self.base_url}/tools", timeout=5)
if response.status_code == 200:
data = response.json()
tools = {}
for tool in data.get("tools", []):
tools[tool["name"]] = {
"description": tool.get("description", ""),
"inputSchema": tool.get("inputSchema", {})
}
return tools
else:
logger.error(f"Failed to get tools: HTTP {response.status_code}")
return {}
except Exception as e:
logger.error(f"Failed to get tools: {e}")
return {}
async def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> Any:
"""Call a tool on the MCP server"""
try:
payload = {
"name": tool_name,
"arguments": arguments
}
response = requests.post(
f"{self.base_url}/tools/call",
json=payload,
timeout=30
)
if response.status_code == 200:
data = response.json()
if data.get("success"):
return data.get("result")
else:
return {"error": data.get("error", "Unknown error")}
else:
return {"error": f"HTTP {response.status_code}: {response.text}"}
except Exception as e:
return {"error": str(e)}
async def close(self):
"""Nothing to close with requests"""
pass
class AgenticChatbot:
def __init__(self):
self.settings = Settings()
# AudioService
audio_api_key = (
self.settings.hf_token
if self.settings.effective_audio_provider == "huggingface"
else self.settings.openai_api_key
)
self.audio_service = AudioService(
api_key=audio_api_key,
stt_provider="fal-ai",
stt_model=self.settings.stt_model,
tts_model=self.settings.tts_model,
)
# LLMService
self.llm_service = LLMService(
api_key=self.settings.llm_api_key,
model_name=self.settings.effective_model_name,
)
# MCPService - Now using REST client
mcp_server_url = getattr(self.settings, 'mcp_server_url', 'http://localhost:8000')
self.mcp_service = MCPRestClient(mcp_server_url)
# ScreenService
self.screen_service = ScreenService(
prompt=get_vision_prompt(),
model=self.settings.NEBIUS_MODEL,
fps=0.05,
queue_size=2,
monitor=1,
compression_quality=self.settings.screen_compression_quality,
max_width=self.settings.max_width,
max_height=self.settings.max_height,
)
self.latest_screen_context: str = ""
self.conversation_history: List[Dict[str, Any]] = []
async def initialize(self):
try:
mcp_available = await self.mcp_service.initialize()
if not mcp_available:
return False
tools = await self.mcp_service.get_available_tools()
logger.info(f"Initialized with {len(tools)} MCP tools")
except Exception as e:
logger.error(f"MCP init failed: {e}")
# Screen callbacks
def _on_screen_result(self, resp: dict, latency: float, frame_b64: str):
try:
content = resp.choices[0].message.content
except Exception:
content = str(resp)
self.latest_screen_context = content
logger.info(f"[Screen] {latency*1000:.0f}ms β {content}")
def _get_conversation_history(self) -> List[Dict[str, str]]:
"""Return the current conversation history for the screen service"""
return self.conversation_history.copy()
def start_screen_sharing(self) -> str:
self.latest_screen_context = ""
# Pass the history getter method to screen service
self.screen_service.start(
self._on_screen_result,
history_getter=self._get_conversation_history # Use the method reference
)
return "β
Screen sharing started."
async def stop_screen_sharing(
self,
history: Optional[List[Dict[str, str]]]
) -> (List[Dict[str, str]], str, Optional[str]):
"""Stop screen sharing and append an LLM-generated summary to the chat."""
# Stop capture
self.screen_service.stop()
# Get the latest vision context
vision_ctx = self.latest_screen_context
if vision_ctx and history is not None:
# Call process_message with the vision context as user input
updated_history, audio_path = await self.process_message(
text_input=f"VISION MODEL OUTPUT: {vision_ctx}",
audio_input=None,
history=history
)
return updated_history, "π Screen sharing stopped.", audio_path
# If no vision context or history, just return
return history or [], "π Screen sharing stopped.", None
async def execute_tool_calls(self, response_text: str) -> str:
"""Parse and execute function calls from LLM response using robust regex parsing"""
# Clean the response text - remove code blocks and extra formatting
cleaned_text = re.sub(r'```[a-zA-Z]*\n?', '', response_text) # Remove code block markers
cleaned_text = re.sub(r'\n```', '', cleaned_text) # Remove closing code blocks
# Pattern for function calls: function_name(arg1="value1", arg2=value2, arg3=true)
function_pattern = r'(\w+)\s*\(\s*([^)]*)\s*\)'
results = []
# Find all function calls in the cleaned response
for match in re.finditer(function_pattern, cleaned_text):
tool_name = match.group(1)
args_str = match.group(2).strip()
# Skip if this isn't actually a tool (check against available tools)
available_tools = await self.mcp_service.get_available_tools()
if tool_name not in available_tools:
continue
try:
# Parse arguments using regex for key=value pairs
args = {}
if args_str:
# Pattern for key=value pairs, handling quoted strings, numbers, booleans
arg_pattern = r'(\w+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|(\w+))'
for arg_match in re.finditer(arg_pattern, args_str):
key = arg_match.group(1)
# Get the value from whichever group matched (quoted or unquoted)
value = (arg_match.group(2) or
arg_match.group(3) or
arg_match.group(4))
# Type conversion for common types
if value.lower() == 'true':
args[key] = True
elif value.lower() == 'false':
args[key] = False
elif value.isdigit():
args[key] = int(value)
elif value.replace('.', '').isdigit():
args[key] = float(value)
else:
args[key] = value
# Execute the tool
logger.info(f"Executing tool: {tool_name} with args: {args}")
result = await self.mcp_service.call_tool(tool_name, args)
results.append({
'tool': tool_name,
'args': args,
'result': result
})
except Exception as e:
results.append({
'tool': tool_name,
'args': args if 'args' in locals() else {},
'error': str(e)
})
# Format results for LLM
if not results:
return ""
formatted_results = []
for result in results:
if 'error' in result:
formatted_results.append(
f"Tool {result['tool']} failed: {result['error']}"
)
else:
formatted_results.append(
f"Tool {result['tool']} executed successfully:\n{json.dumps(result['result'], indent=2)}"
)
return "\n\n".join(formatted_results)
# Chat / tool integration
async def generate_response(
self,
user_input: str,
screen_context: str = "",
tool_result: str = ""
) -> str:
# Retrieve available tools metadata
tools = await self.mcp_service.get_available_tools()
# Format tool list for prompt
tool_desc = "\n".join(f"- {name}: {info.get('description','')}" for name, info in tools.items())
# Build messages
messages: List[Dict[str, str]] = [
{"role": "system", "content": get_generic_prompt()},
]
# Inform LLM about tools
if tool_desc:
messages.append({"role": "system", "content": f"Available tools:\n{tool_desc}"})
messages.append({"role": "user", "content": user_input})
if tool_result:
messages.append({"role": "assistant", "content": tool_result})
return await self.llm_service.get_chat_completion(messages)
async def process_message(
self,
text_input: str,
audio_input: Optional[str],
history: List[Dict[str, str]]
) -> (List[Dict[str, str]], Optional[str]):
# Debug: Log the incoming state
logger.info(f"=== PROCESS_MESSAGE START ===")
for i, msg in enumerate(history[-3:]):
logger.info(f" {len(history) - 3 + i}: {msg.get('role')} - {msg.get('content', '')[:100]}...")
# Update the internal conversation history to match the UI history
self.conversation_history = history.copy()
# STT
transcript = ""
if audio_input:
transcript = await self.audio_service.speech_to_text(audio_input)
user_input = (text_input + " " + transcript).strip()
# If no input, return unchanged
if not user_input:
return history, None
# Check if this is a vision model output being processed
is_vision_output = user_input.startswith("VISION MODEL OUTPUT:")
# Add user message to both histories (ALWAYS add the user input)
user_message = {"role": "user", "content": user_input}
history.append(user_message)
self.conversation_history.append(user_message)
# Handle screen context - only for regular user inputs, not vision outputs
screen_ctx = ""
if not is_vision_output and self.latest_screen_context:
screen_ctx = self.latest_screen_context
# Clear the screen context after using it to prevent reuse
self.latest_screen_context = ""
# Get initial LLM response (may include tool calls)
assistant_reply = await self.generate_response(user_input, screen_ctx)
# Check if response contains function calls and execute them
tool_results = await self.execute_tool_calls(assistant_reply)
if tool_results:
tool_message = {"role": "assistant", "content": tool_results}
history.append(tool_message)
self.conversation_history.append(tool_message)
# Get final response after tool execution
assistant_reply = await self.generate_response(user_input, screen_ctx, tool_results)
# ALWAYS add the final assistant response to both histories
assistant_message = {"role": "assistant", "content": assistant_reply}
history.append(assistant_message)
self.conversation_history.append(assistant_message)
# TTS - only speak the assistant reply for regular inputs
audio_path = None
audio_bytes = await self.audio_service.text_to_speech(assistant_reply)
if audio_bytes:
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
tmp.write(audio_bytes)
tmp.close()
audio_path = tmp.name
logger.info(f"=== PROCESS_MESSAGE END ===")
return history, audio_path
async def cleanup(self):
"""Cleanup resources"""
await self.mcp_service.close()
# ββββββββββββββββββββββββββββββββββ
# Gradio interface setup
# ββββββββββββββββββββββββββββββββββ
chatbot = AgenticChatbot()
async def setup_gradio_interface() -> gr.Blocks:
# Try to initialize MCP; if it fails, weβll show a static message in Gradio
mcp_available = await chatbot.initialize()
with gr.Blocks(title="Agentic Chatbot", theme=gr.themes.Soft()) as demo:
# If no MCP server, show a banner
if not mcp_available:
gr.Markdown(
"""
<div style="padding:10px; border:2px solid #f00; border-radius:5px; background-color:#fee; color: #000;">
**β οΈ No MCP detected. Please refer to the README documentation. IRIS requires a Virtual Environment to run.**
</div>
"""
)
chat = gr.Chatbot(type="messages", label="Conversation")
text_input = gr.Textbox(lines=2, placeholder="Type your messageβ¦", label="Text")
audio_input = gr.Audio(sources=["microphone"], type="filepath", label="Voice")
# Screen-sharing controls
screen_status = gr.Textbox(label="Screen Sharing Status", interactive=False)
start_btn = gr.Button("Start sharing screen")
stop_btn = gr.Button("Stop sharing screen")
# AI response audio player
audio_output = gr.Audio(label="AI Response", autoplay=True)
# Message send
send_btn = gr.Button("Send", variant="primary")
# Wire up buttons
start_btn.click(fn=chatbot.start_screen_sharing, inputs=None, outputs=screen_status)
stop_btn.click(fn=chatbot.stop_screen_sharing,
inputs=[chat],
outputs=[chat, screen_status, audio_output])
send_btn.click(
chatbot.process_message,
inputs=[text_input, audio_input, chat],
outputs=[chat, audio_output]
)
text_input.submit(
chatbot.process_message,
inputs=[text_input, audio_input, chat],
outputs=[chat, audio_output]
)
return demo
if __name__ == "__main__":
demo = asyncio.run(setup_gradio_interface())
demo.launch(server_name="0.0.0.0", server_port=7860) |