#!/usr/bin/env python3
"""
Monitor GAIA test progress and provide real-time status updates
"""

import os
import time
import json
from pathlib import Path
from datetime import datetime
import argparse


def get_latest_log_file():
    """Find the most recent classification test log file.

    Returns:
        The ``logs/classification_test_*.log`` path with the newest
        modification time, or ``None`` when the ``logs`` directory does not
        exist or contains no matching file.
    """
    log_dir = Path("logs")
    if not log_dir.exists():
        return None

    log_files = list(log_dir.glob("classification_test_*.log"))
    if not log_files:
        return None

    return max(log_files, key=lambda x: x.stat().st_mtime)


def _parse_summary_entry(line):
    """Return ``(agent, count)`` for an ``agent: N questions (...)`` line, else ``None``."""
    parts = line.split(":")
    if len(parts) != 2:
        return None

    count_part = parts[1].strip()
    if "(" not in count_part:
        return None

    try:
        return parts[0].strip(), int(count_part.split()[0])
    except ValueError:
        return None


def _parse_question_marker(line):
    """Return ``(current_num, question)`` for a ``[X/Y] Testing ...`` line, else ``None``."""
    try:
        bracket_part = line.split("[")[1].split("]")[0]
        current_num = int(bracket_part.split("/")[0])
    except (IndexError, ValueError):
        return None

    question = line.split("Testing")[1].split("...")[0].strip()
    return current_num, question


def parse_log_progress(log_file):
    """Parse log file to extract current progress.

    Lines that match a marker but cannot be parsed are skipped, so one
    malformed line does not discard the progress gathered so far.

    Args:
        log_file: ``Path`` of the log to read, or ``None``.

    Returns:
        ``None`` when ``log_file`` is falsy or does not exist;
        ``{'error': message}`` when the file cannot be read; otherwise a dict
        with the keys ``log_file``, ``last_modified``,
        ``classification_summary``, ``current_agent``,
        ``questions_processed``, ``total_questions``, ``current_question``
        and ``progress_percentage``.
    """
    if not log_file or not log_file.exists():
        return None

    # Parse classification summary
    classification_summary = {}
    in_summary = False

    # Parse testing progress
    current_agent = None
    questions_processed = 0
    total_questions = 0
    current_question = None

    try:
        # Decode explicitly and leniently: a stray byte in the log must not
        # take the whole monitor down.
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            for raw_line in f:
                line = raw_line.strip()

                # Classification summary section
                if "CLASSIFICATION SUMMARY:" in line:
                    in_summary = True
                    continue
                elif in_summary and ":" in line and "questions" in line:
                    entry = _parse_summary_entry(line)
                    if entry is not None:
                        classification_summary[entry[0]] = entry[1]
                elif in_summary and "Testing agent types:" in line:
                    in_summary = False

                # Current testing progress
                if "TESTING" in line and "AGENT" in line:
                    current_agent = line.split("TESTING")[1].split("AGENT")[0].strip()
                elif "Questions to test:" in line:
                    try:
                        total_questions = int(line.split(":")[-1].strip())
                    except ValueError:
                        pass  # keep the last good value
                elif "Testing" in line and "/" in line and "]" in line:
                    # Extract current question number [X/Y]
                    marker = _parse_question_marker(line)
                    if marker is not None:
                        # Since this is the one being processed
                        questions_processed = marker[0] - 1
                        current_question = marker[1]

        last_modified = datetime.fromtimestamp(log_file.stat().st_mtime)
    except OSError as e:
        return {'error': str(e)}

    return {
        'log_file': str(log_file),
        'last_modified': last_modified,
        'classification_summary': classification_summary,
        'current_agent': current_agent,
        'questions_processed': questions_processed,
        'total_questions': total_questions,
        'current_question': current_question,
        'progress_percentage': (questions_processed / total_questions * 100) if total_questions > 0 else 0
    }


def get_latest_results():
    """Get the latest test results file.

    Looks in the current directory for
    ``gaia_classification_test_results_*.json`` and loads the one with the
    newest modification time.

    Returns:
        A dict with the keys ``file``, ``metadata``, ``overall_stats`` and
        ``agent_performance``, or ``None`` when no file matches, the file
        cannot be read or decoded, or its top-level JSON value is not an
        object.
    """
    result_files = list(Path(".").glob("gaia_classification_test_results_*.json"))
    if not result_files:
        return None

    latest_file = max(result_files, key=lambda x: x.stat().st_mtime)

    try:
        with open(latest_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError,
        # e.g. a results file that is still being written.
        return None

    if not isinstance(data, dict):
        return None

    return {
        'file': str(latest_file),
        'metadata': data.get('test_metadata', {}),
        'overall_stats': data.get('overall_stats', {}),
        'agent_performance': data.get('agent_performance', {})
    }


def display_status(progress, results, watch_mode=False):
    """Display current test status.

    Args:
        progress: Value returned by ``parse_log_progress`` (a progress dict,
            an ``{'error': ...}`` dict, or ``None``).
        results: Value returned by ``get_latest_results`` (dict or ``None``).
        watch_mode: When true, clear the terminal before printing.

    NOTE(review): the icon prefixes in the printed strings look mis-encoded;
    they are left untouched here. Confirm the file's intended encoding.
    """
    if watch_mode:
        # Clear screen in watch mode
        os.system('clear' if os.name == 'posix' else 'cls')

    print("šŸ” GAIA TEST MONITORING DASHBOARD")
    print("=" * 60)
    print(f"šŸ“… Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    if progress and 'error' not in progress:
        print(f"\nšŸ“Š CURRENT PROGRESS:")
        print(f"šŸ—‚ļø Log File: {Path(progress['log_file']).name}")
        print(f"ā° Last Modified: {progress['last_modified'].strftime('%H:%M:%S')}")

        if progress['current_agent']:
            print(f"\nšŸ¤– Currently Testing: {progress['current_agent'].upper()} AGENT")
            print(f"šŸ“ˆ Progress: {progress['questions_processed']}/{progress['total_questions']} ({progress['progress_percentage']:.1f}%)")

            # Progress bar
            bar_length = 30
            filled_length = int(bar_length * progress['progress_percentage'] / 100)
            bar = "ā–ˆ" * filled_length + "ā–‘" * (bar_length - filled_length)
            print(f"ā–“ Progress: [{bar}] {progress['progress_percentage']:.1f}%")

            if progress['current_question']:
                print(f"🧩 Current Question: {progress['current_question']}...")

        if progress['classification_summary']:
            print(f"\nšŸ“Š CLASSIFICATION BREAKDOWN:")
            total_questions = sum(progress['classification_summary'].values())
            for agent, count in sorted(progress['classification_summary'].items()):
                percentage = (count / total_questions) * 100 if total_questions > 0 else 0
                print(f" {agent}: {count} questions ({percentage:.1f}%)")

    elif progress and 'error' in progress:
        print(f"\nāŒ ERROR reading log file: {progress['error']}")
    else:
        print(f"\nāš ļø No active test logs found")

    if results:
        print(f"\nšŸ“‹ LATEST COMPLETED RESULTS:")
        print(f"šŸ“„ Results File: {Path(results['file']).name}")

        overall = results.get('overall_stats', {})
        if overall:
            print(f"āœ… Success Rate: {overall.get('success_rate', 0):.1f}%")
            print(f"šŸ“Š Total Questions: {overall.get('total_questions', 0)}")
            print(f"āœ… Successful: {overall.get('successful', 0)}")
            print(f"āŒ Errors: {overall.get('errors', 0)}")

        agent_perf = results.get('agent_performance', {})
        if agent_perf:
            print(f"\nšŸŽÆ AGENT PERFORMANCE:")
            # Missing per-agent fields default to 0 so a partial results
            # file is still rendered instead of raising KeyError.
            ranked = sorted(
                agent_perf.items(),
                key=lambda x: x[1].get('success_rate', 0),
                reverse=True,
            )
            for agent, stats in ranked:
                success_rate = stats.get('success_rate', 0)
                status_emoji = "🟢" if success_rate >= 90 else "🟔" if success_rate >= 70 else "šŸ”“"
                print(f" {status_emoji} {agent}: {success_rate:.1f}% ({stats.get('successful', 0)}/{stats.get('total_questions', 0)})")

    print(f"\nšŸ” MONITORING OPTIONS:")
    print(f" Watch mode: python tests/monitor_tests.py --watch")
    print(f" Analyze results: python tests/analyze_test_results.py ")
    print(f" Run new test: python tests/test_by_classification.py --agent-types ")


def main():
    """Main monitoring interface.

    Parses ``--watch`` and ``--interval``. In watch mode the status is
    re-rendered every ``--interval`` seconds until Ctrl+C; otherwise it is
    rendered once.
    """
    parser = argparse.ArgumentParser(description="Monitor GAIA test progress")
    parser.add_argument('--watch', action='store_true', help='Watch mode (auto-refresh every 10s)')
    parser.add_argument('--interval', type=int, default=10, help='Refresh interval in seconds for watch mode')

    args = parser.parse_args()

    if args.watch:
        # time.sleep() raises ValueError on a negative value; reject it up
        # front with a usage error instead of crashing after the first render.
        if args.interval < 0:
            parser.error("--interval must be non-negative")

        print("šŸ‘€ Starting watch mode... (Press Ctrl+C to stop)")
        try:
            while True:
                progress = parse_log_progress(get_latest_log_file())
                results = get_latest_results()
                display_status(progress, results, watch_mode=True)

                print(f"\nā±ļø Refreshing in {args.interval}s... (Ctrl+C to stop)")
                time.sleep(args.interval)

        except KeyboardInterrupt:
            print(f"\nšŸ‘‹ Monitoring stopped.")
    else:
        progress = parse_log_progress(get_latest_log_file())
        results = get_latest_results()
        display_status(progress, results, watch_mode=False)


if __name__ == "__main__":
    main()