#!/usr/bin/env python3
"""
Monitor GAIA test progress and provide real-time status updates.
"""
import os | |
import time | |
import json | |
from pathlib import Path | |
from datetime import datetime | |
import argparse | |
def get_latest_log_file():
    """Return the newest ``classification_test_*.log`` inside ``logs/``.

    The ``logs`` directory is resolved relative to the current working
    directory.

    Returns:
        The ``Path`` of the matching log file with the greatest modification
        time, or ``None`` when ``logs`` does not exist or contains no
        matching file.
    """
    logs_root = Path("logs")
    candidates = (
        list(logs_root.glob("classification_test_*.log"))
        if logs_root.exists()
        else []
    )

    def modified_at(path):
        return path.stat().st_mtime

    if candidates:
        return max(candidates, key=modified_at)
    return None
def _scan_progress_lines(lines):
    """Fold an iterable of log lines into the parsed progress fields.

    Pure text processing with no file access. A line that matches one of the
    markers but carries a missing or non-numeric number is skipped, so one
    malformed line cannot discard everything parsed before it.

    Returns:
        dict with keys ``classification_summary``, ``current_agent``,
        ``questions_processed``, ``total_questions`` and ``current_question``.
    """
    classification_summary = {}
    in_summary = False
    current_agent = None
    questions_processed = 0
    total_questions = 0
    current_question = None

    for raw_line in lines:
        line = raw_line.strip()

        # Classification summary section: "<agent>: <count> questions (...)"
        if "CLASSIFICATION SUMMARY:" in line:
            in_summary = True
            continue
        if in_summary:
            if ":" in line and "questions" in line:
                parts = line.split(":")
                if len(parts) == 2:
                    agent = parts[0].strip()
                    count_part = parts[1].strip()
                    if "(" in count_part:
                        try:
                            classification_summary[agent] = int(count_part.split()[0])
                        except ValueError:
                            # Count is not a number: ignore this entry only.
                            pass
            elif "Testing agent types:" in line:
                in_summary = False

        # Current testing progress
        if "TESTING" in line and "AGENT" in line:
            current_agent = line.split("TESTING")[1].split("AGENT")[0].strip()
        elif "Questions to test:" in line:
            try:
                total_questions = int(line.split(":")[-1].strip())
            except ValueError:
                # Keep the previously parsed total.
                pass
        elif "Testing" in line and "/" in line and "]" in line:
            # Extract the current question number from "[X/Y]".
            try:
                bracket_part = line.split("[")[1].split("]")[0]
                current_num = int(bracket_part.split("/")[0])
            except (IndexError, ValueError):
                # No "[" at all, or X is not a number: not a progress line.
                continue
            # Question X is the one being processed, so X - 1 are done.
            questions_processed = current_num - 1
            current_question = line.split("Testing")[1].split("...")[0].strip()

    return {
        'classification_summary': classification_summary,
        'current_agent': current_agent,
        'questions_processed': questions_processed,
        'total_questions': total_questions,
        'current_question': current_question,
    }


def parse_log_progress(log_file):
    """Parse a classification test log and report the current progress.

    Args:
        log_file: ``pathlib.Path`` of the log to read, or ``None``.

    Returns:
        ``None`` when ``log_file`` is falsy or does not exist.
        ``{'error': <message>}`` when the file cannot be opened, read or
        stat'ed. Otherwise a dict with the keys ``log_file`` (str),
        ``last_modified`` (``datetime`` built from the file's mtime),
        ``classification_summary`` (agent name -> question count),
        ``current_agent``, ``questions_processed``, ``total_questions``,
        ``current_question`` and ``progress_percentage`` (0 when the total
        is not positive).
    """
    if not log_file or not log_file.exists():
        return None

    try:
        # Decode as UTF-8 and replace undecodable bytes so that a stray byte
        # in the log does not abort the whole parse.
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            parsed = _scan_progress_lines(f)
        last_modified = datetime.fromtimestamp(log_file.stat().st_mtime)
    except OSError as e:
        return {'error': str(e)}

    questions_processed = parsed['questions_processed']
    total_questions = parsed['total_questions']
    return {
        'log_file': str(log_file),
        'last_modified': last_modified,
        'classification_summary': parsed['classification_summary'],
        'current_agent': parsed['current_agent'],
        'questions_processed': questions_processed,
        'total_questions': total_questions,
        'current_question': parsed['current_question'],
        'progress_percentage': (questions_processed / total_questions * 100) if total_questions > 0 else 0
    }
def get_latest_results():
    """Load the most recently modified classification results file.

    Looks in the current working directory for files named
    ``gaia_classification_test_results_*.json`` and reads the one with the
    greatest modification time.

    Returns:
        A dict with the keys ``file`` (path as str), ``metadata``,
        ``overall_stats`` and ``agent_performance`` (each an empty dict when
        the corresponding key is absent from the file), or ``None`` when no
        file matches, the file cannot be read, it is not valid JSON, or its
        top-level value is not a JSON object.
    """
    result_files = list(Path(".").glob("gaia_classification_test_results_*.json"))
    if not result_files:
        return None

    try:
        latest_file = max(result_files, key=lambda x: x.stat().st_mtime)
        with open(latest_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # OSError: file vanished or is unreadable. ValueError covers both
        # json.JSONDecodeError (e.g. a file still being written) and
        # UnicodeDecodeError. KeyboardInterrupt is deliberately NOT caught so
        # Ctrl+C keeps working while a file is being loaded.
        return None

    if not isinstance(data, dict):
        return None

    return {
        'file': str(latest_file),
        'metadata': data.get('test_metadata', {}),
        'overall_stats': data.get('overall_stats', {}),
        'agent_performance': data.get('agent_performance', {})
    }
def display_status(progress, results, watch_mode=False):
    """Print the monitoring dashboard to standard output.

    Args:
        progress: ``None``, a dict containing an ``'error'`` key, or a dict
            with the keys ``log_file``, ``last_modified``, ``current_agent``,
            ``questions_processed``, ``total_questions``,
            ``progress_percentage``, ``current_question`` and
            ``classification_summary``.
        results: ``None`` or a dict with the key ``file`` and optionally
            ``overall_stats`` and ``agent_performance``.
        watch_mode: when true, the terminal is cleared before printing.

    Missing per-agent statistics are shown as 0 rather than raising, matching
    how the overall statistics are read.
    """
    # NOTE(review): the emoji prefixes in the strings below look mis-encoded
    # in this copy of the file; they are preserved as found. Confirm the
    # intended characters against the original source.
    if watch_mode:
        # Clear screen in watch mode
        os.system('clear' if os.name == 'posix' else 'cls')

    print("π GAIA TEST MONITORING DASHBOARD")
    print("=" * 60)
    print(f"π Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    if progress and 'error' not in progress:
        print("\nπ CURRENT PROGRESS:")
        print(f"ποΈ Log File: {Path(progress['log_file']).name}")
        print(f"β° Last Modified: {progress['last_modified'].strftime('%H:%M:%S')}")

        if progress['current_agent']:
            print(f"\nπ€ Currently Testing: {progress['current_agent'].upper()} AGENT")
            print(f"π Progress: {progress['questions_processed']}/{progress['total_questions']} ({progress['progress_percentage']:.1f}%)")

            # Progress bar; the filled part is clamped to the bar width so a
            # percentage outside 0-100 cannot make the bar overflow.
            bar_length = 30
            filled_length = int(bar_length * progress['progress_percentage'] / 100)
            filled_length = max(0, min(bar_length, filled_length))
            bar = "β" * filled_length + "β" * (bar_length - filled_length)
            print(f"β Progress: [{bar}] {progress['progress_percentage']:.1f}%")

            if progress['current_question']:
                print(f"π§© Current Question: {progress['current_question']}...")

        if progress['classification_summary']:
            print("\nπ CLASSIFICATION BREAKDOWN:")
            total_questions = sum(progress['classification_summary'].values())
            for agent, count in sorted(progress['classification_summary'].items()):
                percentage = (count / total_questions) * 100 if total_questions > 0 else 0
                print(f" {agent}: {count} questions ({percentage:.1f}%)")
    elif progress and 'error' in progress:
        print(f"\nβ ERROR reading log file: {progress['error']}")
    else:
        print("\nβ οΈ No active test logs found")

    if results:
        print("\nπ LATEST COMPLETED RESULTS:")
        print(f"π Results File: {Path(results['file']).name}")

        overall = results.get('overall_stats', {})
        if overall:
            print(f"β Success Rate: {overall.get('success_rate', 0):.1f}%")
            print(f"π Total Questions: {overall.get('total_questions', 0)}")
            print(f"β Successful: {overall.get('successful', 0)}")
            print(f"β Errors: {overall.get('errors', 0)}")

        agent_perf = results.get('agent_performance', {})
        if agent_perf:
            print("\nπ― AGENT PERFORMANCE:")
            # Best-performing agents first; a missing rate sorts as 0.
            ranked = sorted(
                agent_perf.items(),
                key=lambda x: x[1].get('success_rate', 0),
                reverse=True,
            )
            for agent, stats in ranked:
                success_rate = stats.get('success_rate', 0)
                successful = stats.get('successful', 0)
                total = stats.get('total_questions', 0)
                status_emoji = "π’" if success_rate >= 90 else "π‘" if success_rate >= 70 else "π΄"
                print(f" {status_emoji} {agent}: {success_rate:.1f}% ({successful}/{total})")

    print("\nπ MONITORING OPTIONS:")
    print(" Watch mode: python tests/monitor_tests.py --watch")
    print(" Analyze results: python tests/analyze_test_results.py <results_file>")
    print(" Run new test: python tests/test_by_classification.py --agent-types <type>")
def main():
    """Command-line entry point: show the dashboard once or keep refreshing.

    Options:
        --watch: redraw the dashboard repeatedly until Ctrl+C is pressed.
        --interval N: seconds to wait between redraws in watch mode
            (default 10). A negative value is rejected with a usage error
            in watch mode, because ``time.sleep`` does not accept it.
    """
    parser = argparse.ArgumentParser(description="Monitor GAIA test progress")
    parser.add_argument('--watch', action='store_true', help='Watch mode (auto-refresh every 10s)')
    parser.add_argument('--interval', type=int, default=10, help='Refresh interval in seconds for watch mode')
    args = parser.parse_args()

    def refresh(watch_mode):
        # One dashboard update: re-read the newest log and results file.
        progress = parse_log_progress(get_latest_log_file())
        results = get_latest_results()
        display_status(progress, results, watch_mode=watch_mode)

    if not args.watch:
        refresh(watch_mode=False)
        return

    if args.interval < 0:
        # Fail up front with a usage message instead of a ValueError
        # traceback from time.sleep() after the first refresh.
        parser.error("--interval must be a non-negative number of seconds")

    print("π Starting watch mode... (Press Ctrl+C to stop)")
    try:
        while True:
            refresh(watch_mode=True)
            print(f"\nβ±οΈ Refreshing in {args.interval}s... (Ctrl+C to stop)")
            time.sleep(args.interval)
    except KeyboardInterrupt:
        print("\nπ Monitoring stopped.")


if __name__ == "__main__":
    main()