Spaces:
Running
Running
File size: 8,162 Bytes
c262d1a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
#!/usr/bin/env python3
"""
Monitor GAIA test progress and provide real-time status updates
"""
import argparse
import json
import os
import re
import time
from datetime import datetime
from pathlib import Path
def get_latest_log_file():
    """Return the newest ``classification_test_*.log`` file under ``logs/``.

    Returns:
        The matching ``Path`` with the largest modification time, or ``None``
        when the ``logs`` directory does not exist or holds no matching file.
    """
    def _modified_at(entry):
        # Modification time is the only ordering criterion.
        return entry.stat().st_mtime

    logs_root = Path("logs")
    if logs_root.exists():
        candidates = list(logs_root.glob("classification_test_*.log"))
        if candidates:
            return max(candidates, key=_modified_at)
    return None
# Matches the "[current/total]" counter printed on a per-question progress line.
_QUESTION_COUNTER = re.compile(r"\[(\d+)/(\d+)\]")


def parse_log_progress(log_file):
    """Parse a classification test log and summarise the current progress.

    Args:
        log_file: ``Path`` of the log to read, or ``None``.

    Returns:
        ``None`` when ``log_file`` is falsy or does not exist.
        ``{'error': <message>}`` when the file cannot be read.
        Otherwise a dict with the keys ``log_file``, ``last_modified``,
        ``classification_summary``, ``current_agent``,
        ``questions_processed``, ``total_questions``, ``current_question``
        and ``progress_percentage``.

    Lines that match a marker but carry an unparseable number are skipped
    individually, so one odd line does not discard the rest of the parse.
    """
    if not log_file or not log_file.exists():
        return None

    try:
        # errors='replace' keeps the monitor working even if the log holds
        # bytes that are not valid UTF-8.
        with open(log_file, 'r', encoding='utf-8', errors='replace') as f:
            lines = f.readlines()
        last_modified = datetime.fromtimestamp(log_file.stat().st_mtime)
    except OSError as e:
        return {'error': str(e)}

    # Per-agent question counts from the "CLASSIFICATION SUMMARY:" section.
    classification_summary = {}
    in_summary = False

    # State of the agent run that is currently being logged.
    current_agent = None
    questions_processed = 0
    total_questions = 0
    current_question = None

    for raw_line in lines:
        line = raw_line.strip()

        # Classification summary section
        if "CLASSIFICATION SUMMARY:" in line:
            in_summary = True
            continue
        elif in_summary and ":" in line and "questions" in line:
            parts = line.split(":")
            if len(parts) == 2:
                agent = parts[0].strip()
                count_part = parts[1].strip()
                if "(" in count_part:
                    try:
                        classification_summary[agent] = int(count_part.split()[0])
                    except ValueError:
                        pass  # not a "<count> questions (..%)" entry
        elif in_summary and "Testing agent types:" in line:
            in_summary = False

        # Current testing progress
        if "TESTING" in line and "AGENT" in line:
            agent_name = line.split("TESTING")[1].split("AGENT")[0].strip()
            if agent_name != current_agent:
                # A new agent run starts from zero; without this the counter
                # of the previous agent would be shown against the new total.
                questions_processed = 0
                current_question = None
            current_agent = agent_name
        elif "Questions to test:" in line:
            try:
                total_questions = int(line.split(":")[-1].strip())
            except ValueError:
                pass  # keep the last known total
        elif "Testing" in line:
            counter = _QUESTION_COUNTER.search(line)
            if counter:
                # The logged number is the question in flight, so one fewer
                # has actually been completed.
                questions_processed = max(int(counter.group(1)) - 1, 0)
                current_question = line.split("Testing")[1].split("...")[0].strip()

    return {
        'log_file': str(log_file),
        'last_modified': last_modified,
        'classification_summary': classification_summary,
        'current_agent': current_agent,
        'questions_processed': questions_processed,
        'total_questions': total_questions,
        'current_question': current_question,
        'progress_percentage': (questions_processed / total_questions * 100) if total_questions > 0 else 0
    }
def get_latest_results():
    """Load the most recently modified test results file, if any.

    Looks in the current working directory for files named
    ``gaia_classification_test_results_*.json`` and reads the newest one.

    Returns:
        A dict with the keys ``file``, ``metadata``, ``overall_stats`` and
        ``agent_performance`` (the last three default to ``{}`` when absent
        from the file), or ``None`` when no results file exists, the file
        cannot be read or parsed, or its top-level JSON value is not an
        object.
    """
    result_files = list(Path(".").glob("gaia_classification_test_results_*.json"))
    if not result_files:
        return None

    try:
        # stat() is inside the guard: a file may disappear between the
        # directory scan and this call.
        latest_file = max(result_files, key=lambda x: x.stat().st_mtime)
        with open(latest_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Results are best-effort: a half-written file just means "nothing
        # to show yet".
        return None

    if not isinstance(data, dict):
        return None

    return {
        'file': str(latest_file),
        'metadata': data.get('test_metadata', {}),
        'overall_stats': data.get('overall_stats', {}),
        'agent_performance': data.get('agent_performance', {})
    }
def display_status(progress, results, watch_mode=False):
    """Print the monitoring dashboard to stdout.

    Args:
        progress: Value returned by ``parse_log_progress``: ``None``, a dict
            with an ``'error'`` key, or a progress dict.
        results: Value returned by ``get_latest_results``: ``None`` or a dict
            with ``file``, ``overall_stats`` and ``agent_performance``.
        watch_mode: When true, the terminal is cleared before printing.
    """
    # NOTE(review): the icon characters in the strings below appear to be
    # mis-decoded UTF-8 emoji; they are kept as found and should be restored
    # from the original source file.
    if watch_mode:
        # Clear screen in watch mode
        os.system('clear' if os.name == 'posix' else 'cls')

    print("π GAIA TEST MONITORING DASHBOARD")
    print("=" * 60)
    print(f"π Last Updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    if progress and 'error' not in progress:
        print("\nπ CURRENT PROGRESS:")
        print(f"ποΈ Log File: {Path(progress['log_file']).name}")
        print(f"β° Last Modified: {progress['last_modified'].strftime('%H:%M:%S')}")

        if progress['current_agent']:
            print(f"\nπ€ Currently Testing: {progress['current_agent'].upper()} AGENT")
            print(f"π Progress: {progress['questions_processed']}/{progress['total_questions']} ({progress['progress_percentage']:.1f}%)")

            # Progress bar; the fill is clamped so an out-of-range percentage
            # can never make the bar shorter or longer than bar_length cells.
            bar_length = 30
            filled_length = int(bar_length * progress['progress_percentage'] / 100)
            filled_length = max(0, min(bar_length, filled_length))
            bar = "β" * filled_length + "β" * (bar_length - filled_length)
            print(f"β Progress: [{bar}] {progress['progress_percentage']:.1f}%")

            if progress['current_question']:
                print(f"π§© Current Question: {progress['current_question']}...")

        if progress['classification_summary']:
            print("\nπ CLASSIFICATION BREAKDOWN:")
            total_questions = sum(progress['classification_summary'].values())
            for agent, count in sorted(progress['classification_summary'].items()):
                percentage = (count / total_questions) * 100 if total_questions > 0 else 0
                print(f" {agent}: {count} questions ({percentage:.1f}%)")
    elif progress and 'error' in progress:
        print(f"\nβ ERROR reading log file: {progress['error']}")
    else:
        print("\nβ οΈ No active test logs found")

    if results:
        print("\nπ LATEST COMPLETED RESULTS:")
        print(f"π Results File: {Path(results['file']).name}")

        overall = results.get('overall_stats', {})
        if overall:
            print(f"β Success Rate: {overall.get('success_rate', 0):.1f}%")
            print(f"π Total Questions: {overall.get('total_questions', 0)}")
            print(f"β Successful: {overall.get('successful', 0)}")
            print(f"β Errors: {overall.get('errors', 0)}")

        agent_perf = results.get('agent_performance', {})
        if agent_perf:
            print("\nπ― AGENT PERFORMANCE:")
            # Missing per-agent fields default to 0, matching how the overall
            # stats above are read, so a partial results file still renders.
            ranked = sorted(
                agent_perf.items(),
                key=lambda x: x[1].get('success_rate', 0),
                reverse=True,
            )
            for agent, stats in ranked:
                success_rate = stats.get('success_rate', 0)
                status_emoji = "π’" if success_rate >= 90 else "π‘" if success_rate >= 70 else "π΄"
                print(f" {status_emoji} {agent}: {success_rate:.1f}% ({stats.get('successful', 0)}/{stats.get('total_questions', 0)})")

    print("\nπ MONITORING OPTIONS:")
    print(" Watch mode: python tests/monitor_tests.py --watch")
    print(" Analyze results: python tests/analyze_test_results.py <results_file>")
    print(" Run new test: python tests/test_by_classification.py --agent-types <type>")
def main():
    """Parse command-line options and show the dashboard once or in a loop.

    Without ``--watch`` the status is rendered a single time. With ``--watch``
    it is re-rendered, pausing ``--interval`` seconds between refreshes, until
    the user presses Ctrl+C.
    """
    cli = argparse.ArgumentParser(description="Monitor GAIA test progress")
    cli.add_argument('--watch', action='store_true', help='Watch mode (auto-refresh every 10s)')
    cli.add_argument('--interval', type=int, default=10, help='Refresh interval in seconds for watch mode')
    options = cli.parse_args()

    def render(watching):
        # Log progress is gathered first, then the latest results file.
        log_progress = parse_log_progress(get_latest_log_file())
        latest_results = get_latest_results()
        display_status(log_progress, latest_results, watch_mode=watching)

    if not options.watch:
        render(False)
        return

    print("π Starting watch mode... (Press Ctrl+C to stop)")
    try:
        while True:
            render(True)
            print(f"\nβ±οΈ Refreshing in {options.interval}s... (Ctrl+C to stop)")
            time.sleep(options.interval)
    except KeyboardInterrupt:
        print(f"\nπ Monitoring stopped.")
# Run the monitor only when this file is executed as a script.
if __name__ == "__main__":
    main()