#!/usr/bin/env python3
"""
Run comprehensive GAIA tests across all classification groups
This script orchestrates the complete testing workflow and analysis
"""
import subprocess
import time
import json
from pathlib import Path
from datetime import datetime
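
# NOTE: run_command executes each test through the shell (shell=True) so that
# compound commands such as "source venv/bin/activate && ..." work; this
# assumes a POSIX-compatible shell where `source` is available (e.g. bash).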
def run_command(command, description, timeout=1800):
    """Run a command with timeout and capture output"""
    print(f"\n🚀 {description}")
    print(f"Command: {command}")
    print("-" * 60)
    try:
        result = subprocess.run(
            command,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout
        )
        if result.returncode == 0:
            print("✅ SUCCESS")
            print(f"Output: {result.stdout[:500]}...")
            return True, result.stdout
        else:
            print("❌ FAILED")
            print(f"Error: {result.stderr[:500]}...")
            return False, result.stderr
    except subprocess.TimeoutExpired:
        print(f"⏰ TIMEOUT after {timeout}s")
        return False, "Command timed out"
    except Exception as e:
        print(f"💥 EXCEPTION: {e}")
        return False, str(e)


def main():
    """Run the comprehensive testing workflow"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    print("🎯 COMPREHENSIVE GAIA TESTING WORKFLOW")
    print("=" * 70)
    print(f"Started: {datetime.now()}")

    # Prefix that activates the virtual environment before each test command
    venv_prefix = "source venv/bin/activate &&"
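    # The per-test timeouts below (900-3600s) are generous upper bounds,
    # presumably sized to each category's expected runtime.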
    # Test plan - run each agent type separately for better error analysis
    test_plan = [
        {
            "name": "Research Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types research",
            "timeout": 1800,
            "priority": "HIGH"
        },
        {
            "name": "Multimedia Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types multimedia",
            "timeout": 2400,
            "priority": "HIGH"
        },
        {
            "name": "Logic/Math Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types logic_math",
            "timeout": 1200,
            "priority": "MEDIUM"
        },
        {
            "name": "File Processing Questions",
            "command": f"{venv_prefix} python tests/test_by_classification.py --agent-types file_processing",
            "timeout": 900,
            "priority": "MEDIUM"
        },
        {
            "name": "All Agent Types (Complete)",
            "command": f"{venv_prefix} python tests/test_by_classification.py",
            "timeout": 3600,
            "priority": "LOW"
        }
    ]
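    # Tests execute in list order; the final entry re-runs every agent type
    # together as a combined pass.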
    results = []

    # Execute test plan
    for i, test in enumerate(test_plan, 1):
        print(f"\n{'='*20} TEST {i}/{len(test_plan)} {'='*20}")
        print(f"Name: {test['name']}")
        print(f"Priority: {test['priority']}")

        start_time = time.time()
        success, output = run_command(
            test['command'],
            test['name'],
            test['timeout']
        )
        end_time = time.time()

        result = {
            'test_name': test['name'],
            'command': test['command'],
            'priority': test['priority'],
            'success': success,
            'duration': end_time - start_time,
            'output_preview': output[:200] if output else "",
            'timestamp': datetime.now().isoformat()
        }
        results.append(result)

        # Brief pause between tests
        time.sleep(5)
    # Generate summary report
    print("\n📊 COMPREHENSIVE TEST SUMMARY")
    print("=" * 70)

    total_tests = len(test_plan)
    successful_tests = len([r for r in results if r['success']])
    failed_tests = total_tests - successful_tests

    print(f"Total Tests: {total_tests}")
    print(f"Successful: {successful_tests} ({successful_tests/total_tests*100:.1f}%)")
    print(f"Failed: {failed_tests} ({failed_tests/total_tests*100:.1f}%)")

    print("\n📋 DETAILED RESULTS:")
    for result in results:
        status = "✅" if result['success'] else "❌"
        duration = result['duration']
        print(f"  {status} {result['test_name']}: {duration:.1f}s ({result['priority']} priority)")
    # Save comprehensive results
    results_file = f"comprehensive_test_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            'metadata': {
                'timestamp': timestamp,
                'total_tests': total_tests,
                'successful_tests': successful_tests,
                'failed_tests': failed_tests,
                'success_rate': successful_tests/total_tests*100
            },
            'test_results': results
        }, f, indent=2)

    print(f"\n💾 Results saved to: {results_file}")
    # Generate action items based on results
    print("\n📝 NEXT STEPS:")
    high_priority_failures = [r for r in results if not r['success'] and r['priority'] == 'HIGH']
    if high_priority_failures:
        print("🔴 HIGH PRIORITY FIXES NEEDED:")
        for failure in high_priority_failures:
            print(f"  - Fix {failure['test_name']}")
            print(f"    Command: {failure['command']}")

    medium_priority_failures = [r for r in results if not r['success'] and r['priority'] == 'MEDIUM']
    if medium_priority_failures:
        print("🟡 MEDIUM PRIORITY IMPROVEMENTS:")
        for failure in medium_priority_failures:
            print(f"  - Optimize {failure['test_name']}")

    if successful_tests == total_tests:
        print("🎉 ALL TESTS PASSED! Ready for production use.")
        print("💡 Consider running specific error analysis on individual results files")
    # Find the most recent results files for analysis
    log_files = list(Path("logs").glob("classification_test_*.log"))
    if log_files:
        latest_log = max(log_files, key=lambda x: x.stat().st_mtime)
        print(f"📄 Latest log file: {latest_log}")

    result_files = list(Path(".").glob("gaia_classification_test_results_*.json"))
    if result_files:
        latest_results = max(result_files, key=lambda x: x.stat().st_mtime)
        print(f"📊 Latest results: {latest_results}")
        print(f"🔍 Analyze with: python tests/analyze_test_results.py {latest_results}")

    print("\n✅ COMPREHENSIVE TESTING COMPLETE!")
    print(f"Total Duration: {sum(r['duration'] for r in results):.1f}s")


if __name__ == "__main__":
    main()
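
# Example follow-up: loading the saved summary for ad-hoc inspection
# (the file name below is hypothetical; each run embeds its own timestamp):
#
#   import json
#   with open("comprehensive_test_results_20250101_120000.json") as f:
#       data = json.load(f)
#   print(data["metadata"]["success_rate"])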