Final_Assignment / tests /validate_all_questions.py
GAIA Developer
🧪 Add comprehensive test infrastructure and async testing system
c262d1a
#!/usr/bin/env python3
"""
Validate all GAIA questions with our multi-agent system
"""
import json
import time
from typing import Dict, List
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
def solve_all_questions_with_validation():
    """Classify and solve every loaded GAIA question, collecting the outcomes.

    For each question exposed by ``GAIAQuestionLoaderWeb().questions`` the
    question is classified, routed, and handed to ``GAIASolver.solve_question``.
    Progress is printed as it goes.

    Returns:
        A list with one dict per question. Successful entries carry
        ``status == 'completed'`` together with a trimmed ``classification``
        dict, a ``routing`` dict, the ``answer`` and the wall-clock
        ``solve_time``. Failed entries carry ``status == 'error'``, an
        ``answer`` of the form ``"Error: ..."``, ``solve_time == 0`` and the
        raw classification for *that* question if it was obtained before the
        failure (otherwise ``None``).
    """
    print("πŸ§ͺ COMPREHENSIVE GAIA VALIDATION - ALL 20 QUESTIONS")
    print("=" * 70)

    # Initialize components
    print("πŸš€ Initializing multi-agent system...")
    loader = GAIAQuestionLoaderWeb()
    classifier = QuestionClassifier()
    solver = GAIASolver()

    questions = loader.questions
    total = len(questions)
    results = []
    print(f"πŸ“š Found {total} questions to solve")

    for i, question_data in enumerate(questions, 1):
        task_id = question_data.get('task_id', 'unknown')
        question_text = question_data.get('question', '')
        file_name = question_data.get('file_name', '')

        # Reset per iteration: function locals survive across loop passes, so
        # without this a failure here would be reported with the previous
        # question's classification.
        classification = None

        print(f"\n{'='*60}")
        # Report the real number of loaded questions rather than assuming 20.
        print(f"QUESTION {i}/{total}: {task_id[:8]}...")
        print(f"{'='*60}")

        try:
            # Classification phase
            print(f"🧠 CLASSIFICATION:")
            classification = classifier.classify_question(question_text, file_name)
            routing = classifier.get_routing_recommendation(classification)
            print(f" Primary Agent: {classification['primary_agent']}")
            print(f" Secondary: {classification.get('secondary_agents', [])}")
            print(f" Complexity: {classification['complexity']}/5")
            print(f" Confidence: {classification['confidence']:.3f}")

            # Solving phase
            print(f"\nπŸ€– SOLVING:")
            print(f" Question: {question_text[:100]}...")
            if file_name:
                print(f" File: {file_name}")
            start_time = time.time()
            answer = solver.solve_question(question_data)
            solve_time = time.time() - start_time
            print(f" βœ… Answer: {answer[:100]}...")
            print(f" ⏱️ Time: {solve_time:.1f}s")

            # Store results
            result = {
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': {
                    'primary_agent': classification['primary_agent'],
                    'secondary_agents': classification.get('secondary_agents', []),
                    'complexity': classification['complexity'],
                    'confidence': classification['confidence'],
                    'tools_needed': classification.get('tools_needed', [])
                },
                'routing': {
                    'coordination_needed': routing['requires_coordination'],
                    'duration_estimate': routing['estimated_duration']
                },
                'answer': answer,
                'solve_time': solve_time,
                'status': 'completed'
            }
            results.append(result)
        except Exception as e:
            # Deliberate best-effort boundary: one failing question must not
            # abort the whole validation run.
            print(f" ❌ Error: {e}")
            # Store error result
            error_result = {
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': classification,
                'answer': f"Error: {str(e)}",
                'solve_time': 0,
                'status': 'error'
            }
            results.append(error_result)

        # Small delay to avoid overwhelming APIs
        time.sleep(1)

    return results
def analyze_results(results: List[Dict]):
    """Print summary statistics for a list of per-question result dicts.

    Args:
        results: Result dicts as produced by the solving step. Each entry is
            read for ``status``, ``question_id``, ``answer`` and
            ``classification`` (a dict with ``primary_agent``, ``complexity``
            and ``confidence``, or a falsy value when unavailable). Completed
            entries are also read for ``solve_time``.

    Returns:
        None. All output goes to stdout. An empty ``results`` list is
        reported as zero questions / 0.0% instead of raising.
    """
    print(f"\nπŸ“Š COMPREHENSIVE RESULTS ANALYSIS")
    print("=" * 70)

    total_questions = len(results)
    completed_results = [r for r in results if r['status'] == 'completed']
    completed = len(completed_results)
    errors = sum(1 for r in results if r['status'] == 'error')

    def _percent(count):
        # Guard the empty-input case: no questions means 0%, not a crash.
        return count / total_questions * 100 if total_questions else 0.0

    print(f"πŸ“ˆ OVERALL STATISTICS:")
    print(f" Total Questions: {total_questions}")
    print(f" Successfully Solved: {completed} ({_percent(completed):.1f}%)")
    print(f" Errors: {errors} ({_percent(errors):.1f}%)")

    if completed_results:
        avg_time = sum(r['solve_time'] for r in completed_results) / completed
        print(f" Average Solve Time: {avg_time:.1f}s")

    # Classification analysis
    print(f"\n🎯 CLASSIFICATION ANALYSIS:")
    agent_counts = {}
    complexity_counts = {}
    confidence_scores = []
    for result in results:
        classification = result['classification']
        # Error entries may carry no classification; skip them here.
        if classification:
            primary = classification['primary_agent']
            agent_counts[primary] = agent_counts.get(primary, 0) + 1
            complexity = classification['complexity']
            complexity_counts[complexity] = complexity_counts.get(complexity, 0) + 1
            confidence_scores.append(classification['confidence'])

    # Percentages below are relative to all questions, not only classified ones.
    print(f" Agent Distribution:")
    for agent, count in sorted(agent_counts.items()):
        print(f" {agent}: {count} questions ({_percent(count):.1f}%)")

    print(f" Complexity Distribution:")
    for complexity, count in sorted(complexity_counts.items()):
        print(f" Level {complexity}: {count} questions ({_percent(count):.1f}%)")

    if confidence_scores:
        avg_confidence = sum(confidence_scores) / len(confidence_scores)
        print(f" Average Classification Confidence: {avg_confidence:.3f}")

    # Question type analysis
    print(f"\nπŸ“ QUESTION BREAKDOWN:")
    for i, result in enumerate(results, 1):
        status_emoji = "βœ…" if result['status'] == 'completed' else "❌"
        task_id = result['question_id'][:8]
        primary_agent = result['classification']['primary_agent'] if result['classification'] else 'unknown'
        answer = result['answer']
        # Truncate long answers to a 50-character preview.
        answer_preview = answer[:50] + "..." if len(answer) > 50 else answer
        print(f" {i:2d}. {status_emoji} {task_id}... [{primary_agent}] {answer_preview}")
def save_results(results: List[Dict], output_file: str = "gaia_validation_results.json"):
    """Write the results to a JSON file for further analysis.

    Args:
        results: JSON-serialisable list of per-question result dicts.
        output_file: Destination path. Defaults to
            ``gaia_validation_results.json`` in the current working directory.

    Returns:
        None. The file is written and two confirmation lines are printed.
    """
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly; the platform default encoding may be
    # unable to represent them and would raise UnicodeEncodeError.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\nπŸ’Ύ Results saved to: {output_file}")
    print(f"πŸ“‹ Use this file to compare with official GAIA answers")
def main():
    """Run the validation end to end: solve, analyze, then persist results."""
    for banner in (
        "🎯 Starting comprehensive GAIA validation...",
        "⚠️ This will take several minutes to complete all 20 questions",
    ):
        print(banner)

    # Solve first; the same result list feeds both the report and the file.
    outcomes = solve_all_questions_with_validation()
    analyze_results(outcomes)
    save_results(outcomes)

    for closing_line in (
        "\nβœ… VALIDATION COMPLETE!",
        "πŸ“Š Check gaia_validation_results.json for detailed results",
        "πŸ” Compare answers with official GAIA dataset when available",
    ):
        print(closing_line)


# Only run the workflow when executed as a script, not on import.
if __name__ == "__main__":
    main()