#!/usr/bin/env python3
"""
Validate all GAIA questions with our multi-agent system
"""

import json
import time
from collections import Counter
from typing import Dict, List

from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier

# File the collected results are written to unless a caller overrides it.
DEFAULT_OUTPUT_FILE = "gaia_validation_results.json"


def solve_all_questions_with_validation() -> List[Dict]:
    """Classify and solve every loaded GAIA question, one result record each.

    For each question the classifier is asked for a classification and a
    routing recommendation, then the solver is asked for an answer while the
    wall-clock time is measured. Any exception raised while handling a
    question is caught and recorded as an ``'error'`` record so that one
    failing question does not abort the whole run.

    Returns:
        A list with one dict per question. Completed records carry
        ``question_id``, ``question``, ``file_name``, ``classification``,
        ``routing``, ``answer``, ``solve_time`` and ``status='completed'``.
        Error records carry ``status='error'``, ``solve_time=0``, an
        ``answer`` of the form ``"Error: ..."`` and whatever classification
        was obtained for *that* question (``None`` if classification itself
        failed).
    """
    print("🧪 COMPREHENSIVE GAIA VALIDATION - ALL 20 QUESTIONS")
    print("=" * 70)

    # Initialize components
    print("🚀 Initializing multi-agent system...")
    loader = GAIAQuestionLoaderWeb()
    classifier = QuestionClassifier()
    solver = GAIASolver()

    questions = loader.questions
    total = len(questions)
    results = []

    print(f"📚 Found {total} questions to solve")

    for i, question_data in enumerate(questions, 1):
        task_id = question_data.get('task_id', 'unknown')
        question_text = question_data.get('question', '')
        file_name = question_data.get('file_name', '')

        # Reset per iteration: otherwise a failure during classification
        # would attach the previous question's classification to this
        # question's error record.
        classification = None

        print(f"\n{'='*60}")
        # Use the real question count rather than a hard-coded 20.
        print(f"QUESTION {i}/{total}: {task_id[:8]}...")
        print(f"{'='*60}")

        try:
            # Classification phase
            print("🧠 CLASSIFICATION:")
            classification = classifier.classify_question(question_text, file_name)
            routing = classifier.get_routing_recommendation(classification)

            print(f" Primary Agent: {classification['primary_agent']}")
            print(f" Secondary: {classification.get('secondary_agents', [])}")
            print(f" Complexity: {classification['complexity']}/5")
            print(f" Confidence: {classification['confidence']:.3f}")

            # Solving phase
            print("\n🤖 SOLVING:")
            print(f" Question: {question_text[:100]}...")
            if file_name:
                print(f" File: {file_name}")

            start_time = time.time()
            answer = solver.solve_question(question_data)
            solve_time = time.time() - start_time

            print(f" ✅ Answer: {answer[:100]}...")
            print(f" ⏱️ Time: {solve_time:.1f}s")

            # Store results
            results.append({
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': {
                    'primary_agent': classification['primary_agent'],
                    'secondary_agents': classification.get('secondary_agents', []),
                    'complexity': classification['complexity'],
                    'confidence': classification['confidence'],
                    'tools_needed': classification.get('tools_needed', []),
                },
                'routing': {
                    'coordination_needed': routing['requires_coordination'],
                    'duration_estimate': routing['estimated_duration'],
                },
                'answer': answer,
                'solve_time': solve_time,
                'status': 'completed',
            })

        except Exception as e:
            # Deliberate best-effort boundary: record the failure and move on.
            print(f" ❌ Error: {e}")

            # Store error result
            results.append({
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': classification,
                'answer': f"Error: {str(e)}",
                'solve_time': 0,
                'status': 'error',
            })

        # Small delay to avoid overwhelming APIs
        time.sleep(1)

    return results


def analyze_results(results: List[Dict]) -> None:
    """Print summary statistics for a list of result records.

    Reports completed/error counts and percentages, the average solve time of
    completed records, the distribution of primary agents and complexity
    levels, the average classification confidence, and a one-line preview per
    question.

    Args:
        results: Records shaped like those produced by
            ``solve_all_questions_with_validation``. An empty list is
            reported as such instead of dividing by zero. Records whose
            ``classification`` is missing, ``None`` or incomplete are
            tolerated.
    """
    print("\n📊 COMPREHENSIVE RESULTS ANALYSIS")
    print("=" * 70)

    total_questions = len(results)
    if total_questions == 0:
        # Every percentage below divides by the total; nothing to report.
        print("No results to analyze")
        return

    completed_results = [r for r in results if r['status'] == 'completed']
    completed = len(completed_results)
    errors = sum(1 for r in results if r['status'] == 'error')

    print("📈 OVERALL STATISTICS:")
    print(f" Total Questions: {total_questions}")
    print(f" Successfully Solved: {completed} ({completed/total_questions*100:.1f}%)")
    print(f" Errors: {errors} ({errors/total_questions*100:.1f}%)")

    if completed_results:
        avg_time = sum(r['solve_time'] for r in completed_results) / completed
        print(f" Average Solve Time: {avg_time:.1f}s")

    # Classification analysis
    print("\n🎯 CLASSIFICATION ANALYSIS:")
    # Error records may hold None or a partial classifier dict, so only the
    # keys that are actually present are counted.
    classified = [r['classification'] for r in results if r.get('classification')]
    agent_counts = Counter(c.get('primary_agent', 'unknown') for c in classified)
    complexity_counts = Counter(c['complexity'] for c in classified if 'complexity' in c)
    confidence_scores = [c['confidence'] for c in classified if 'confidence' in c]

    print(" Agent Distribution:")
    for agent, count in sorted(agent_counts.items()):
        percentage = (count / total_questions) * 100
        print(f" {agent}: {count} questions ({percentage:.1f}%)")

    print(" Complexity Distribution:")
    for complexity, count in sorted(complexity_counts.items()):
        percentage = (count / total_questions) * 100
        print(f" Level {complexity}: {count} questions ({percentage:.1f}%)")

    if confidence_scores:
        avg_confidence = sum(confidence_scores) / len(confidence_scores)
        print(f" Average Classification Confidence: {avg_confidence:.3f}")

    # Question type analysis
    print("\n📝 QUESTION BREAKDOWN:")
    for i, result in enumerate(results, 1):
        status_emoji = "✅" if result['status'] == 'completed' else "❌"
        task_id = result['question_id'][:8]
        classification = result.get('classification') or {}
        primary_agent = classification.get('primary_agent', 'unknown')
        # str() keeps the preview working if an answer is not a string.
        answer_text = str(result['answer'])
        answer_preview = answer_text[:50] + "..." if len(answer_text) > 50 else answer_text
        print(f" {i:2d}. {status_emoji} {task_id}... [{primary_agent}] {answer_preview}")


def save_results(results: List[Dict], output_file: str = DEFAULT_OUTPUT_FILE) -> None:
    """Save results to a JSON file for further analysis.

    Args:
        results: Result records to serialize.
        output_file: Destination path; defaults to
            ``gaia_validation_results.json`` in the working directory.
    """
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # must be opened as UTF-8 explicitly rather than with the locale default.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Results saved to: {output_file}")
    print("📋 Use this file to compare with official GAIA answers")


def main() -> None:
    """Main validation workflow: solve, analyze, then save the results."""
    print("🎯 Starting comprehensive GAIA validation...")
    print("⚠️ This will take several minutes to complete all 20 questions")

    # Solve all questions
    results = solve_all_questions_with_validation()

    # Analyze results
    analyze_results(results)

    # Save for comparison
    save_results(results)

    print("\n✅ VALIDATION COMPLETE!")
    print(f"📊 Check {DEFAULT_OUTPUT_FILE} for detailed results")
    print("🔍 Compare answers with official GAIA dataset when available")


if __name__ == "__main__":
    main()