File size: 7,461 Bytes
c262d1a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python3
"""
Validate all GAIA questions with our multi-agent system
"""

import json
import time
from typing import Dict, List
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier

def solve_all_questions_with_validation() -> List[Dict]:
    """Classify and solve every loaded GAIA question, collecting one record each.

    For each question exposed by ``GAIAQuestionLoaderWeb().questions`` the
    question is first classified, then handed to the solver. Progress is
    printed as it goes.

    Returns:
        A list with one dict per question. Successful records have
        ``status == 'completed'`` and carry ``classification``, ``routing``,
        ``answer`` and ``solve_time``. Failed records have
        ``status == 'error'``, an ``answer`` of the form ``"Error: ..."``,
        ``solve_time`` of 0 and ``classification`` set to this question's
        classification if it was obtained before the failure, else ``None``.
    """

    print("πŸ§ͺ COMPREHENSIVE GAIA VALIDATION - ALL 20 QUESTIONS")
    print("=" * 70)

    # Initialize components
    print("πŸš€ Initializing multi-agent system...")
    loader = GAIAQuestionLoaderWeb()
    classifier = QuestionClassifier()
    solver = GAIASolver()

    questions = loader.questions
    total = len(questions)
    results = []

    print(f"πŸ“š Found {total} questions to solve")

    for i, question_data in enumerate(questions, 1):
        task_id = question_data.get('task_id', 'unknown')
        question_text = question_data.get('question', '')
        file_name = question_data.get('file_name', '')

        # Reset per question: if classification fails, the error record must
        # not inherit the classification left over from the previous question.
        classification = None

        print(f"\n{'='*60}")
        # Use the real question count rather than a hard-coded 20.
        print(f"QUESTION {i}/{total}: {task_id[:8]}...")
        print(f"{'='*60}")

        try:
            # Classification phase
            print(f"🧠 CLASSIFICATION:")
            classification = classifier.classify_question(question_text, file_name)
            routing = classifier.get_routing_recommendation(classification)

            print(f"  Primary Agent: {classification['primary_agent']}")
            print(f"  Secondary: {classification.get('secondary_agents', [])}")
            print(f"  Complexity: {classification['complexity']}/5")
            print(f"  Confidence: {classification['confidence']:.3f}")

            # Solving phase
            print(f"\nπŸ€– SOLVING:")
            print(f"  Question: {question_text[:100]}...")
            if file_name:
                print(f"  File: {file_name}")

            start_time = time.time()
            answer = solver.solve_question(question_data)
            solve_time = time.time() - start_time

            print(f"  βœ… Answer: {answer[:100]}...")
            print(f"  ⏱️ Time: {solve_time:.1f}s")

            # Store results
            results.append({
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': {
                    'primary_agent': classification['primary_agent'],
                    'secondary_agents': classification.get('secondary_agents', []),
                    'complexity': classification['complexity'],
                    'confidence': classification['confidence'],
                    'tools_needed': classification.get('tools_needed', [])
                },
                'routing': {
                    'coordination_needed': routing['requires_coordination'],
                    'duration_estimate': routing['estimated_duration']
                },
                'answer': answer,
                'solve_time': solve_time,
                'status': 'completed'
            })

        except Exception as e:
            # Deliberate best-effort boundary: one failing question must not
            # abort the whole validation run, so record it and move on.
            print(f"  ❌ Error: {e}")

            # Store error result
            results.append({
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': classification,
                'answer': f"Error: {str(e)}",
                'solve_time': 0,
                'status': 'error'
            })

        # Small delay to avoid overwhelming APIs
        time.sleep(1)

    return results

def analyze_results(results: List[Dict]):
    """Print a summary report for a list of per-question result records.

    The report has three parts: overall statistics (counts, percentages and
    average solve time of completed questions), classification statistics
    (primary-agent and complexity distributions, average confidence), and a
    one-line breakdown per question.

    Args:
        results: Records with at least ``status``, ``question_id`` and
            ``answer``; completed records also need ``solve_time``.
            ``classification`` may be a dict with ``primary_agent``,
            ``complexity`` and ``confidence``, or ``None``/absent.

    Returns:
        None. Output goes to stdout only.
    """

    total_questions = len(results)

    def _percent(count):
        # An empty result list must not crash the report with a
        # ZeroDivisionError; report 0.0% instead.
        return count / total_questions * 100 if total_questions else 0.0

    print(f"\nπŸ“Š COMPREHENSIVE RESULTS ANALYSIS")
    print("=" * 70)

    completed_results = [r for r in results if r['status'] == 'completed']
    completed = len(completed_results)
    errors = sum(1 for r in results if r['status'] == 'error')

    print(f"πŸ“ˆ OVERALL STATISTICS:")
    print(f"  Total Questions: {total_questions}")
    print(f"  Successfully Solved: {completed} ({_percent(completed):.1f}%)")
    print(f"  Errors: {errors} ({_percent(errors):.1f}%)")

    if completed_results:
        avg_time = sum(r['solve_time'] for r in completed_results) / completed
        print(f"  Average Solve Time: {avg_time:.1f}s")

    # Classification analysis
    print(f"\n🎯 CLASSIFICATION ANALYSIS:")
    agent_counts = {}
    complexity_counts = {}
    confidence_scores = []

    for result in results:
        classification = result.get('classification')
        if not classification:
            # Error records may carry no classification at all.
            continue

        primary = classification['primary_agent']
        agent_counts[primary] = agent_counts.get(primary, 0) + 1

        complexity = classification['complexity']
        complexity_counts[complexity] = complexity_counts.get(complexity, 0) + 1

        confidence_scores.append(classification['confidence'])

    # Percentages below are relative to all questions, not only classified ones.
    print(f"  Agent Distribution:")
    for agent, count in sorted(agent_counts.items()):
        print(f"    {agent}: {count} questions ({_percent(count):.1f}%)")

    print(f"  Complexity Distribution:")
    for complexity, count in sorted(complexity_counts.items()):
        print(f"    Level {complexity}: {count} questions ({_percent(count):.1f}%)")

    if confidence_scores:
        avg_confidence = sum(confidence_scores) / len(confidence_scores)
        print(f"  Average Classification Confidence: {avg_confidence:.3f}")

    # Question type analysis
    print(f"\nπŸ“ QUESTION BREAKDOWN:")
    for i, result in enumerate(results, 1):
        status_emoji = "βœ…" if result['status'] == 'completed' else "❌"
        task_id = result['question_id'][:8]
        classification = result.get('classification')
        primary_agent = classification['primary_agent'] if classification else 'unknown'

        # Coerce to text so a non-string answer cannot break the preview.
        answer_text = str(result['answer'])
        if len(answer_text) > 50:
            answer_preview = answer_text[:50] + "..."
        else:
            answer_preview = answer_text

        print(f"  {i:2d}. {status_emoji} {task_id}... [{primary_agent}] {answer_preview}")

def save_results(results: List[Dict], output_file: str = "gaia_validation_results.json"):
    """Write the result records to a JSON file for further analysis.

    Args:
        results: JSON-serializable list of per-question result records.
        output_file: Destination path. Defaults to
            ``gaia_validation_results.json`` in the current directory.

    Returns:
        None. The file is written and a confirmation is printed.
    """

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly; the platform default encoding may not be
    # able to represent them and would raise UnicodeEncodeError.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\nπŸ’Ύ Results saved to: {output_file}")
    print(f"πŸ“‹ Use this file to compare with official GAIA answers")

def main():
    """Run the full validation pipeline: solve, report, then persist."""

    for banner in (
        "🎯 Starting comprehensive GAIA validation...",
        "⚠️  This will take several minutes to complete all 20 questions",
    ):
        print(banner)

    # Solve every question, then feed the same records to the report and
    # to the JSON dump, in that order.
    outcome = solve_all_questions_with_validation()
    analyze_results(outcome)
    save_results(outcome)

    closing_lines = (
        "\nβœ… VALIDATION COMPLETE!",
        "πŸ“Š Check gaia_validation_results.json for detailed results",
        "πŸ” Compare answers with official GAIA dataset when available",
    )
    for line in closing_lines:
        print(line)

# Run the validation workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()