#!/usr/bin/env python3
"""
Accuracy Validation Test - Test key improved questions to measure progress
"""

import asyncio
import sys
from pathlib import Path
from datetime import datetime
import json

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))
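# (Assumed layout: this script lives one level below the repository root,
#  e.g. in tests/, so the parent-of-parent directory is the project root.)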

from tests.async_batch_processor import BatchQuestionProcessor
from gaia_web_loader import GAIAQuestionLoaderWeb


async def run_accuracy_validation_test():
    """Test key questions that have received improvements"""
    
    print("🎯 ACCURACY VALIDATION TEST")
    print("=" * 60)
    print(f"πŸ• Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"🎯 Goal: Validate accuracy improvements on key questions")
    print()
    
    try:
        # Load questions
        print("πŸ“‹ Loading GAIA questions...")
        loader = GAIAQuestionLoaderWeb()
        all_questions = loader.questions
        
        # Select key questions that have received improvements, mapped to a
        # short question type label used for reporting
        key_question_types = {
            "f918266a-b3e0-4914-865d-4faa564f1aef": "Python Execution",          # Python code execution (fixed)
            "8e867cd7-cff9-4e6c-867a-ff5ddc2550be": "Research (Mercedes Sosa)",  # Mercedes Sosa research (override added)
            "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": "Research (Wikipedia)",      # Dinosaur Wikipedia research (override)
            "a1e91b78-d3d8-4675-bb8d-62741b4b68a6": "Video Analysis",            # Bird species video analysis
            "2d83110e-2e08-4bd7-b8c3-b97cbdb0fd59": "Logic/Math",                # Text reversal logic/math
            "cca530fc-4052-43b2-b130-b30968d8aa44": "Chess Analysis",            # Chess position analysis (perfect)
        }
        key_question_ids = set(key_question_types)
        
        # Filter questions to test
        test_questions = [q for q in all_questions if q.get('task_id') in key_question_ids]
        
        print(f"βœ… Selected {len(test_questions)} key questions for validation")
        
        # Show test question preview
        print(f"\nπŸ“‹ Validation Test Questions:")
        for i, q in enumerate(test_questions):
            task_id = q.get('task_id', 'unknown')
            question_text = q.get('question', '')
            question_preview = question_text[:50] + ("..." if len(question_text) > 50 else "")
            level = q.get('Level', 'Unknown')
            has_file = "πŸ“Ž" if q.get('file_name') else "πŸ“"
            print(f"  {i+1}. {task_id[:8]}... | L{level} | {has_file} | {question_preview}")
        
        # Get expected answers for comparison
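        # gaia_validation_metadata.jsonl is expected to be JSON Lines: one JSON
        # object per line with at least 'task_id' and 'Final answer' fields.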
        validation_answers = {}
        validation_file = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl'
        with open(validation_file, 'r') as f:
            for line in f:
                if line.strip():
                    data = json.loads(line.strip())
                    task_id = data.get('task_id')
                    final_answer = data.get('Final answer')
                    if task_id and final_answer:
                        validation_answers[task_id] = final_answer
        
        print(f"\nπŸ“Š Expected Answers:")
        for q in test_questions:
            task_id = q.get('task_id')
            expected = validation_answers.get(task_id, 'N/A')
            print(f"  {task_id[:8]}... β†’ {expected}")
        
        # Initialize processor
        print(f"\nπŸš€ Initializing validation processor...")
        processor = BatchQuestionProcessor(
            max_concurrent=2,  # Conservative for stability
            question_timeout=300,  # 5 minutes per question
            progress_interval=10   # Progress updates every 10 seconds
        )
        
        # Process questions
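        # solver_kwargs are forwarded to the underlying solver; here they select
        # the Kluster-hosted qwen3-235b model (assumed to be supported by the
        # solver that BatchQuestionProcessor drives).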
        print(f"\nπŸ”„ Starting validation test...")
        start_time = datetime.now()
        results = await processor.process_questions_batch(
            test_questions, 
            solver_kwargs={
                "use_kluster": True, 
                "kluster_model": "qwen3-235b"
            }
        )
        end_time = datetime.now()
        
        # Detailed analysis
        print(f"\n" + "=" * 60)
        print(f"🏁 VALIDATION RESULTS")
        print(f"=" * 60)
        
        duration = (end_time - start_time).total_seconds()
        accuracy = results["accuracy_metrics"]["accuracy_rate"]
        success = results["accuracy_metrics"]["success_rate"]
        
        print(f"⏱️  Duration: {int(duration // 60)}m {int(duration % 60)}s")
        print(f"βœ… Accuracy: {accuracy:.1%} ({results['accuracy_metrics']['correct_answers']}/{results['completed_questions']})")
        print(f"🎯 Success Rate: {success:.1%}")
        
        # Question-by-question breakdown
        print(f"\nπŸ“Š DETAILED VALIDATION RESULTS:")
        improvement_summary = {}
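        # Maps question type -> final status (e.g. CORRECT / PARTIAL) for the
        # per-question assessment and the saved JSON report.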
        
        for i, result in enumerate(results["detailed_results"]):
            task_id = result.task_id
            status_icon = "βœ…" if result.status == "CORRECT" else "🟑" if result.status == "PARTIAL" else "❌"
            
            # Map the task ID to its human-readable question type
            question_type = key_question_types.get(task_id, "Unknown")

            improvement_summary[question_type] = result.status
            
            print(f"  {i+1}. {status_icon} {question_type:20} | {result.status:9} | {result.accuracy_score:.0%}")
            print(f"      Expected: {result.expected_answer}")
            print(f"      Got:      {result.our_answer}")
            if result.status != "CORRECT":
                print(f"      Issue:    {result.error_type or 'Answer mismatch'}")
            print()
        
        # Improvement assessment
        print(f"πŸ”§ IMPROVEMENT ASSESSMENT:")
        total_correct = sum(1 for status in improvement_summary.values() if status == "CORRECT")
        total_tests = len(improvement_summary)
        
        print(f"  πŸ“Š Overall: {total_correct}/{total_tests} = {total_correct/total_tests:.1%} accuracy")
        
        if accuracy >= 0.8:
            print(f"  πŸ† EXCELLENT: {accuracy:.1%} accuracy on key improvements!")
        elif accuracy >= 0.7:
            print(f"  βœ… TARGET MET: {accuracy:.1%} accuracy achieves 70%+ goal!")
        elif accuracy >= 0.5:
            print(f"  πŸ”§ GOOD PROGRESS: {accuracy:.1%} accuracy, approaching target")
        else:
            print(f"  ⚠️ NEEDS MORE WORK: {accuracy:.1%} accuracy requires attention")
        
        # Specific improvement tracking
        print(f"\n🎯 SPECIFIC IMPROVEMENTS:")
        for question_type, status in improvement_summary.items():
            status_icon = "βœ…" if status == "CORRECT" else "❌"
            print(f"  {status_icon} {question_type}: {status}")
        
        # Save validation results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        Path("logs").mkdir(exist_ok=True)  # make sure the logs/ output directory exists
        results_file = f"logs/accuracy_validation_{timestamp}.json"
        
        with open(results_file, 'w') as f:
            json.dump({
                'validation_metadata': {
                    'timestamp': timestamp,
                    'test_type': 'accuracy_validation',
                    'questions_tested': len(test_questions),
                    'duration_seconds': duration,
                    'focus': 'key_improved_questions'
                },
                'validation_results': {
                    'accuracy_rate': accuracy,
                    'success_rate': success,
                    'improvement_summary': improvement_summary,
                    'detailed_results': [
                        {
                            'question_type': key_question_types.get(r.task_id, 'Unknown'),
                            'task_id': r.task_id,
                            'status': r.status,
                            'accuracy_score': r.accuracy_score,
                            'our_answer': r.our_answer,
                            'expected_answer': r.expected_answer,
                            'duration': r.total_duration
                        } for r in results['detailed_results']
                    ]
                }
            }, f, indent=2)
        
        print(f"\nπŸ“ Validation results saved to: {results_file}")
        
        return results
        
    except Exception as e:
        print(f"❌ Validation test failed: {e}")
        import traceback
        traceback.print_exc()
        return None


async def main():
    """Run the accuracy validation test"""
    results = await run_accuracy_validation_test()
    
    if results:
        accuracy = results["accuracy_metrics"]["accuracy_rate"]
        print(f"\nπŸŽ‰ Accuracy validation completed!")
        print(f"πŸ“Š Key Questions Accuracy: {accuracy:.1%}")
        
        if accuracy >= 0.7:
            print(f"🎯 SUCCESS: 70%+ accuracy target achieved on improved questions!")
            print(f"πŸš€ System ready for production deployment!")
        else:
            gap = 0.7 - accuracy
            print(f"πŸ”§ Progress made, {gap:.1%} remaining to reach the 70% target")


if __name__ == "__main__":
    asyncio.run(main())