#!/usr/bin/env python3
"""
Logged Clean Test - Test all questions with proper logging and no overrides
"""

import os
import sys
import json
import time
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

# Local imports
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
from tests.test_logging_utils import test_logger


def load_validation_answers():
    """Load correct answers from GAIA validation metadata"""
    answers = {}
    try:
        validation_path = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl'
        with open(validation_path, 'r') as f:
            for line in f:
                if line.strip():
                    data = json.loads(line.strip())
                    task_id = data.get('task_id')
                    final_answer = data.get('Final answer')
                    if task_id and final_answer:
                        answers[task_id] = final_answer
    except Exception as e:
        print(f"⚠️ Could not load validation data: {e}")
    return answers


def validate_answer(task_id: str, our_answer: str, validation_answers: dict):
    """Validate our answer against the correct answer"""
    if task_id not in validation_answers:
        return None
    
    expected = str(validation_answers[task_id]).strip()
    our_clean = str(our_answer).strip()
    
    # Exact match
    if our_clean.lower() == expected.lower():
        return {"status": "CORRECT", "expected": expected, "our": our_clean}
    
    # Check if our answer contains the expected answer
    if expected.lower() in our_clean.lower():
        return {"status": "PARTIAL", "expected": expected, "our": our_clean}
    
    return {"status": "INCORRECT", "expected": expected, "our": our_clean}


def test_single_question(question_data, validation_answers, model="qwen3-235b"):
    """Test a single question without any overrides - WITH LOGGING"""
    task_id = question_data.get('task_id', 'unknown')
    
    # Use the same logging approach as test_specific_question.py
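    # Note: test_logger (from tests.test_logging_utils) is assumed to capture the
    # console output below into a per-question log file named with the
    # "clean_batch_question" prefix (see the log path printed at the end of the run).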
    with test_logger("clean_batch_question", task_id):
        try:
            print(f"πŸ§ͺ Testing question: {task_id}")
            print("=" * 60)
            
            # Initialize solver and classifier
            print(f"πŸš€ Initializing GAIA Solver with Kluster.ai {model}...")
            solver = GAIASolver(use_kluster=True, kluster_model=model)
            print("🧠 Initializing Question Classifier...")
            classifier = QuestionClassifier()
            
            # Display question details
            print(f"βœ… Found question!")
            print(f"πŸ“ Question: {question_data.get('question', 'N/A')}")
            print(f"🏷️  Level: {question_data.get('Level', 'Unknown')}")
            print(f"πŸ“Ž Has file: {'Yes' if question_data.get('file_name') else 'No'}")
            if question_data.get('file_name'):
                print(f"πŸ“„ File: {question_data.get('file_name')}")
            
            # Classify the question
            print(f"\n🧠 QUESTION CLASSIFICATION:")
            print("-" * 40)
            question_text = question_data.get('question', '')
            file_name = question_data.get('file_name', '')
            classification = classifier.classify_question(question_text, file_name)
            
            print(f"🎯 Primary Agent: {classification['primary_agent']}")
            if classification['secondary_agents']:
                print(f"🀝 Secondary Agents: {', '.join(classification['secondary_agents'])}")
            print(f"πŸ“Š Complexity: {classification['complexity']}/5")
            print(f"🎲 Confidence: {classification['confidence']:.3f}")
            print(f"πŸ”§ Tools Needed: {', '.join(classification['tools_needed'][:3])}")
            if len(classification['tools_needed']) > 3:
                print(f"     (+{len(classification['tools_needed'])-3} more tools)")
            print(f"πŸ’­ Reasoning: {classification['reasoning']}")
            
            # Solve the question (NO OVERRIDES - pure LLM reasoning)
            print(f"\nπŸ€– Solving question...")
            print(f"🎯 Question type: {classification['primary_agent']}")
            print(f"πŸ”„ Processing... (NO OVERRIDES - Pure LLM + Tools)")
            
            start_time = time.time()
            answer = solver.solve_question(question_data)
            end_time = time.time()
            
            duration = end_time - start_time
            print(f"βœ… Completed in {duration:.1f} seconds")
            
            # Validate answer
            print(f"\nπŸ” ANSWER VALIDATION:")
            print("-" * 40)
            validation_result = validate_answer(task_id, answer, validation_answers)
            
            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Our Answer: {validation_result['our']}")
                print(f"Status: {validation_result['status']}")
                if validation_result['status'] == 'CORRECT':
                    print(f"βœ… PERFECT MATCH!")
                elif validation_result['status'] == 'PARTIAL':
                    print(f"🟑 PARTIAL MATCH - contains correct answer")
                else:
                    print(f"❌ INCORRECT - answers don't match")
            else:
                print(f"⚠️ No validation data available for question {task_id}")
            
            print(f"\nπŸ“‹ FINAL RESULTS:")
            print("=" * 60)
            print(f"Task ID: {task_id}")
            print(f"Question Type: {classification['primary_agent']}")
            print(f"Classification Confidence: {classification['confidence']:.3f}")
            print(f"Our Answer: {answer}")
            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Validation Status: {validation_result['status']}")
            print(f"Duration: {duration:.1f}s")
            print(f"🚫 NO OVERRIDES APPLIED - Pure LLM reasoning")
            
            result = {
                'task_id': task_id,
                'question_type': classification['primary_agent'],
                'complexity': classification['complexity'],
                'confidence': classification['confidence'],
                'our_answer': str(answer),
                'expected_answer': validation_result['expected'] if validation_result else 'N/A',
                'status': validation_result['status'] if validation_result else 'NO_VALIDATION',
                'duration': duration,
                'question_preview': question_data.get('question', '')[:50] + "..."
            }
            
            status_icon = "βœ…" if result['status'] == "CORRECT" else "🟑" if result['status'] == "PARTIAL" else "❌"
            print(f"\n{status_icon} FINAL STATUS: {result['status']}")
            
            return result
            
        except Exception as e:
            print(f"❌ Error testing question: {e}")
            import traceback
            traceback.print_exc()
            
            return {
                'task_id': task_id,
                'question_type': 'error',
                'complexity': 0,
                'confidence': 0.0,
                'our_answer': '',
                'expected_answer': validation_answers.get(task_id, 'N/A'),
                'status': 'ERROR',
                'duration': 0.0,
                'error': str(e),
                'question_preview': question_data.get('question', '')[:50] + "..."
            }


def run_logged_clean_test():
    """Run logged clean test on all questions"""
    
    print("πŸ§ͺ LOGGED CLEAN TEST - NO OVERRIDES")
    print("=" * 60)
    print("🎯 Goal: Measure real accuracy with full logging")
    print("🚫 No hardcoded answers or overrides")
    print("πŸ€– Pure LLM + Tools reasoning only")
    print("πŸ“ Full detailed logs will be created")
    print()
    
    # Load questions and validation data
    print("πŸ“‹ Loading GAIA questions...")
    loader = GAIAQuestionLoaderWeb()
    all_questions = loader.questions
    validation_answers = load_validation_answers()
    
    print(f"βœ… Loaded {len(all_questions)} questions")
    print(f"βœ… Loaded {len(validation_answers)} validation answers")
    
    # Show question preview
    print(f"\nπŸ“‹ Questions to test:")
    for i, q in enumerate(all_questions[:3]):  # Show first 3
        task_id = q.get('task_id', 'unknown')
        question_preview = q.get('question', '')[:40] + "..."
        level = q.get('Level', 'Unknown')
        expected = validation_answers.get(task_id, 'N/A')
        has_file = "πŸ“Ž" if q.get('file_name') else "πŸ“"
        print(f"  {i+1}. {task_id[:8]}... | L{level} | {has_file} | Expected: {expected}")
        print(f"     {question_preview}")
    
    if len(all_questions) > 3:
        print(f"  ... and {len(all_questions) - 3} more questions")
    
    print(f"\nπŸš€ Starting logged clean test...")
    print(f"πŸ“ Each question will create a detailed log file")
    print(f"⏱️  Estimated time: ~{len(all_questions) * 2} minutes")
    
    # Process first 3 questions for demonstration (you can change this)
    test_questions = all_questions[:3]  # Test first 3 questions
    
    start_time = time.time()
    results = []
    
    for i, question_data in enumerate(test_questions):
        print(f"\n" + "="*80)
        print(f"πŸ“Š PROGRESS: {i+1}/{len(test_questions)}")
        print(f"πŸ”„ Processing question {question_data.get('task_id', 'unknown')[:8]}...")
        
        result = test_single_question(question_data, validation_answers)
        results.append(result)
        
        # Show progress
        completed = i + 1
        correct_so_far = len([r for r in results if r['status'] == 'CORRECT'])
        current_accuracy = correct_so_far / completed * 100
        print(f"πŸ“ˆ Current accuracy: {current_accuracy:.1f}% ({correct_so_far}/{completed})")
    
    end_time = time.time()
    total_duration = end_time - start_time
    
    # Final analysis
    print(f"\n" + "=" * 80)
    print(f"🏁 LOGGED CLEAN TEST RESULTS")
    print(f"=" * 80)
    
    # Calculate metrics
    total_questions = len(results)
    correct_answers = len([r for r in results if r['status'] == 'CORRECT'])
    partial_answers = len([r for r in results if r['status'] == 'PARTIAL'])
    incorrect_answers = len([r for r in results if r['status'] == 'INCORRECT'])
    errors = len([r for r in results if r['status'] == 'ERROR'])
    
    accuracy_rate = correct_answers / total_questions * 100
    success_rate = (correct_answers + partial_answers) / total_questions * 100
    
    print(f"⏱️  Total Duration: {int(total_duration // 60)}m {int(total_duration % 60)}s")
    print(f"βœ… **HONEST ACCURACY: {accuracy_rate:.1f}%** ({correct_answers}/{total_questions})")
    print(f"🎯 Success Rate: {success_rate:.1f}% (including partial)")
    print(f"⚑ Avg per Question: {total_duration/total_questions:.1f}s")
    
    print(f"\nπŸ“Š DETAILED BREAKDOWN:")
    print(f"  βœ… CORRECT: {correct_answers} ({correct_answers/total_questions:.1%})")
    print(f"  🟑 PARTIAL: {partial_answers} ({partial_answers/total_questions:.1%})")
    print(f"  ❌ INCORRECT: {incorrect_answers} ({incorrect_answers/total_questions:.1%})")
    print(f"  πŸ’₯ ERROR: {errors} ({errors/total_questions:.1%})")
    
    # Question-by-question results
    print(f"\nπŸ“‹ DETAILED QUESTION RESULTS:")
    for i, result in enumerate(results):
        status_icon = "βœ…" if result['status'] == "CORRECT" else "🟑" if result['status'] == "PARTIAL" else "❌"
        print(f"  {i+1}. {status_icon} {result['task_id'][:8]}... | {result['question_type']:12} | {result['status']:9} | {result['duration']:5.1f}s")
        print(f"      Expected: {result['expected_answer']}")
        print(f"      Got:      {result['our_answer']}")
        if 'error' in result:
            print(f"      Error:    {result['error']}")
    
    # Save results
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    os.makedirs("logs", exist_ok=True)  # ensure the logs/ directory exists before writing
    results_file = f"logs/logged_clean_test_{timestamp}.json"
    
    with open(results_file, 'w') as f:
        json.dump({
            'test_metadata': {
                'timestamp': timestamp,
                'test_type': 'logged_clean_test_no_overrides',
                'total_questions': total_questions,
                'duration_seconds': total_duration,
                'model': 'qwen3-235b',
                'note': 'Pure LLM reasoning with full logging'
            },
            'metrics': {
                'accuracy_rate': accuracy_rate,
                'success_rate': success_rate,
                'correct_answers': correct_answers,
                'partial_answers': partial_answers,
                'incorrect_answers': incorrect_answers,
                'errors': errors
            },
            'detailed_results': results
        }, f, indent=2)
    
    print(f"\nπŸ“ Results summary saved to: {results_file}")
    print(f"πŸ“ Individual question logs saved to: logs/clean_batch_question_<id>_*.log")
    
    # Final assessment
    print(f"\n🎯 HONEST ASSESSMENT:")
    print(f"🚫 NO CHEATING - Pure LLM reasoning only")
    print(f"πŸ“Š **Real System Accuracy: {accuracy_rate:.1f}%**")
    
    if accuracy_rate >= 70:
        print(f"πŸ† EXCELLENT: Achieves 70%+ target!")
    elif accuracy_rate >= 50:
        print(f"πŸ”§ GOOD: Solid performance, room for improvement")
    elif accuracy_rate >= 30:
        print(f"⚠️ MODERATE: Needs significant improvements")
    else:
        print(f"🚨 POOR: Requires major system overhaul")
    
    print(f"\nπŸ“ Check the log files for detailed execution traces!")
    
    return accuracy_rate, results


if __name__ == "__main__":
    accuracy, results = run_logged_clean_test()
    print(f"\nπŸŽ‰ Logged clean test completed!")
    print(f"πŸ“Š **HONEST ACCURACY: {accuracy:.1f}%**")
    print(f"πŸ” Full logs available in logs/ directory")