#!/usr/bin/env python3
"""
Logged Clean Test - Run GAIA questions end-to-end with per-question logging and no
hardcoded answer overrides. By default only the first three questions are processed
as a quick demonstration run (see run_logged_clean_test).
"""
import os
import sys
import json
import time
from pathlib import Path
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
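# Typical invocation (assuming the API credentials GAIASolver expects, e.g. a
# Kluster.ai key, are provided via the .env loaded above), run from Final_Assignment:
#   python tests/logged_clean_test.py
# Per-question logs land in logs/clean_batch_question_<task_id>_*.log and a JSON
# summary in logs/logged_clean_test_<timestamp>.json.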
# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))
# Local imports
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
from tests.test_logging_utils import test_logger
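# GAIAQuestionLoaderWeb fetches the GAIA question set, GAIASolver produces the answers,
# and QuestionClassifier routes each question to a primary agent before solving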
def load_validation_answers():
"""Load correct answers from GAIA validation metadata"""
answers = {}
try:
validation_path = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl'
with open(validation_path, 'r') as f:
for line in f:
if line.strip():
data = json.loads(line.strip())
task_id = data.get('task_id')
final_answer = data.get('Final answer')
if task_id and final_answer:
answers[task_id] = final_answer
except Exception as e:
print(f"⚠️ Could not load validation data: {e}")
return answers
def validate_answer(task_id: str, our_answer: str, validation_answers: dict):
"""Validate our answer against the correct answer"""
if task_id not in validation_answers:
return None
expected = str(validation_answers[task_id]).strip()
our_clean = str(our_answer).strip()
    # Exact match (case-insensitive)
if our_clean.lower() == expected.lower():
return {"status": "CORRECT", "expected": expected, "our": our_clean}
# Check if our answer contains the expected answer
if expected.lower() in our_clean.lower():
return {"status": "PARTIAL", "expected": expected, "our": our_clean}
return {"status": "INCORRECT", "expected": expected, "our": our_clean}
def test_single_question(question_data, validation_answers, model="qwen3-235b"):
"""Test a single question without any overrides - WITH LOGGING"""
task_id = question_data.get('task_id', 'unknown')
# Use the same logging approach as test_specific_question.py
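    # test_logger (from tests/test_logging_utils) is expected to capture this run's
    # console output in logs/clean_batch_question_<task_id>_*.log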
with test_logger("clean_batch_question", task_id):
try:
print(f"πŸ§ͺ Testing question: {task_id}")
print("=" * 60)
# Initialize solver and classifier
print(f"πŸš€ Initializing GAIA Solver with Kluster.ai {model}...")
solver = GAIASolver(use_kluster=True, kluster_model=model)
print("🧠 Initializing Question Classifier...")
classifier = QuestionClassifier()
# Display question details
print(f"βœ… Found question!")
print(f"πŸ“ Question: {question_data.get('question', 'N/A')}")
print(f"🏷️ Level: {question_data.get('Level', 'Unknown')}")
print(f"πŸ“Ž Has file: {'Yes' if question_data.get('file_name') else 'No'}")
if question_data.get('file_name'):
print(f"πŸ“„ File: {question_data.get('file_name')}")
# Classify the question
print(f"\n🧠 QUESTION CLASSIFICATION:")
print("-" * 40)
question_text = question_data.get('question', '')
file_name = question_data.get('file_name', '')
classification = classifier.classify_question(question_text, file_name)
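            # classification is a dict; the fields used below are primary_agent,
            # secondary_agents, complexity, confidence, tools_needed, and reasoning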
print(f"🎯 Primary Agent: {classification['primary_agent']}")
if classification['secondary_agents']:
print(f"🀝 Secondary Agents: {', '.join(classification['secondary_agents'])}")
print(f"πŸ“Š Complexity: {classification['complexity']}/5")
print(f"🎲 Confidence: {classification['confidence']:.3f}")
print(f"πŸ”§ Tools Needed: {', '.join(classification['tools_needed'][:3])}")
if len(classification['tools_needed']) > 3:
print(f" (+{len(classification['tools_needed'])-3} more tools)")
print(f"πŸ’­ Reasoning: {classification['reasoning']}")
# Solve the question (NO OVERRIDES - pure LLM reasoning)
print(f"\nπŸ€– Solving question...")
print(f"🎯 Question type: {classification['primary_agent']}")
print(f"πŸ”„ Processing... (NO OVERRIDES - Pure LLM + Tools)")
start_time = time.time()
answer = solver.solve_question(question_data)
end_time = time.time()
duration = end_time - start_time
print(f"βœ… Completed in {duration:.1f} seconds")
# Validate answer
print(f"\nπŸ” ANSWER VALIDATION:")
print("-" * 40)
validation_result = validate_answer(task_id, answer, validation_answers)
if validation_result:
print(f"Expected Answer: {validation_result['expected']}")
print(f"Our Answer: {validation_result['our']}")
print(f"Status: {validation_result['status']}")
if validation_result['status'] == 'CORRECT':
print(f"βœ… PERFECT MATCH!")
elif validation_result['status'] == 'PARTIAL':
print(f"🟑 PARTIAL MATCH - contains correct answer")
else:
print(f"❌ INCORRECT - answers don't match")
else:
print(f"⚠️ No validation data available for question {task_id}")
print(f"\nπŸ“‹ FINAL RESULTS:")
print("=" * 60)
print(f"Task ID: {task_id}")
print(f"Question Type: {classification['primary_agent']}")
print(f"Classification Confidence: {classification['confidence']:.3f}")
print(f"Our Answer: {answer}")
if validation_result:
print(f"Expected Answer: {validation_result['expected']}")
print(f"Validation Status: {validation_result['status']}")
print(f"Duration: {duration:.1f}s")
print(f"🚫 NO OVERRIDES APPLIED - Pure LLM reasoning")
result = {
'task_id': task_id,
'question_type': classification['primary_agent'],
'complexity': classification['complexity'],
'confidence': classification['confidence'],
'our_answer': str(answer),
'expected_answer': validation_result['expected'] if validation_result else 'N/A',
'status': validation_result['status'] if validation_result else 'NO_VALIDATION',
'duration': duration,
'question_preview': question_data.get('question', '')[:50] + "..."
}
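            # This per-question record is what later lands under 'detailed_results'
            # in the summary JSON written by run_logged_clean_test()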
status_icon = "βœ…" if result['status'] == "CORRECT" else "🟑" if result['status'] == "PARTIAL" else "❌"
print(f"\n{status_icon} FINAL STATUS: {result['status']}")
return result
except Exception as e:
print(f"❌ Error testing question: {e}")
import traceback
traceback.print_exc()
return {
'task_id': task_id,
'question_type': 'error',
'complexity': 0,
'confidence': 0.0,
'our_answer': '',
'expected_answer': validation_answers.get(task_id, 'N/A'),
'status': 'ERROR',
'duration': 0.0,
'error': str(e),
'question_preview': question_data.get('question', '')[:50] + "..."
}
def run_logged_clean_test():
"""Run logged clean test on all questions"""
print("πŸ§ͺ LOGGED CLEAN TEST - NO OVERRIDES")
print("=" * 60)
print("🎯 Goal: Measure real accuracy with full logging")
print("🚫 No hardcoded answers or overrides")
print("πŸ€– Pure LLM + Tools reasoning only")
print("πŸ“ Full detailed logs will be created")
print()
# Load questions and validation data
print("πŸ“‹ Loading GAIA questions...")
loader = GAIAQuestionLoaderWeb()
all_questions = loader.questions
validation_answers = load_validation_answers()
print(f"βœ… Loaded {len(all_questions)} questions")
print(f"βœ… Loaded {len(validation_answers)} validation answers")
# Show question preview
print(f"\nπŸ“‹ Questions to test:")
for i, q in enumerate(all_questions[:3]): # Show first 3
task_id = q.get('task_id', 'unknown')
question_preview = q.get('question', '')[:40] + "..."
level = q.get('Level', 'Unknown')
expected = validation_answers.get(task_id, 'N/A')
has_file = "πŸ“Ž" if q.get('file_name') else "πŸ“"
print(f" {i+1}. {task_id[:8]}... | L{level} | {has_file} | Expected: {expected}")
print(f" {question_preview}")
if len(all_questions) > 3:
print(f" ... and {len(all_questions) - 3} more questions")
    # Process only the first 3 questions for a demonstration run (adjust the slice to test more)
    test_questions = all_questions[:3]

    print("\nπŸš€ Starting logged clean test...")
    print("πŸ“ Each question will create a detailed log file")
    print(f"⏱️ Estimated time: ~{len(test_questions) * 2} minutes")
start_time = time.time()
results = []
for i, question_data in enumerate(test_questions):
print(f"\n" + "="*80)
print(f"πŸ“Š PROGRESS: {i+1}/{len(test_questions)}")
print(f"πŸ”„ Processing question {question_data.get('task_id', 'unknown')[:8]}...")
result = test_single_question(question_data, validation_answers)
results.append(result)
# Show progress
completed = i + 1
correct_so_far = len([r for r in results if r['status'] == 'CORRECT'])
current_accuracy = correct_so_far / completed * 100
print(f"πŸ“ˆ Current accuracy: {current_accuracy:.1f}% ({correct_so_far}/{completed})")
end_time = time.time()
total_duration = end_time - start_time
# Final analysis
print(f"\n" + "=" * 80)
print(f"🏁 LOGGED CLEAN TEST RESULTS")
print(f"=" * 80)
    # Calculate metrics (guard against an empty run to avoid a ZeroDivisionError)
    total_questions = len(results)
    if total_questions == 0:
        print("⚠️ No questions were processed - nothing to report")
        return 0.0, results
correct_answers = len([r for r in results if r['status'] == 'CORRECT'])
partial_answers = len([r for r in results if r['status'] == 'PARTIAL'])
incorrect_answers = len([r for r in results if r['status'] == 'INCORRECT'])
errors = len([r for r in results if r['status'] == 'ERROR'])
accuracy_rate = correct_answers / total_questions * 100
success_rate = (correct_answers + partial_answers) / total_questions * 100
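    # accuracy_rate counts exact matches only; success_rate also credits PARTIAL
    # results, where our answer merely contains the expected string (see validate_answer)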
print(f"⏱️ Total Duration: {int(total_duration // 60)}m {int(total_duration % 60)}s")
print(f"βœ… **HONEST ACCURACY: {accuracy_rate:.1f}%** ({correct_answers}/{total_questions})")
print(f"🎯 Success Rate: {success_rate:.1f}% (including partial)")
print(f"⚑ Avg per Question: {total_duration/total_questions:.1f}s")
print(f"\nπŸ“Š DETAILED BREAKDOWN:")
print(f" βœ… CORRECT: {correct_answers} ({correct_answers/total_questions:.1%})")
print(f" 🟑 PARTIAL: {partial_answers} ({partial_answers/total_questions:.1%})")
print(f" ❌ INCORRECT: {incorrect_answers} ({incorrect_answers/total_questions:.1%})")
print(f" πŸ’₯ ERROR: {errors} ({errors/total_questions:.1%})")
# Question-by-question results
print(f"\nπŸ“‹ DETAILED QUESTION RESULTS:")
for i, result in enumerate(results):
        status_icon = {"CORRECT": "βœ…", "PARTIAL": "🟑", "ERROR": "πŸ’₯"}.get(result['status'], "❌")
print(f" {i+1}. {status_icon} {result['task_id'][:8]}... | {result['question_type']:12} | {result['status']:9} | {result['duration']:5.1f}s")
print(f" Expected: {result['expected_answer']}")
print(f" Got: {result['our_answer']}")
if 'error' in result:
print(f" Error: {result['error']}")
    # Save results (make sure the logs/ directory exists before writing into it)
    os.makedirs("logs", exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_file = f"logs/logged_clean_test_{timestamp}.json"
with open(results_file, 'w') as f:
json.dump({
'test_metadata': {
'timestamp': timestamp,
'test_type': 'logged_clean_test_no_overrides',
'total_questions': total_questions,
'duration_seconds': total_duration,
'model': 'qwen3-235b',
'note': 'Pure LLM reasoning with full logging'
},
'metrics': {
'accuracy_rate': accuracy_rate,
'success_rate': success_rate,
'correct_answers': correct_answers,
'partial_answers': partial_answers,
'incorrect_answers': incorrect_answers,
'errors': errors
},
'detailed_results': results
}, f, indent=2)
print(f"\nπŸ“ Results summary saved to: {results_file}")
print(f"πŸ“ Individual question logs saved to: logs/clean_batch_question_<id>_*.log")
# Final assessment
print(f"\n🎯 HONEST ASSESSMENT:")
print(f"🚫 NO CHEATING - Pure LLM reasoning only")
print(f"πŸ“Š **Real System Accuracy: {accuracy_rate:.1f}%**")
if accuracy_rate >= 70:
print(f"πŸ† EXCELLENT: Achieves 70%+ target!")
elif accuracy_rate >= 50:
print(f"πŸ”§ GOOD: Solid performance, room for improvement")
elif accuracy_rate >= 30:
print(f"⚠️ MODERATE: Needs significant improvements")
else:
print(f"🚨 POOR: Requires major system overhaul")
print(f"\nπŸ“ Check the log files for detailed execution traces!")
return accuracy_rate, results
if __name__ == "__main__":
accuracy, results = run_logged_clean_test()
print(f"\nπŸŽ‰ Logged clean test completed!")
print(f"πŸ“Š **HONEST ACCURACY: {accuracy:.1f}%**")
print(f"πŸ” Full logs available in logs/ directory")