#!/usr/bin/env python3
"""
Focused Accuracy Test - test the first 10 questions to establish a complete baseline
"""
import asyncio
import sys
from pathlib import Path
from datetime import datetime
import json
# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))
from tests.async_batch_processor import BatchQuestionProcessor
from gaia_web_loader import GAIAQuestionLoaderWeb
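# Both imports are project-local: BatchQuestionProcessor drives concurrent solving,
# and GAIAQuestionLoaderWeb loads the question set (presumably via the GAIA web API).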
async def run_focused_accuracy_test():
"""Run focused accuracy test on first 10 questions"""
print("🎯 FOCUSED GAIA ACCURACY TEST (First 10 Questions)")
print("=" * 70)
print(f"πŸ• Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print()
try:
# Load questions
print("πŸ“‹ Loading GAIA questions...")
loader = GAIAQuestionLoaderWeb()
all_questions = loader.questions
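        # Each question is a plain dict; the keys consumed below are task_id,
        # question, Level, and file_name (a non-empty file_name marks an attachment).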
# Use first 10 questions for focused testing
test_questions = all_questions[:10]
print(f"βœ… Selected {len(test_questions)} questions for focused testing")
# Show question preview
print(f"\nπŸ“‹ Test Questions:")
for i, q in enumerate(test_questions):
task_id = q.get('task_id', 'unknown')
            question_text = q.get('question', '')
            # Truncate long questions for the preview; append an ellipsis only when truncated
            question_preview = question_text[:50] + ("..." if len(question_text) > 50 else "")
            level = q.get('Level', 'Unknown')
            has_file = "📎" if q.get('file_name') else "📝"
print(f" {i+1:2d}. {task_id[:8]}... | L{level} | {has_file} | {question_preview}")
# Initialize processor with optimized settings for focused test
print(f"\nπŸš€ Initializing focused batch processor...")
processor = BatchQuestionProcessor(
max_concurrent=2, # Lower concurrency for stability
question_timeout=600, # 10 minutes per question
progress_interval=10 # Progress updates every 10 seconds
)
print(f"βš™οΈ Focused Test Configuration:")
print(f" - Questions: {len(test_questions)}")
print(f" - Max Concurrent: {processor.max_concurrent}")
print(f" - Question Timeout: {processor.question_timeout}s")
print(f" - Expected Duration: ~{len(test_questions) * 2} minutes")
# Process questions
print(f"\nπŸ”„ Starting focused accuracy test...")
start_time = datetime.now()
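        # solver_kwargs are presumably forwarded by the processor to each per-question
        # solver; here they route inference to the kluster.ai-hosted qwen3-235b model.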
results = await processor.process_questions_batch(
test_questions,
solver_kwargs={
"use_kluster": True,
"kluster_model": "qwen3-235b"
}
)
end_time = datetime.now()
# Analyze results
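        # The batch result is expected to expose accuracy_metrics, performance_metrics,
        # and detailed_results (a list of per-question result objects) as used below.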
print(f"\n" + "=" * 70)
print(f"🏁 FOCUSED TEST RESULTS")
print(f"=" * 70)
duration = (end_time - start_time).total_seconds()
accuracy = results["accuracy_metrics"]["accuracy_rate"]
success = results["accuracy_metrics"]["success_rate"]
print(f"⏱️ Total Duration: {int(duration // 60)}m {int(duration % 60)}s")
print(f"βœ… Accuracy: {accuracy:.1%} ({results['accuracy_metrics']['correct_answers']}/{results['completed_questions']})")
print(f"🎯 Success Rate: {success:.1%}")
print(f"⚑ Avg per Question: {results['performance_metrics']['average_duration']:.1f}s")
# Detailed question-by-question results
print(f"\nπŸ“Š QUESTION-BY-QUESTION RESULTS:")
for i, result in enumerate(results["detailed_results"]):
status_icon = "βœ…" if result.status == "CORRECT" else "🟑" if result.status == "PARTIAL" else "❌"
task_id = result.task_id[:8]
classification = result.classification
duration = result.total_duration
accuracy_score = result.accuracy_score
print(f" {i+1:2d}. {status_icon} {task_id}... | {classification:12} | {accuracy_score:.0%} | {duration:5.1f}s")
if result.status != "CORRECT":
print(f" Expected: {result.expected_answer}")
print(f" Got: {result.our_answer}")
if result.error_type:
print(f" Error: {result.error_type}")
# Classification analysis
print(f"\n🎯 CLASSIFICATION PERFORMANCE:")
classification_stats = {}
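        # Aggregate per-classification tallies: question count, correct/partial
        # counts, and per-question durations for the averages printed below.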
for result in results["detailed_results"]:
classification = result.classification
if classification not in classification_stats:
classification_stats[classification] = {
'total': 0, 'correct': 0, 'partial': 0, 'durations': []
}
classification_stats[classification]['total'] += 1
classification_stats[classification]['durations'].append(result.total_duration)
if result.status == 'CORRECT':
classification_stats[classification]['correct'] += 1
elif result.status == 'PARTIAL':
classification_stats[classification]['partial'] += 1
for classification, stats in sorted(classification_stats.items()):
total = stats['total']
correct = stats['correct']
partial = stats['partial']
accuracy_rate = correct / total if total > 0 else 0
success_rate = (correct + partial) / total if total > 0 else 0
avg_duration = sum(stats['durations']) / len(stats['durations']) if stats['durations'] else 0
print(f" {classification:15} | {accuracy_rate:.1%} acc | {success_rate:.1%} success | {total:2d} questions | {avg_duration:5.1f}s avg")
# Assessment and recommendations
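        # Accuracy bands: >=90% excellent, >=70% meets the stated target,
        # >=50% progressing, anything below needs attention.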
print(f"\nπŸ”§ ASSESSMENT:")
        if accuracy >= 0.9:
            print(f" 🏆 EXCELLENT: {accuracy:.1%} accuracy! System performing very well.")
        elif accuracy >= 0.7:
            print(f" ✅ TARGET MET: {accuracy:.1%} accuracy achieves the 70%+ goal!")
        elif accuracy >= 0.5:
            print(f" 🔧 GOOD PROGRESS: {accuracy:.1%} accuracy, approaching the target.")
        else:
            print(f" 🚨 NEEDS IMPROVEMENT: {accuracy:.1%} accuracy requires attention.")
# Save results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        Path("logs").mkdir(exist_ok=True)  # ensure the output directory exists before writing
        results_file = f"logs/focused_accuracy_test_{timestamp}.json"
with open(results_file, 'w') as f:
json.dump({
'test_metadata': {
'timestamp': timestamp,
'test_type': 'focused_10_questions',
'duration_seconds': duration,
'questions_tested': len(test_questions),
'configuration': {
'max_concurrent': processor.max_concurrent,
'question_timeout': processor.question_timeout,
'model': 'qwen3-235b'
}
},
'results': {
'accuracy_rate': accuracy,
'success_rate': success,
'classification_stats': classification_stats,
'detailed_results': [
{
'question_number': i+1,
'task_id': r.task_id,
'classification': r.classification,
'status': r.status,
'accuracy_score': r.accuracy_score,
'our_answer': r.our_answer,
'expected_answer': r.expected_answer,
'duration': r.total_duration,
'error_type': r.error_type
} for i, r in enumerate(results['detailed_results'])
]
}
}, f, indent=2)
print(f"\nπŸ“ Results saved to: {results_file}")
return results
except Exception as e:
print(f"❌ Focused test failed: {e}")
import traceback
traceback.print_exc()
return None
async def main():
"""Run the focused accuracy test"""
results = await run_focused_accuracy_test()
if results:
accuracy = results["accuracy_metrics"]["accuracy_rate"]
print(f"\nπŸŽ‰ Focused accuracy test completed!")
print(f"πŸ“Š Final Accuracy: {accuracy:.1%}")
if accuracy >= 0.7:
print(f"🎯 TARGET ACHIEVED: 70%+ accuracy reached!")
print(f"πŸš€ Ready for comprehensive full-scale testing!")
else:
gap = 0.7 - accuracy
print(f"πŸ”§ GAP TO TARGET: {gap:.1%} improvement needed")
if __name__ == "__main__":
asyncio.run(main())
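# Usage sketch (assuming the Final_Assignment layout): run from the Final_Assignment
# directory, e.g. `python tests/focused_accuracy_test.py`, so the relative logs/
# output directory is created next to the project files.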