Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Comprehensive Accuracy Test - Full GAIA Benchmark Evaluation | |
Runs all 20 questions through the async batch processor for complete accuracy assessment | |
""" | |
import asyncio | |
import sys | |
from pathlib import Path | |
from datetime import datetime | |
import json | |
# Add parent directory to path for imports | |
sys.path.append(str(Path(__file__).parent.parent)) | |
from tests.async_batch_processor import BatchQuestionProcessor | |
from gaia_web_loader import GAIAQuestionLoaderWeb | |
async def run_comprehensive_accuracy_test(): | |
"""Run comprehensive accuracy test on all available GAIA questions""" | |
print("π― COMPREHENSIVE GAIA ACCURACY TEST") | |
print("=" * 80) | |
print(f"π Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") | |
print(f"π― Goal: Establish baseline accuracy and identify improvement areas") | |
print() | |
try: | |
# Load all questions | |
print("π Loading all GAIA questions...") | |
loader = GAIAQuestionLoaderWeb() | |
all_questions = loader.questions | |
print(f"β Loaded {len(all_questions)} questions from GAIA benchmark") | |
# Show question distribution by level | |
level_counts = {} | |
classification_preview = {} | |
for q in all_questions: | |
level = q.get('Level', 'Unknown') | |
level_counts[level] = level_counts.get(level, 0) + 1 | |
# Quick classification preview (first 5 questions) | |
if len(classification_preview) < 5: | |
task_id = q.get('task_id', 'unknown') | |
question_preview = q.get('question', '')[:60] + "..." | |
has_file = "Yes" if q.get('file_name') else "No" | |
classification_preview[task_id[:8]] = { | |
'question': question_preview, | |
'level': level, | |
'has_file': has_file | |
} | |
print(f"\nπ Question Distribution:") | |
for level, count in sorted(level_counts.items()): | |
print(f" Level {level}: {count} questions") | |
print(f"\nπ Sample Questions:") | |
for task_id, info in classification_preview.items(): | |
print(f" {task_id}... | L{info['level']} | File: {info['has_file']} | {info['question']}") | |
# Initialize batch processor with production settings | |
print(f"\nπ Initializing production-grade batch processor...") | |
processor = BatchQuestionProcessor( | |
max_concurrent=3, # Balanced concurrency for stability | |
question_timeout=900, # 15 minutes per question for complex cases | |
progress_interval=15 # Progress updates every 15 seconds | |
) | |
print(f"βοΈ Configuration:") | |
print(f" - Max Concurrent: {processor.max_concurrent}") | |
print(f" - Question Timeout: {processor.question_timeout}s (15 minutes)") | |
print(f" - Progress Interval: {processor.progress_interval}s") | |
print(f" - Expected Duration: ~{len(all_questions) * 3 // processor.max_concurrent // 60} minutes") | |
# Confirm before starting | |
print(f"\nβ οΈ This will process ALL {len(all_questions)} questions concurrently.") | |
print(f"π Estimated time: {len(all_questions) * 3 // processor.max_concurrent} minutes") | |
print(f"π Starting comprehensive accuracy test...") | |
print() | |
# Process all questions | |
start_time = datetime.now() | |
results = await processor.process_questions_batch( | |
all_questions, | |
solver_kwargs={ | |
"use_kluster": True, | |
"kluster_model": "qwen3-235b" | |
} | |
) | |
end_time = datetime.now() | |
# Comprehensive results analysis | |
print(f"\n" + "=" * 80) | |
print(f"π COMPREHENSIVE TEST RESULTS") | |
print(f"=" * 80) | |
duration = (end_time - start_time).total_seconds() | |
accuracy = results["accuracy_metrics"]["accuracy_rate"] | |
success = results["accuracy_metrics"]["success_rate"] | |
print(f"β±οΈ Total Duration: {int(duration // 60)}m {int(duration % 60)}s") | |
print(f"β Overall Accuracy: {accuracy:.1%} ({results['accuracy_metrics']['correct_answers']}/{results['completed_questions']})") | |
print(f"π― Success Rate: {success:.1%} (including partial matches)") | |
print(f"β‘ Average per Question: {results['performance_metrics']['average_duration']:.1f}s") | |
# Detailed breakdown | |
print(f"\nπ DETAILED BREAKDOWN:") | |
print(f" β CORRECT: {results['accuracy_metrics']['correct_answers']}") | |
print(f" π‘ PARTIAL: {results['accuracy_metrics']['partial_answers']}") | |
print(f" β INCORRECT: {results['accuracy_metrics']['incorrect_answers']}") | |
print(f" β±οΈ TIMEOUT: {results['accuracy_metrics']['timeouts']}") | |
print(f" π₯ ERROR: {results['accuracy_metrics']['errors']}") | |
# Classification performance analysis | |
print(f"\nπ― CLASSIFICATION PERFORMANCE:") | |
classification_performance = {} | |
for result in results["detailed_results"]: | |
classification = result.classification | |
if classification not in classification_performance: | |
classification_performance[classification] = { | |
'total': 0, 'correct': 0, 'partial': 0, 'incorrect': 0 | |
} | |
classification_performance[classification]['total'] += 1 | |
if result.status == 'CORRECT': | |
classification_performance[classification]['correct'] += 1 | |
elif result.status == 'PARTIAL': | |
classification_performance[classification]['partial'] += 1 | |
elif result.status == 'INCORRECT': | |
classification_performance[classification]['incorrect'] += 1 | |
# Sort by accuracy for prioritization | |
sorted_classifications = sorted( | |
classification_performance.items(), | |
key=lambda x: (x[1]['correct'] + x[1]['partial'] * 0.5) / x[1]['total'] if x[1]['total'] > 0 else 0 | |
) | |
for classification, perf in sorted_classifications: | |
total = perf['total'] | |
if total > 0: | |
accuracy_rate = perf['correct'] / total | |
success_rate = (perf['correct'] + perf['partial']) / total | |
print(f" {classification:15} | {accuracy_rate:.1%} acc | {success_rate:.1%} success | {total:2d} questions") | |
# Identify improvement priorities | |
print(f"\nπ§ IMPROVEMENT PRIORITIES:") | |
improvement_priorities = [] | |
for classification, perf in sorted_classifications: | |
total = perf['total'] | |
if total > 0: | |
accuracy_rate = perf['correct'] / total | |
impact_score = total * (1 - accuracy_rate) # Questions * failure rate | |
if accuracy_rate < 0.7: # Less than 70% accuracy | |
priority = "HIGH" if impact_score > 2 else "MEDIUM" | |
improvement_priorities.append({ | |
'classification': classification, | |
'accuracy': accuracy_rate, | |
'total_questions': total, | |
'impact_score': impact_score, | |
'priority': priority | |
}) | |
for priority_item in sorted(improvement_priorities, key=lambda x: x['impact_score'], reverse=True): | |
classification = priority_item['classification'] | |
accuracy = priority_item['accuracy'] | |
total = priority_item['total_questions'] | |
priority = priority_item['priority'] | |
impact = priority_item['impact_score'] | |
print(f" π₯ {priority:6} | {classification:15} | {accuracy:.1%} accuracy | {total} questions | Impact: {impact:.1f}") | |
# Save detailed results | |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
results_file = f"logs/comprehensive_accuracy_test_{timestamp}.json" | |
with open(results_file, 'w') as f: | |
json.dump({ | |
'test_metadata': { | |
'timestamp': timestamp, | |
'total_questions': len(all_questions), | |
'duration_seconds': duration, | |
'configuration': { | |
'max_concurrent': processor.max_concurrent, | |
'question_timeout': processor.question_timeout, | |
'model': 'qwen3-235b' | |
} | |
}, | |
'overall_metrics': results['accuracy_metrics'], | |
'classification_performance': classification_performance, | |
'improvement_priorities': improvement_priorities, | |
'detailed_results': [ | |
{ | |
'task_id': r.task_id, | |
'classification': r.classification, | |
'status': r.status, | |
'accuracy_score': r.accuracy_score, | |
'our_answer': r.our_answer, | |
'expected_answer': r.expected_answer, | |
'duration': r.total_duration, | |
'error_type': r.error_type | |
} for r in results['detailed_results'] | |
] | |
}, f, indent=2) | |
print(f"\nπ Detailed results saved to: {results_file}") | |
# Summary and next steps | |
print(f"\nπ― NEXT STEPS RECOMMENDATION:") | |
if accuracy >= 0.9: | |
print(f" π EXCELLENT: {accuracy:.1%} accuracy achieved! Focus on edge cases.") | |
elif accuracy >= 0.7: | |
print(f" β GOOD: {accuracy:.1%} accuracy. Target specific classifications for 90%+.") | |
elif accuracy >= 0.5: | |
print(f" π§ MODERATE: {accuracy:.1%} accuracy. Implement targeted improvements.") | |
else: | |
print(f" π¨ NEEDS WORK: {accuracy:.1%} accuracy. Focus on high-impact areas.") | |
if improvement_priorities: | |
top_priority = improvement_priorities[0] | |
print(f" π― TOP PRIORITY: {top_priority['classification']} ({top_priority['accuracy']:.1%} accuracy, {top_priority['total_questions']} questions)") | |
return results | |
except Exception as e: | |
print(f"β Comprehensive test failed: {e}") | |
import traceback | |
traceback.print_exc() | |
return None | |
async def main(): | |
"""Run the comprehensive accuracy test""" | |
results = await run_comprehensive_accuracy_test() | |
if results: | |
accuracy = results["accuracy_metrics"]["accuracy_rate"] | |
print(f"\nπ Comprehensive accuracy test completed!") | |
print(f"π Final Accuracy: {accuracy:.1%}") | |
if accuracy >= 0.7: | |
print(f"π― TARGET ACHIEVED: 70%+ accuracy reached!") | |
else: | |
gap = 0.7 - accuracy | |
print(f"π§ GAP TO TARGET: {gap:.1%} improvement needed for 70%") | |
if __name__ == "__main__": | |
asyncio.run(main()) |