#!/usr/bin/env python3
"""
Focused Accuracy Test - Test first 10 questions for complete baseline
"""
import asyncio
import sys
from pathlib import Path
from datetime import datetime
import json

# Add the project root (parent of this script's directory) to the import path
sys.path.append(str(Path(__file__).parent.parent))

from tests.async_batch_processor import BatchQuestionProcessor
from gaia_web_loader import GAIAQuestionLoaderWeb


async def run_focused_accuracy_test():
    """Run focused accuracy test on first 10 questions"""
    print("🎯 FOCUSED GAIA ACCURACY TEST (First 10 Questions)")
    print("=" * 70)
    print(f"📅 Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    try:
        # Load questions
        print("📂 Loading GAIA questions...")
        loader = GAIAQuestionLoaderWeb()
        all_questions = loader.questions

        # Use the first 10 questions for focused testing
        test_questions = all_questions[:10]
        print(f"✅ Selected {len(test_questions)} questions for focused testing")

        # Show a preview of each question
        print("\n📋 Test Questions:")
        for i, q in enumerate(test_questions):
            task_id = q.get('task_id', 'unknown')
            question_text = q.get('question', '')
            # Only append an ellipsis when the question is actually truncated
            question_preview = question_text[:50] + "..." if len(question_text) > 50 else question_text
            level = q.get('Level', 'Unknown')
            has_file = "📎" if q.get('file_name') else "📄"
            print(f"  {i+1:2d}. {task_id[:8]}... | L{level} | {has_file} | {question_preview}")

        # Initialize processor with optimized settings for focused test
        print("\n🚀 Initializing focused batch processor...")
        processor = BatchQuestionProcessor(
            max_concurrent=2,       # Lower concurrency for stability
            question_timeout=600,   # 10 minutes per question
            progress_interval=10    # Progress updates every 10 seconds
        )

        print("⚙️ Focused Test Configuration:")
        print(f"   - Questions: {len(test_questions)}")
        print(f"   - Max Concurrent: {processor.max_concurrent}")
        print(f"   - Question Timeout: {processor.question_timeout}s")
        print(f"   - Expected Duration: ~{len(test_questions) * 2} minutes")

        # Process questions
        print("\n🚀 Starting focused accuracy test...")
        start_time = datetime.now()
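        # solver_kwargs: per-question solver configuration (here, the Kluster
        # backend with the qwen3-235b model)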
        results = await processor.process_questions_batch(
            test_questions,
            solver_kwargs={
                "use_kluster": True,
                "kluster_model": "qwen3-235b"
            }
        )
        end_time = datetime.now()

        # Analyze results
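        # NOTE: the code below assumes the results dict exposes
        # 'accuracy_metrics', 'performance_metrics', 'completed_questions',
        # and 'detailed_results' (objects with .status, .task_id,
        # .classification, .accuracy_score, .total_duration, ...)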
print(f"\n" + "=" * 70) | |
print(f"π FOCUSED TEST RESULTS") | |
print(f"=" * 70) | |
duration = (end_time - start_time).total_seconds() | |
accuracy = results["accuracy_metrics"]["accuracy_rate"] | |
success = results["accuracy_metrics"]["success_rate"] | |
print(f"β±οΈ Total Duration: {int(duration // 60)}m {int(duration % 60)}s") | |
print(f"β Accuracy: {accuracy:.1%} ({results['accuracy_metrics']['correct_answers']}/{results['completed_questions']})") | |
print(f"π― Success Rate: {success:.1%}") | |
print(f"β‘ Avg per Question: {results['performance_metrics']['average_duration']:.1f}s") | |

        # Detailed question-by-question results
        print("\n📋 QUESTION-BY-QUESTION RESULTS:")
        for i, result in enumerate(results["detailed_results"]):
            status_icon = "✅" if result.status == "CORRECT" else "🟡" if result.status == "PARTIAL" else "❌"
            task_id = result.task_id[:8]
            classification = result.classification
            # Use a distinct name so the overall test duration above isn't shadowed
            question_duration = result.total_duration
            accuracy_score = result.accuracy_score
            print(f"  {i+1:2d}. {status_icon} {task_id}... | {classification:12} | {accuracy_score:.0%} | {question_duration:5.1f}s")
            if result.status != "CORRECT":
                print(f"      Expected: {result.expected_answer}")
                print(f"      Got: {result.our_answer}")
                if result.error_type:
                    print(f"      Error: {result.error_type}")

        # Classification analysis
        print("\n🎯 CLASSIFICATION PERFORMANCE:")
        classification_stats = {}
        for result in results["detailed_results"]:
            classification = result.classification
            if classification not in classification_stats:
                classification_stats[classification] = {
                    'total': 0, 'correct': 0, 'partial': 0, 'durations': []
                }
            classification_stats[classification]['total'] += 1
            classification_stats[classification]['durations'].append(result.total_duration)
            if result.status == 'CORRECT':
                classification_stats[classification]['correct'] += 1
            elif result.status == 'PARTIAL':
                classification_stats[classification]['partial'] += 1

        for classification, stats in sorted(classification_stats.items()):
            total = stats['total']
            correct = stats['correct']
            partial = stats['partial']
            accuracy_rate = correct / total if total > 0 else 0
            success_rate = (correct + partial) / total if total > 0 else 0
            avg_duration = sum(stats['durations']) / len(stats['durations']) if stats['durations'] else 0
            print(f"  {classification:15} | {accuracy_rate:.1%} acc | {success_rate:.1%} success | {total:2d} questions | {avg_duration:5.1f}s avg")

        # Assessment and recommendations
        print("\n🔧 ASSESSMENT:")
        if accuracy >= 0.9:
            print(f"   🏆 EXCELLENT: {accuracy:.1%} accuracy! System performing very well.")
        elif accuracy >= 0.7:
            print(f"   ✅ TARGET MET: {accuracy:.1%} accuracy achieves the 70%+ goal!")
        elif accuracy >= 0.5:
            print(f"   🔧 GOOD PROGRESS: {accuracy:.1%} accuracy, approaching target.")
        else:
            print(f"   🚨 NEEDS IMPROVEMENT: {accuracy:.1%} accuracy requires attention.")

        # Save results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
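        # Timestamped filenames keep repeated runs from overwriting each other;
        # the logs/ directory is created below if it doesn't exist yet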
        Path("logs").mkdir(exist_ok=True)
        results_file = f"logs/focused_accuracy_test_{timestamp}.json"
        with open(results_file, 'w') as f:
            json.dump({
                'test_metadata': {
                    'timestamp': timestamp,
                    'test_type': 'focused_10_questions',
                    'duration_seconds': duration,
                    'questions_tested': len(test_questions),
                    'configuration': {
                        'max_concurrent': processor.max_concurrent,
                        'question_timeout': processor.question_timeout,
                        'model': 'qwen3-235b'
                    }
                },
                'results': {
                    'accuracy_rate': accuracy,
                    'success_rate': success,
                    'classification_stats': classification_stats,
                    'detailed_results': [
                        {
                            'question_number': i + 1,
                            'task_id': r.task_id,
                            'classification': r.classification,
                            'status': r.status,
                            'accuracy_score': r.accuracy_score,
                            'our_answer': r.our_answer,
                            'expected_answer': r.expected_answer,
                            'duration': r.total_duration,
                            'error_type': r.error_type
                        } for i, r in enumerate(results['detailed_results'])
                    ]
                }
            }, f, indent=2)

        print(f"\n💾 Results saved to: {results_file}")

        return results

    except Exception as e:
        print(f"❌ Focused test failed: {e}")
        import traceback
        traceback.print_exc()
        return None


async def main():
    """Run the focused accuracy test"""
    results = await run_focused_accuracy_test()

    if results:
        accuracy = results["accuracy_metrics"]["accuracy_rate"]
        print("\n🎉 Focused accuracy test completed!")
        print(f"📊 Final Accuracy: {accuracy:.1%}")
        if accuracy >= 0.7:
            print("🎯 TARGET ACHIEVED: 70%+ accuracy reached!")
            print("🚀 Ready for comprehensive full-scale testing!")
        else:
            gap = 0.7 - accuracy
            print(f"🔧 GAP TO TARGET: {gap:.1%} improvement needed")


if __name__ == "__main__":
    asyncio.run(main())