#!/usr/bin/env python3
"""Focused Accuracy Test - Test the first 10 questions for a complete baseline."""

import asyncio
import sys
from pathlib import Path
from datetime import datetime
import json

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

from tests.async_batch_processor import BatchQuestionProcessor
from gaia_web_loader import GAIAQuestionLoaderWeb


async def run_focused_accuracy_test():
    """Run a focused accuracy test on the first 10 questions."""
    print("šŸŽÆ FOCUSED GAIA ACCURACY TEST (First 10 Questions)")
    print("=" * 70)
    print(f"šŸ• Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print()

    try:
        # Load questions
        print("šŸ“‹ Loading GAIA questions...")
        loader = GAIAQuestionLoaderWeb()
        all_questions = loader.questions

        # Use the first 10 questions for focused testing
        test_questions = all_questions[:10]
        print(f"āœ… Selected {len(test_questions)} questions for focused testing")

        # Show a preview of each question
        print("\nšŸ“‹ Test Questions:")
        for i, q in enumerate(test_questions):
            task_id = q.get('task_id', 'unknown')
            question_preview = q.get('question', '')[:50] + "..."
            level = q.get('Level', 'Unknown')
            has_file = "šŸ“Ž" if q.get('file_name') else "šŸ“"
            print(f"  {i+1:2d}. {task_id[:8]}... | L{level} | {has_file} | {question_preview}")

        # Initialize the processor with settings tuned for a focused test
        print("\nšŸš€ Initializing focused batch processor...")
        processor = BatchQuestionProcessor(
            max_concurrent=2,      # Lower concurrency for stability
            question_timeout=600,  # 10 minutes per question
            progress_interval=10   # Progress updates every 10 seconds
        )

        print("āš™ļø Focused Test Configuration:")
        print(f"   - Questions: {len(test_questions)}")
        print(f"   - Max Concurrent: {processor.max_concurrent}")
        print(f"   - Question Timeout: {processor.question_timeout}s")
        print(f"   - Expected Duration: ~{len(test_questions) * 2} minutes")

        # Process questions
        print("\nšŸ”„ Starting focused accuracy test...")
        start_time = datetime.now()

        results = await processor.process_questions_batch(
            test_questions,
            solver_kwargs={
                "use_kluster": True,
                "kluster_model": "qwen3-235b"
            }
        )

        end_time = datetime.now()

        # Analyze results
        print("\n" + "=" * 70)
        print("šŸ FOCUSED TEST RESULTS")
        print("=" * 70)

        duration = (end_time - start_time).total_seconds()
        accuracy = results["accuracy_metrics"]["accuracy_rate"]
        success = results["accuracy_metrics"]["success_rate"]

        print(f"ā±ļø Total Duration: {int(duration // 60)}m {int(duration % 60)}s")
        print(f"āœ… Accuracy: {accuracy:.1%} ({results['accuracy_metrics']['correct_answers']}/{results['completed_questions']})")
        print(f"šŸŽÆ Success Rate: {success:.1%}")
        print(f"⚔ Avg per Question: {results['performance_metrics']['average_duration']:.1f}s")

        # Detailed question-by-question results
        print("\nšŸ“Š QUESTION-BY-QUESTION RESULTS:")
        for i, result in enumerate(results["detailed_results"]):
            status_icon = "āœ…" if result.status == "CORRECT" else "🟔" if result.status == "PARTIAL" else "āŒ"
            task_id = result.task_id[:8]
            classification = result.classification
            # Use a distinct name so the overall test duration (serialized below)
            # is not shadowed by the per-question duration.
            q_duration = result.total_duration
            accuracy_score = result.accuracy_score

            print(f"  {i+1:2d}. {status_icon} {task_id}... | {classification:12} | {accuracy_score:.0%} | {q_duration:5.1f}s")

            if result.status != "CORRECT":
                print(f"      Expected: {result.expected_answer}")
                print(f"      Got:      {result.our_answer}")
                if result.error_type:
                    print(f"      Error:    {result.error_type}")

        # Classification analysis
        print("\nšŸŽÆ CLASSIFICATION PERFORMANCE:")
        classification_stats = {}
        for result in results["detailed_results"]:
            classification = result.classification
            if classification not in classification_stats:
                classification_stats[classification] = {
                    'total': 0, 'correct': 0, 'partial': 0, 'durations': []
                }
            classification_stats[classification]['total'] += 1
            classification_stats[classification]['durations'].append(result.total_duration)
            if result.status == 'CORRECT':
                classification_stats[classification]['correct'] += 1
            elif result.status == 'PARTIAL':
                classification_stats[classification]['partial'] += 1

        for classification, stats in sorted(classification_stats.items()):
            total = stats['total']
            correct = stats['correct']
            partial = stats['partial']
            accuracy_rate = correct / total if total > 0 else 0
            success_rate = (correct + partial) / total if total > 0 else 0
            avg_duration = sum(stats['durations']) / len(stats['durations']) if stats['durations'] else 0
            print(f"   {classification:15} | {accuracy_rate:.1%} acc | {success_rate:.1%} success | {total:2d} questions | {avg_duration:5.1f}s avg")

        # Assessment and recommendations
        print("\nšŸ”§ ASSESSMENT:")
        if accuracy >= 0.9:
            print(f"   šŸ† EXCELLENT: {accuracy:.1%} accuracy! System performing very well.")
        elif accuracy >= 0.7:
            print(f"   āœ… TARGET MET: {accuracy:.1%} accuracy achieves 70%+ goal!")
        elif accuracy >= 0.5:
            print(f"   šŸ”§ GOOD PROGRESS: {accuracy:.1%} accuracy, approaching target.")
        else:
            print(f"   🚨 NEEDS IMPROVEMENT: {accuracy:.1%} accuracy requires attention.")

        # Save results (ensure the logs directory exists before writing)
        Path("logs").mkdir(exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        results_file = f"logs/focused_accuracy_test_{timestamp}.json"

        with open(results_file, 'w') as f:
            json.dump({
                'test_metadata': {
                    'timestamp': timestamp,
                    'test_type': 'focused_10_questions',
                    'duration_seconds': duration,
                    'questions_tested': len(test_questions),
                    'configuration': {
                        'max_concurrent': processor.max_concurrent,
                        'question_timeout': processor.question_timeout,
                        'model': 'qwen3-235b'
                    }
                },
                'results': {
                    'accuracy_rate': accuracy,
                    'success_rate': success,
                    'classification_stats': classification_stats,
                    'detailed_results': [
                        {
                            'question_number': i + 1,
                            'task_id': r.task_id,
                            'classification': r.classification,
                            'status': r.status,
                            'accuracy_score': r.accuracy_score,
                            'our_answer': r.our_answer,
                            'expected_answer': r.expected_answer,
                            'duration': r.total_duration,
                            'error_type': r.error_type
                        }
                        for i, r in enumerate(results['detailed_results'])
                    ]
                }
            }, f, indent=2)

        print(f"\nšŸ“ Results saved to: {results_file}")
        return results

    except Exception as e:
        print(f"āŒ Focused test failed: {e}")
        import traceback
        traceback.print_exc()
        return None


async def main():
    """Run the focused accuracy test."""
    results = await run_focused_accuracy_test()

    if results:
        accuracy = results["accuracy_metrics"]["accuracy_rate"]
        print("\nšŸŽ‰ Focused accuracy test completed!")
        print(f"šŸ“Š Final Accuracy: {accuracy:.1%}")

        if accuracy >= 0.7:
            print("šŸŽÆ TARGET ACHIEVED: 70%+ accuracy reached!")
            print("šŸš€ Ready for comprehensive full-scale testing!")
        else:
            gap = 0.7 - accuracy
            print(f"šŸ”§ GAP TO TARGET: {gap:.1%} improvement needed")


if __name__ == "__main__":
    asyncio.run(main())