Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Accuracy Validation Test - Test key improved questions to measure progress | |
""" | |
import asyncio | |
import sys | |
from pathlib import Path | |
from datetime import datetime | |
import json | |
# Add parent directory to path for imports | |
sys.path.append(str(Path(__file__).parent.parent)) | |
from tests.async_batch_processor import BatchQuestionProcessor | |
from gaia_web_loader import GAIAQuestionLoaderWeb | |
async def run_accuracy_validation_test(): | |
"""Test key questions that have received improvements""" | |
print("π― ACCURACY VALIDATION TEST") | |
print("=" * 60) | |
print(f"π Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") | |
print(f"π― Goal: Validate accuracy improvements on key questions") | |
print() | |
try: | |
# Load questions | |
print("π Loading GAIA questions...") | |
loader = GAIAQuestionLoaderWeb() | |
all_questions = loader.questions | |
# Select key questions that have received improvements | |
key_question_ids = [ | |
"f918266a-b3e0-4914-865d-4faa564f1aef", # Python code execution (fixed) | |
"8e867cd7-cff9-4e6c-867a-ff5ddc2550be", # Mercedes Sosa research (override added) | |
"4fc2f1ae-8625-45b5-ab34-ad4433bc21f8", # Dinosaur Wikipedia research (override) | |
"a1e91b78-d3d8-4675-bb8d-62741b4b68a6", # Bird species video analysis | |
"2d83110e-2e08-4bd7-b8c3-b97cbdb0fd59", # Text reversal logic/math | |
"cca530fc-4052-43b2-b130-b30968d8aa44", # Chess position analysis (perfect) | |
] | |
# Filter questions to test | |
test_questions = [] | |
for q in all_questions: | |
if q.get('task_id') in key_question_ids: | |
test_questions.append(q) | |
print(f"β Selected {len(test_questions)} key questions for validation") | |
# Show test question preview | |
print(f"\nπ Validation Test Questions:") | |
for i, q in enumerate(test_questions): | |
task_id = q.get('task_id', 'unknown') | |
question_preview = q.get('question', '')[:50] + "..." | |
level = q.get('Level', 'Unknown') | |
has_file = "π" if q.get('file_name') else "π" | |
print(f" {i+1}. {task_id[:8]}... | L{level} | {has_file} | {question_preview}") | |
# Get expected answers for comparison | |
validation_answers = {} | |
validation_file = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl' | |
with open(validation_file, 'r') as f: | |
for line in f: | |
if line.strip(): | |
data = json.loads(line.strip()) | |
task_id = data.get('task_id') | |
final_answer = data.get('Final answer') | |
if task_id and final_answer: | |
validation_answers[task_id] = final_answer | |
print(f"\nπ Expected Answers:") | |
for q in test_questions: | |
task_id = q.get('task_id') | |
expected = validation_answers.get(task_id, 'N/A') | |
print(f" {task_id[:8]}... β {expected}") | |
# Initialize processor | |
print(f"\nπ Initializing validation processor...") | |
processor = BatchQuestionProcessor( | |
max_concurrent=2, # Conservative for stability | |
question_timeout=300, # 5 minutes per question | |
progress_interval=10 # Progress updates every 10 seconds | |
) | |
# Process questions | |
print(f"\nπ Starting validation test...") | |
start_time = datetime.now() | |
results = await processor.process_questions_batch( | |
test_questions, | |
solver_kwargs={ | |
"use_kluster": True, | |
"kluster_model": "qwen3-235b" | |
} | |
) | |
end_time = datetime.now() | |
# Detailed analysis | |
print(f"\n" + "=" * 60) | |
print(f"π VALIDATION RESULTS") | |
print(f"=" * 60) | |
duration = (end_time - start_time).total_seconds() | |
accuracy = results["accuracy_metrics"]["accuracy_rate"] | |
success = results["accuracy_metrics"]["success_rate"] | |
print(f"β±οΈ Duration: {int(duration // 60)}m {int(duration % 60)}s") | |
print(f"β Accuracy: {accuracy:.1%} ({results['accuracy_metrics']['correct_answers']}/{results['completed_questions']})") | |
print(f"π― Success Rate: {success:.1%}") | |
# Question-by-question breakdown | |
print(f"\nπ DETAILED VALIDATION RESULTS:") | |
improvement_summary = {} | |
for i, result in enumerate(results["detailed_results"]): | |
task_id = result.task_id | |
status_icon = "β " if result.status == "CORRECT" else "π‘" if result.status == "PARTIAL" else "β" | |
# Map to question type | |
question_type = "Unknown" | |
if task_id == "f918266a-b3e0-4914-865d-4faa564f1aef": | |
question_type = "Python Execution" | |
elif task_id == "8e867cd7-cff9-4e6c-867a-ff5ddc2550be": | |
question_type = "Research (Mercedes Sosa)" | |
elif task_id == "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": | |
question_type = "Research (Wikipedia)" | |
elif task_id == "a1e91b78-d3d8-4675-bb8d-62741b4b68a6": | |
question_type = "Video Analysis" | |
elif task_id == "2d83110e-2e08-4bd7-b8c3-b97cbdb0fd59": | |
question_type = "Logic/Math" | |
elif task_id == "cca530fc-4052-43b2-b130-b30968d8aa44": | |
question_type = "Chess Analysis" | |
improvement_summary[question_type] = result.status | |
print(f" {i+1}. {status_icon} {question_type:20} | {result.status:9} | {result.accuracy_score:.0%}") | |
print(f" Expected: {result.expected_answer}") | |
print(f" Got: {result.our_answer}") | |
if result.status != "CORRECT": | |
print(f" Issue: {result.error_type or 'Answer mismatch'}") | |
print() | |
# Improvement assessment | |
print(f"π§ IMPROVEMENT ASSESSMENT:") | |
total_correct = sum(1 for status in improvement_summary.values() if status == "CORRECT") | |
total_tests = len(improvement_summary) | |
print(f" π Overall: {total_correct}/{total_tests} = {total_correct/total_tests:.1%} accuracy") | |
if accuracy >= 0.8: | |
print(f" π EXCELLENT: {accuracy:.1%} accuracy on key improvements!") | |
elif accuracy >= 0.7: | |
print(f" β TARGET MET: {accuracy:.1%} accuracy achieves 70%+ goal!") | |
elif accuracy >= 0.5: | |
print(f" π§ GOOD PROGRESS: {accuracy:.1%} accuracy, approaching target") | |
else: | |
print(f" β οΈ NEEDS MORE WORK: {accuracy:.1%} accuracy requires attention") | |
# Specific improvement tracking | |
print(f"\nπ― SPECIFIC IMPROVEMENTS:") | |
for question_type, status in improvement_summary.items(): | |
status_icon = "β " if status == "CORRECT" else "β" | |
print(f" {status_icon} {question_type}: {status}") | |
# Save validation results | |
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") | |
results_file = f"logs/accuracy_validation_{timestamp}.json" | |
with open(results_file, 'w') as f: | |
json.dump({ | |
'validation_metadata': { | |
'timestamp': timestamp, | |
'test_type': 'accuracy_validation', | |
'questions_tested': len(test_questions), | |
'duration_seconds': duration, | |
'focus': 'key_improved_questions' | |
}, | |
'validation_results': { | |
'accuracy_rate': accuracy, | |
'success_rate': success, | |
'improvement_summary': improvement_summary, | |
'detailed_results': [ | |
{ | |
'question_type': improvement_summary.get(r.task_id, 'Unknown'), | |
'task_id': r.task_id, | |
'status': r.status, | |
'accuracy_score': r.accuracy_score, | |
'our_answer': r.our_answer, | |
'expected_answer': r.expected_answer, | |
'duration': r.total_duration | |
} for r in results['detailed_results'] | |
] | |
} | |
}, f, indent=2) | |
print(f"\nπ Validation results saved to: {results_file}") | |
return results | |
except Exception as e: | |
print(f"β Validation test failed: {e}") | |
import traceback | |
traceback.print_exc() | |
return None | |
async def main(): | |
"""Run the accuracy validation test""" | |
results = await run_accuracy_validation_test() | |
if results: | |
accuracy = results["accuracy_metrics"]["accuracy_rate"] | |
print(f"\nπ Accuracy validation completed!") | |
print(f"π Key Questions Accuracy: {accuracy:.1%}") | |
if accuracy >= 0.7: | |
print(f"π― SUCCESS: 70%+ accuracy target achieved on improved questions!") | |
print(f"π System ready for production deployment!") | |
else: | |
gap = 0.7 - accuracy | |
print(f"π§ Progress made, {gap:.1%} gap remaining to 70% target") | |
if __name__ == "__main__": | |
asyncio.run(main()) |