#!/usr/bin/env python3
"""
Validate all GAIA questions with our multi-agent system
"""
import json
import time
from typing import Dict, List

from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
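# gaia_web_loader, main, and question_classifier are local modules from this
# repository; the script assumes they are importable from the working directory.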
def solve_all_questions_with_validation():
    """Solve all 20 GAIA questions and collect results for validation"""
    print("🧪 COMPREHENSIVE GAIA VALIDATION - ALL 20 QUESTIONS")
    print("=" * 70)

    # Initialize components
    print("🚀 Initializing multi-agent system...")
    loader = GAIAQuestionLoaderWeb()
    classifier = QuestionClassifier()
    solver = GAIASolver()

    questions = loader.questions
    results = []
    print(f"📋 Found {len(questions)} questions to solve")

    for i, question_data in enumerate(questions, 1):
        task_id = question_data.get('task_id', 'unknown')
        question_text = question_data.get('question', '')
        file_name = question_data.get('file_name', '')

        print(f"\n{'='*60}")
        print(f"QUESTION {i}/{len(questions)}: {task_id[:8]}...")
        print(f"{'='*60}")

        # Reset so an error raised before classification is recorded as None
        # rather than reusing the previous question's classification.
        classification = None

        try:
            # Classification phase
            print(f"🧠 CLASSIFICATION:")
            classification = classifier.classify_question(question_text, file_name)
            routing = classifier.get_routing_recommendation(classification)

            print(f"   Primary Agent: {classification['primary_agent']}")
            print(f"   Secondary: {classification.get('secondary_agents', [])}")
            print(f"   Complexity: {classification['complexity']}/5")
            print(f"   Confidence: {classification['confidence']:.3f}")

            # Solving phase
            print(f"\n🤖 SOLVING:")
            print(f"   Question: {question_text[:100]}...")
            if file_name:
                print(f"   File: {file_name}")

            start_time = time.time()
            answer = solver.solve_question(question_data)
            solve_time = time.time() - start_time

            print(f"   ✅ Answer: {answer[:100]}...")
            print(f"   ⏱️ Time: {solve_time:.1f}s")

            # Store results
            result = {
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': {
                    'primary_agent': classification['primary_agent'],
                    'secondary_agents': classification.get('secondary_agents', []),
                    'complexity': classification['complexity'],
                    'confidence': classification['confidence'],
                    'tools_needed': classification.get('tools_needed', [])
                },
                'routing': {
                    'coordination_needed': routing['requires_coordination'],
                    'duration_estimate': routing['estimated_duration']
                },
                'answer': answer,
                'solve_time': solve_time,
                'status': 'completed'
            }
            results.append(result)

        except Exception as e:
            print(f"   ❌ Error: {e}")
            # Store error result
            error_result = {
                'question_id': task_id,
                'question': question_text,
                'file_name': file_name,
                'classification': classification,
                'answer': f"Error: {str(e)}",
                'solve_time': 0,
                'status': 'error'
            }
            results.append(error_result)

        # Small delay to avoid overwhelming APIs
        time.sleep(1)

    return results
def analyze_results(results: List[Dict]):
    """Analyze the solving results"""
    print(f"\n📊 COMPREHENSIVE RESULTS ANALYSIS")
    print("=" * 70)

    total_questions = len(results)
    completed = len([r for r in results if r['status'] == 'completed'])
    errors = len([r for r in results if r['status'] == 'error'])

    print(f"📈 OVERALL STATISTICS:")
    print(f"   Total Questions: {total_questions}")
    print(f"   Successfully Solved: {completed} ({completed/total_questions*100:.1f}%)")
    print(f"   Errors: {errors} ({errors/total_questions*100:.1f}%)")

    if completed > 0:
        completed_results = [r for r in results if r['status'] == 'completed']
        avg_time = sum(r['solve_time'] for r in completed_results) / len(completed_results)
        print(f"   Average Solve Time: {avg_time:.1f}s")

    # Classification analysis
    print(f"\n🎯 CLASSIFICATION ANALYSIS:")
    agent_counts = {}
    complexity_counts = {}
    confidence_scores = []

    for result in results:
        if result['classification']:
            primary = result['classification']['primary_agent']
            agent_counts[primary] = agent_counts.get(primary, 0) + 1

            complexity = result['classification']['complexity']
            complexity_counts[complexity] = complexity_counts.get(complexity, 0) + 1

            confidence_scores.append(result['classification']['confidence'])

    print(f"   Agent Distribution:")
    for agent, count in sorted(agent_counts.items()):
        percentage = (count / total_questions) * 100
        print(f"     {agent}: {count} questions ({percentage:.1f}%)")

    print(f"   Complexity Distribution:")
    for complexity, count in sorted(complexity_counts.items()):
        percentage = (count / total_questions) * 100
        print(f"     Level {complexity}: {count} questions ({percentage:.1f}%)")

    if confidence_scores:
        avg_confidence = sum(confidence_scores) / len(confidence_scores)
        print(f"   Average Classification Confidence: {avg_confidence:.3f}")

    # Question type analysis
    print(f"\n📋 QUESTION BREAKDOWN:")
    for i, result in enumerate(results, 1):
        status_emoji = "✅" if result['status'] == 'completed' else "❌"
        task_id = result['question_id'][:8]
        primary_agent = result['classification']['primary_agent'] if result['classification'] else 'unknown'
        answer_preview = result['answer'][:50] + "..." if len(result['answer']) > 50 else result['answer']
        print(f"   {i:2d}. {status_emoji} {task_id}... [{primary_agent}] {answer_preview}")
def save_results(results: List[Dict]):
    """Save results to JSON file for further analysis"""
    output_file = "gaia_validation_results.json"

    # ensure_ascii=False can emit non-ASCII characters, so write the file as UTF-8
    # explicitly rather than relying on the platform default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Results saved to: {output_file}")
    print(f"📄 Use this file to compare with official GAIA answers")
def main():
    """Main validation workflow"""
    print("🎯 Starting comprehensive GAIA validation...")
    print("⚠️ This will take several minutes to complete all 20 questions")

    # Solve all questions
    results = solve_all_questions_with_validation()

    # Analyze results
    analyze_results(results)

    # Save for comparison
    save_results(results)

    print(f"\n✅ VALIDATION COMPLETE!")
    print(f"📄 Check gaia_validation_results.json for detailed results")
    print(f"🔍 Compare answers with official GAIA dataset when available")


if __name__ == "__main__":
    main()