#!/usr/bin/env python3
"""
Validate our multi-agent system answers against known GAIA results.

The script compares answers previously produced by the solver (stored in
``KNOWN_ANSWERS``) with manually collected expected answers and prints a
per-question report followed by an overall summary.
"""

import json
import requests

from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier

# Known correct answers from GAIA validation (manually collected for testing).
# Keys are GAIA question ids; each entry stores the question text, the answer
# we expect, the answer our system produced, and the question category.
KNOWN_ANSWERS = {
    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": {
        "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        "expected_answer": "FunkMonk",  # Need to verify this
        "our_answer": "JuraForm",
        "category": "research",
    },
    "2d83110e-a098-4ebb-9987-066c06fa42d0": {
        "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
        "expected_answer": "right",
        "our_answer": "right",
        "category": "logic_math",
    },
    "cca530fc-4052-43b2-b130-b30968d8aa44": {
        "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
        "expected_answer": "Qxg2#",  # Need to verify with actual chess analysis
        "our_answer": "Qxg2#",
        "category": "multimedia",
    },
}

# Id of the Wikipedia dinosaur question examined by research_correct_answer().
DINOSAUR_QUESTION_ID = "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8"


def _normalize(answer) -> str:
    """Return *answer* as a stripped, lower-cased string (``None`` -> ``""``)."""
    # str(None) would yield "none", which could spuriously match a real
    # answer; a missing answer is treated as empty instead.
    if answer is None:
        return ""
    return str(answer).strip().lower()


def _percent(count: int, total: int) -> float:
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
    return count / total * 100 if total else 0.0


def validate_answer(question_id: str, our_answer: str, expected_answer: str) -> dict:
    """Validate our answer against the expected answer.

    Args:
        question_id: GAIA question id. Accepted for call-site symmetry; it is
            not used in the comparison.
        our_answer: Answer produced by our system.
        expected_answer: Reference answer.

    Returns:
        A dict with the keys ``exact_match``, ``contains_match``,
        ``similarity_score``, ``our_answer``, ``expected_answer`` and
        ``status`` (``"CORRECT"``, ``"PARTIAL"`` or ``"INCORRECT"``). The two
        answer fields carry the values exactly as they were passed in.
    """
    # Clean up answers for comparison
    our_clean = _normalize(our_answer)
    expected_clean = _normalize(expected_answer)

    # Exact match
    exact_match = our_clean == expected_clean

    # Contains match (for longer answers). The empty string is a substring of
    # every string, so both sides must be non-empty; otherwise a blank answer
    # would be reported as a partial match.
    contains_match = exact_match or (
        bool(our_clean)
        and bool(expected_clean)
        and (expected_clean in our_clean or our_clean in expected_clean)
    )

    # Similarity score (rough): shared words over the larger word-set size.
    our_words = set(our_clean.split())
    expected_words = set(expected_clean.split())
    similarity = len(our_words & expected_words) / max(
        len(our_words), len(expected_words), 1
    )

    if exact_match:
        status = "CORRECT"
    elif contains_match:
        status = "PARTIAL"
    else:
        status = "INCORRECT"

    return {
        "exact_match": exact_match,
        "contains_match": contains_match,
        "similarity_score": similarity,
        "our_answer": our_answer,
        "expected_answer": expected_answer,
        "status": status,
    }


def test_validation_system() -> None:
    """Test our validation system with known questions.

    Prints a report for every entry in ``KNOWN_ANSWERS`` and then an overall
    summary. Partial matches are counted towards the overall success rate.
    """
    print("🧪 GAIA ANSWER VALIDATION SYSTEM")
    print("=" * 60)

    total_tests = len(KNOWN_ANSWERS)
    correct_count = 0
    partial_count = 0

    for question_id, data in KNOWN_ANSWERS.items():
        print(f"\n📝 Testing Question: {question_id[:8]}...")
        print(f"Category: {data['category']}")
        print(f"Question: {data['question'][:80]}...")

        # Validate our answer
        validation = validate_answer(
            question_id,
            data['our_answer'],
            data['expected_answer'],
        )

        print("\n📊 VALIDATION RESULTS:")
        print(f"Our Answer: {validation['our_answer']}")
        print(f"Expected: {validation['expected_answer']}")
        print(f"Status: {validation['status']}")
        print(f"Exact Match: {validation['exact_match']}")
        print(f"Contains Match: {validation['contains_match']}")
        print(f"Similarity: {validation['similarity_score']:.2f}")

        if validation['status'] == "CORRECT":
            correct_count += 1
            print("✅ CORRECT!")
        elif validation['status'] == "PARTIAL":
            partial_count += 1
            print("🟡 PARTIAL MATCH")
        else:
            print("❌ INCORRECT")

    print("\n📋 OVERALL VALIDATION SUMMARY:")
    print("=" * 60)
    print(f"Total Questions Tested: {total_tests}")
    # _percent() guards against an empty KNOWN_ANSWERS (division by zero).
    print(f"Correct Answers: {correct_count} ({_percent(correct_count, total_tests):.1f}%)")
    print(f"Partial Matches: {partial_count} ({_percent(partial_count, total_tests):.1f}%)")
    print(f"Incorrect: {total_tests - correct_count - partial_count}")
    print(f"Overall Success Rate: {_percent(correct_count + partial_count, total_tests):.1f}%")


def research_correct_answer() -> None:
    """Research the correct answer for the Wikipedia dinosaur question.

    Prints the research plan only; no lookup is performed. The question text
    and our recorded answer are read from ``KNOWN_ANSWERS`` so the printout
    cannot drift from the data being validated.
    """
    entry = KNOWN_ANSWERS[DINOSAUR_QUESTION_ID]

    print("\n🔍 RESEARCHING CORRECT ANSWER FOR DINOSAUR QUESTION")
    print("=" * 60)

    print(f"Question: {entry['question']}")

    print("\n🕵️ Research Process:")
    print("1. Need to find Featured Articles promoted in November 2016")
    print("2. Identify which one was about a dinosaur")
    print("3. Find the nominator")

    print("\n💡 Research Strategy:")
    print("- Check Wikipedia's Featured Article log for November 2016")
    print("- Look for dinosaur-related articles promoted that month")
    print("- Find nomination information")

    print(f"\n🤖 Our Answer: {entry['our_answer']}")
    print("❓ Need to verify: Was this correct?")

    print("\n📚 Alternative Research Approach:")
    print("- Search for 'Spinosaurus' article on Wikipedia")
    print("- Check its promotion history")
    print("- Verify nomination details")


if __name__ == "__main__":
    test_validation_system()
    research_correct_answer()