Spaces:
Running
Running
#!/usr/bin/env python3 | |
"""Validate our multi-agent system's answers against known GAIA results."""
import json | |
import requests | |
from gaia_web_loader import GAIAQuestionLoaderWeb | |
from main import GAIASolver | |
from question_classifier import QuestionClassifier | |
# Reference table for offline validation, keyed by GAIA question id.
# Each entry stores the question text, the answer we expect, the answer our
# system produced, and the question category. The values were collected by
# hand for testing; entries flagged below have not been confirmed yet.
KNOWN_ANSWERS = {
    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": {
        "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        # Unconfirmed: expected value still needs to be verified.
        "expected_answer": "FunkMonk",
        "our_answer": "JuraForm",
        "category": "research",
    },
    "2d83110e-a098-4ebb-9987-066c06fa42d0": {
        # The question text is intentionally written backwards.
        "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
        "expected_answer": "right",
        "our_answer": "right",
        "category": "logic_math",
    },
    "cca530fc-4052-43b2-b130-b30968d8aa44": {
        "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
        # Unconfirmed: expected value still needs real chess analysis.
        "expected_answer": "Qxg2#",
        "our_answer": "Qxg2#",
        "category": "multimedia",
    },
}
def validate_answer(question_id: str, our_answer: str, expected_answer: str) -> dict:
    """Compare a produced answer with the expected answer.

    Both answers are passed through ``str()``, stripped of surrounding
    whitespace and lower-cased before any comparison is made.

    Args:
        question_id: Identifier of the question. It is accepted so callers
            can pass it along, but it does not take part in the comparison.
        our_answer: The answer produced by the system.
        expected_answer: The reference answer.

    Returns:
        A dict with the keys:
            ``exact_match``: True when the normalised answers are equal.
            ``contains_match``: True when the answers are equal, or when both
                are non-empty and one is a substring of the other.
            ``similarity_score``: Number of shared whitespace-separated
                tokens divided by the larger token-set size (0.0 when
                neither answer has tokens).
            ``our_answer`` / ``expected_answer``: The inputs, unmodified.
            ``status``: ``"CORRECT"`` on exact match, ``"PARTIAL"`` on a
                contains match, otherwise ``"INCORRECT"``.
    """
    our_clean = str(our_answer).strip().lower()
    expected_clean = str(expected_answer).strip().lower()

    exact_match = our_clean == expected_clean

    # The empty string is a substring of every string, so a plain ``in`` test
    # would report a blank answer as a partial match for anything. Require
    # both sides to be non-empty before trying the substring comparison.
    contains_match = exact_match or (
        bool(our_clean)
        and bool(expected_clean)
        and (expected_clean in our_clean or our_clean in expected_clean)
    )

    # Rough token overlap; the trailing 1 avoids dividing by zero when both
    # answers are blank.
    our_tokens = set(our_clean.split())
    expected_tokens = set(expected_clean.split())
    similarity = len(our_tokens & expected_tokens) / max(
        len(our_tokens), len(expected_tokens), 1
    )

    if exact_match:
        status = "CORRECT"
    elif contains_match:
        status = "PARTIAL"
    else:
        status = "INCORRECT"

    return {
        "exact_match": exact_match,
        "contains_match": contains_match,
        "similarity_score": similarity,
        "our_answer": our_answer,
        "expected_answer": expected_answer,
        "status": status,
    }
def test_validation_system():
    """Run every entry of ``KNOWN_ANSWERS`` through ``validate_answer`` and print a report.

    For each question the function prints the id prefix, category, truncated
    question text and the validation result, then prints overall totals.
    Nothing is returned; the report goes to standard output.

    An empty ``KNOWN_ANSWERS`` table yields a summary with 0.0% rates instead
    of raising ``ZeroDivisionError``.
    """
    # NOTE(review): the non-ASCII prefixes in the messages below look like
    # mis-decoded emoji. They are reproduced unchanged because the intended
    # characters cannot be recovered with certainty from this file.
    print("π§ͺ GAIA ANSWER VALIDATION SYSTEM")
    print("=" * 60)

    total_tests = len(KNOWN_ANSWERS)
    correct_count = 0
    partial_count = 0

    def percent(count):
        # Guard the division so an empty answer table does not crash the summary.
        return count / total_tests * 100 if total_tests else 0.0

    for question_id, data in KNOWN_ANSWERS.items():
        print(f"\nπ Testing Question: {question_id[:8]}...")
        print(f"Category: {data['category']}")
        print(f"Question: {data['question'][:80]}...")

        # Compare the recorded system answer with the expected one.
        validation = validate_answer(
            question_id,
            data['our_answer'],
            data['expected_answer'],
        )

        print("\nπ VALIDATION RESULTS:")
        print(f"Our Answer: {validation['our_answer']}")
        print(f"Expected: {validation['expected_answer']}")
        print(f"Status: {validation['status']}")
        print(f"Exact Match: {validation['exact_match']}")
        print(f"Contains Match: {validation['contains_match']}")
        print(f"Similarity: {validation['similarity_score']:.2f}")

        if validation['status'] == "CORRECT":
            correct_count += 1
            print("β CORRECT!")
        elif validation['status'] == "PARTIAL":
            partial_count += 1
            print("π‘ PARTIAL MATCH")
        else:
            print("β INCORRECT")

    print("\nπ OVERALL VALIDATION SUMMARY:")
    print("=" * 60)
    print(f"Total Questions Tested: {total_tests}")
    print(f"Correct Answers: {correct_count} ({percent(correct_count):.1f}%)")
    print(f"Partial Matches: {partial_count} ({percent(partial_count):.1f}%)")
    print(f"Incorrect: {total_tests - correct_count - partial_count}")
    # Partial matches count towards the overall success rate.
    print(f"Overall Success Rate: {percent(correct_count + partial_count):.1f}%")
def research_correct_answer():
    """Print a manual research checklist for the Wikipedia dinosaur question.

    The function performs no lookups and computes nothing: it writes a fixed
    set of notes (question, research steps, strategy, our recorded answer and
    an alternative approach) to standard output and returns ``None``.
    """
    # Every entry is emitted with its own print() call, in order.
    notes = (
        "\nπ RESEARCHING CORRECT ANSWER FOR DINOSAUR QUESTION",
        "=" * 60,
        "Question: Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        "\nπ΅οΈ Research Process:",
        "1. Need to find Featured Articles promoted in November 2016",
        "2. Identify which one was about a dinosaur",
        "3. Find the nominator",
        "\nπ‘ Research Strategy:",
        "- Check Wikipedia's Featured Article log for November 2016",
        "- Look for dinosaur-related articles promoted that month",
        "- Find nomination information",
        "\nπ€ Our Answer: JuraForm",
        "β Need to verify: Was this correct?",
        "\nπ Alternative Research Approach:",
        "- Search for 'Spinosaurus' article on Wikipedia",
        "- Check its promotion history",
        "- Verify nomination details",
    )
    for note in notes:
        print(note)
if __name__ == "__main__":
    # Script entry point: print the validation report first, then the
    # research notes for the dinosaur question.
    for step in (test_validation_system, research_correct_answer):
        step()