# Page header captured with this file -- Spaces: Running
# File size: 5,366 bytes | c262d1a
#!/usr/bin/env python3
"""
Validate our multi-agent system answers against known GAIA results
"""
import json
import requests
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
# Hand-collected reference data for a few GAIA validation questions.
# Keyed by question id; every entry stores the question text, the answer we
# expect, the answer our system gave, and the question's category.
KNOWN_ANSWERS = {
    "4fc2f1ae-8625-45b5-ab34-ad4433bc21f8": {
        "question": "Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        # Expected answer below has not been verified yet.
        "expected_answer": "FunkMonk",
        "our_answer": "JuraForm",
        "category": "research",
    },
    "2d83110e-a098-4ebb-9987-066c06fa42d0": {
        # The question text is intentionally written backwards.
        "question": ".rewsna eht sa \"tfel\" drow eht fo etisoppo eht etirw ,ecnetnes siht dnatsrednu uoy fI",
        "expected_answer": "right",
        "our_answer": "right",
        "category": "logic_math",
    },
    "cca530fc-4052-43b2-b130-b30968d8aa44": {
        "question": "Review the chess position provided in the image. It is black's turn. Provide the correct next move for black which guarantees a win. Please provide your response in algebraic notation.",
        # Expected answer below still needs checking with real chess analysis.
        "expected_answer": "Qxg2#",
        "our_answer": "Qxg2#",
        "category": "multimedia",
    },
}
def validate_answer(question_id: str, our_answer: str, expected_answer: str) -> dict:
    """Compare an answer from our system with the expected answer.

    Both answers are converted to ``str``, stripped of surrounding whitespace
    and lower-cased before being compared. The report echoes the answers
    exactly as they were passed in.

    Args:
        question_id: Identifier of the question. It does not take part in the
            comparison; it is accepted so callers can pass it with the answers.
        our_answer: Answer produced by our system.
        expected_answer: Reference answer to compare against.

    Returns:
        A dict with the keys ``exact_match`` (bool), ``contains_match`` (bool),
        ``similarity_score`` (float in ``[0, 1]``), ``our_answer``,
        ``expected_answer`` and ``status``. ``status`` is ``"CORRECT"`` on an
        exact match, ``"PARTIAL"`` when one cleaned answer contains the other,
        and ``"INCORRECT"`` otherwise.
    """
    # Normalise both sides so case and surrounding whitespace do not matter.
    our_clean = str(our_answer).strip().lower()
    expected_clean = str(expected_answer).strip().lower()

    exact_match = our_clean == expected_clean

    # Substring match in either direction (useful for longer answers).
    # The empty string is a substring of every string, so without the
    # non-empty guards a blank answer would count as a partial match.
    contains_match = (
        bool(our_clean)
        and bool(expected_clean)
        and (expected_clean in our_clean or our_clean in expected_clean)
    )

    # Rough similarity: shared distinct words divided by the larger distinct
    # word count. The trailing 1 keeps the divisor non-zero for blank answers.
    our_words = set(our_clean.split())
    expected_words = set(expected_clean.split())
    similarity = len(our_words & expected_words) / max(
        len(our_words), len(expected_words), 1
    )

    if exact_match:
        status = "CORRECT"
    elif contains_match:
        status = "PARTIAL"
    else:
        status = "INCORRECT"

    return {
        "exact_match": exact_match,
        "contains_match": contains_match,
        "similarity_score": similarity,
        "our_answer": our_answer,
        "expected_answer": expected_answer,
        "status": status,
    }
def test_validation_system():
    """Print a validation report for every entry in ``KNOWN_ANSWERS``.

    Each entry's stored ``our_answer`` is compared with its ``expected_answer``
    via ``validate_answer``. The per-question result is printed, followed by a
    summary with correct / partial / incorrect counts and percentages.
    Nothing is returned.
    """
    print("π§ͺ GAIA ANSWER VALIDATION SYSTEM")
    print("=" * 60)

    total_tests = len(KNOWN_ANSWERS)
    correct_count = 0
    partial_count = 0

    def percent(count: int) -> float:
        # An empty answer table must not crash the summary with a
        # ZeroDivisionError; report 0% instead.
        return count / total_tests * 100 if total_tests else 0.0

    for question_id, data in KNOWN_ANSWERS.items():
        # Only short prefixes of the id and the question are shown.
        print(f"\nπ Testing Question: {question_id[:8]}...")
        print(f"Category: {data['category']}")
        print(f"Question: {data['question'][:80]}...")

        validation = validate_answer(
            question_id,
            data['our_answer'],
            data['expected_answer'],
        )

        print("\nπ VALIDATION RESULTS:")
        print(f"Our Answer: {validation['our_answer']}")
        print(f"Expected: {validation['expected_answer']}")
        print(f"Status: {validation['status']}")
        print(f"Exact Match: {validation['exact_match']}")
        print(f"Contains Match: {validation['contains_match']}")
        print(f"Similarity: {validation['similarity_score']:.2f}")

        if validation['status'] == "CORRECT":
            correct_count += 1
            # This literal must stay on one line; a mis-decoded check-mark
            # glyph had split it across two lines, which is a syntax error.
            # NOTE(review): the other marker glyphs in this function look
            # mis-encoded as well -- confirm against the original file.
            print("✅ CORRECT!")
        elif validation['status'] == "PARTIAL":
            partial_count += 1
            print("π‘ PARTIAL MATCH")
        else:
            print("β INCORRECT")

    print("\nπ OVERALL VALIDATION SUMMARY:")
    print("=" * 60)
    print(f"Total Questions Tested: {total_tests}")
    print(f"Correct Answers: {correct_count} ({percent(correct_count):.1f}%)")
    print(f"Partial Matches: {partial_count} ({percent(partial_count):.1f}%)")
    print(f"Incorrect: {total_tests - correct_count - partial_count}")
    # Partial matches count towards the overall success rate.
    print(f"Overall Success Rate: {percent(correct_count + partial_count):.1f}%")
def research_correct_answer():
    """Print research notes for the Wikipedia dinosaur question.

    Emits a fixed checklist to stdout: the question, the research steps, a
    strategy, our current answer, and an alternative approach. Nothing is
    looked up and nothing is returned.
    """
    notes = (
        "\nπ RESEARCHING CORRECT ANSWER FOR DINOSAUR QUESTION",
        "=" * 60,
        "Question: Who nominated the only Featured Article on English Wikipedia about a dinosaur that was promoted in November 2016?",
        # Steps needed to answer the question.
        "\nπ΅οΈ Research Process:",
        "1. Need to find Featured Articles promoted in November 2016",
        "2. Identify which one was about a dinosaur",
        "3. Find the nominator",
        # Where to look.
        "\nπ‘ Research Strategy:",
        "- Check Wikipedia's Featured Article log for November 2016",
        "- Look for dinosaur-related articles promoted that month",
        "- Find nomination information",
        # Answer our system gave; still unverified.
        "\nπ€ Our Answer: JuraForm",
        "β Need to verify: Was this correct?",
        # Fallback plan.
        "\nπ Alternative Research Approach:",
        "- Search for 'Spinosaurus' article on Wikipedia",
        "- Check its promotion history",
        "- Verify nomination details",
    )
    for note in notes:
        print(note)
if __name__ == "__main__":
    # Script entry point: print the validation report, then the research
    # notes for the dinosaur question.
    test_validation_system()
    research_correct_answer()