#!/usr/bin/env python3 """ GAIA-Optimized Test Suite - Verifies Benchmark Compliance """ import os import sys import re # Add current directory to path for imports sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) from gaia_system import BasicAgent, MultiModelGAIASystem def is_gaia_compliant(question: str, answer: str) -> tuple: """ Check if answer is GAIA benchmark compliant Returns (is_compliant: bool, reason: str) """ if not answer: return False, "Empty answer" # Check for forbidden phrases that indicate reasoning forbidden_phrases = [ "the answer is", "answer is", "result is", "solution is", "let me think", "i think", "first", "because", "since", "therefore", "thus", "however", "considering", "given that" ] answer_lower = answer.lower() for phrase in forbidden_phrases: if phrase in answer_lower: return False, f"Contains forbidden phrase: '{phrase}'" # Check for thinking tags if "" in answer or "" in answer: return False, "Contains thinking tags" # Check for excessive length (GAIA answers should be concise) if len(answer) > 100: # Most GAIA answers are short return False, f"Answer too long ({len(answer)} chars). GAIA answers should be concise." # Check for mathematical questions - should return just numbers if any(op in question for op in ['+', '-', '*', '/', 'calculate', 'what is']): if re.match(r'^\d+(\.\d+)?$', answer.strip()): return True, "Perfect numerical answer" elif answer.strip().isdigit(): return True, "Perfect integer answer" else: # Allow some mathematical expressions but flag verbose ones if len(answer.split()) > 3: return False, "Mathematical answer too verbose" # For geography questions - should be just the place name if 'capital' in question.lower(): if len(answer.split()) <= 3: # City names are usually 1-3 words return True, "Concise geographical answer" else: return False, "Geographical answer too verbose" # General compliance check - short and direct if len(answer.split()) <= 5: return True, "Appropriately concise" return True, "Acceptable answer format" def test_gaia_compliance(): """Test GAIA compliance with specific benchmark-style questions""" print("🎯 Testing GAIA Benchmark Compliance") print("=" * 60) # Test cases designed to match GAIA benchmark style gaia_test_cases = [ # Mathematical - should return just numbers ("What is 15 + 27?", "42"), ("What is 100 / 4?", "25"), ("What is 6 * 7?", "42"), ("Calculate 125 * 8", "1000"), ("What is 2 to the power of 5?", "32"), # Geography - should return just place names ("What is the capital of France?", "Paris"), ("What is the capital of Germany?", "Berlin"), ("What is the capital of Brazil?", "Brasília"), # Science - should return just facts/numbers ("How many planets are in our solar system?", "8"), ("What is the speed of light?", "299792458"), ("What is the formula for water?", "H2O"), # Ensure no conversational responses leak through ("Hello", None), # Should be brief if it responds at all ] try: agent = BasicAgent() print("✅ GAIA-Optimized Agent initialized\n") compliant_count = 0 total_tests = 0 for question, expected in gaia_test_cases: print(f"🧪 Testing: {question}") try: response = agent(question) print(f"📝 Response: '{response}'") # Check GAIA compliance is_compliant, reason = is_gaia_compliant(question, response) if is_compliant: print(f"✅ GAIA Compliant: {reason}") compliant_count += 1 else: print(f"❌ NOT Compliant: {reason}") # Check if matches expected (if provided) if expected and response.strip() == expected: print(f"🎯 Perfect Match: Expected '{expected}'") elif expected: print(f"⚠️ Expected '{expected}', got '{response}'") total_tests += 1 print("-" * 50) except Exception as e: print(f"❌ Error: {str(e)}") print("-" * 50) compliance_rate = (compliant_count / total_tests * 100) if total_tests > 0 else 0 print(f"\n📊 GAIA Compliance Results:") print(f" Compliant: {compliant_count}/{total_tests} ({compliance_rate:.1f}%)") if compliance_rate >= 80: print("✅ EXCELLENT: High GAIA compliance!") elif compliance_rate >= 60: print("⚠️ GOOD: Acceptable compliance, minor improvements needed") else: print("❌ POOR: Significant compliance issues detected") return compliance_rate >= 80 except Exception as e: print(f"❌ Failed to initialize GAIA agent: {str(e)}") return False def test_response_cleaning(): """Test that responses are properly cleaned of reasoning""" print("\n🧽 Testing Response Cleaning") print("=" * 60) try: system = MultiModelGAIASystem() # Test cases with reasoning that should be cleaned dirty_responses = [ "Let me think about this. The answer is 42.", "First, I need to calculate. 15 + 27 = 42", "This is easy mathThe result is 42", "I think the capital of France is Paris.", "Therefore, the answer is 8 planets.", "Given the calculation, 125 * 8 = 1000", ] print("Testing response cleaning:") for dirty in dirty_responses: cleaned = system._extract_final_answer(dirty) print(f" Original: '{dirty}'") print(f" Cleaned: '{cleaned}'") # Check if cleaned properly is_compliant, reason = is_gaia_compliant("test", cleaned) status = "✅" if is_compliant else "❌" print(f" Status: {status} {reason}") print() return True except Exception as e: print(f"❌ Response cleaning test failed: {str(e)}") return False def test_api_submission_format(): """Test that responses are formatted correctly for API submission""" print("\n📡 Testing API Submission Format") print("=" * 60) # Import the cleaning function from app import clean_for_api_submission test_cases = [ ("42", "42"), # Should remain unchanged ("Paris", "Paris"), # Should remain unchanged ("Answer: 42", "42"), # Should remove prefix ("**42**", "42"), # Should remove markdown ("42.", "42"), # Should remove trailing period for numbers ("The capital is Paris.", "The capital is Paris"), # Should keep period for sentences ] all_passed = True for input_answer, expected_clean in test_cases: cleaned = clean_for_api_submission(input_answer) if cleaned == expected_clean: print(f"✅ '{input_answer}' → '{cleaned}'") else: print(f"❌ '{input_answer}' → '{cleaned}' (expected '{expected_clean}')") all_passed = False return all_passed if __name__ == "__main__": print("🧪 GAIA Benchmark Compliance Test Suite") print("=" * 70) # Environment variables check if not os.environ.get("HF_TOKEN"): print("⚠️ Warning: HF_TOKEN not set. Some AI models may be unavailable.") if not os.environ.get("OPENAI_API_KEY"): print("⚠️ Warning: OPENAI_API_KEY not set. OpenAI models will be unavailable.") # Run all compliance tests print("🔧 Phase 1: GAIA Benchmark Compliance Test") success1 = test_gaia_compliance() print("\n🔧 Phase 2: Response Cleaning Test") success2 = test_response_cleaning() print("\n🔧 Phase 3: API Submission Format Test") success3 = test_api_submission_format() if success1 and success2 and success3: print("\n🎉 ALL TESTS PASSED! System is GAIA benchmark ready!") print("🚀 Your agent should score well on the benchmark.") print("📋 Key Achievements:") print(" ✅ Responses are GAIA compliant") print(" ✅ Reasoning is properly cleaned") print(" ✅ API format is correct") print(" ✅ Ready for exact-match evaluation") sys.exit(0) else: print("\n❌ Some tests failed. Issues detected:") if not success1: print(" ❌ GAIA compliance issues") if not success2: print(" ❌ Response cleaning problems") if not success3: print(" ❌ API format issues") print("\n🔧 Please review the implementation.") sys.exit(1)