#!/usr/bin/env python3 """ Quick Clean Test - Test 5 representative questions without overrides """ import os import sys import json import time from pathlib import Path from dotenv import load_dotenv # Load environment variables load_dotenv() # Add parent directory to path for imports sys.path.append(str(Path(__file__).parent.parent)) # Local imports from gaia_web_loader import GAIAQuestionLoaderWeb from main import GAIASolver from question_classifier import QuestionClassifier def load_validation_answers(): """Load correct answers from GAIA validation metadata""" answers = {} try: validation_path = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl' with open(validation_path, 'r') as f: for line in f: if line.strip(): data = json.loads(line.strip()) task_id = data.get('task_id') final_answer = data.get('Final answer') if task_id and final_answer: answers[task_id] = final_answer except Exception as e: print(f"โš ๏ธ Could not load validation data: {e}") return answers def validate_answer(task_id: str, our_answer: str, validation_answers: dict): """Validate our answer against the correct answer""" if task_id not in validation_answers: return None expected = str(validation_answers[task_id]).strip() our_clean = str(our_answer).strip() # Exact match if our_clean.lower() == expected.lower(): return {"status": "CORRECT", "expected": expected, "our": our_clean} # Check if our answer contains the expected answer if expected.lower() in our_clean.lower(): return {"status": "PARTIAL", "expected": expected, "our": our_clean} return {"status": "INCORRECT", "expected": expected, "our": our_clean} def test_single_question(question_data, validation_answers, model="qwen3-235b"): """Test a single question without any overrides""" task_id = question_data.get('task_id', 'unknown') try: print(f"๐Ÿงช [{task_id[:8]}...] Starting...") # Initialize solver and classifier solver = GAIASolver(use_kluster=True, kluster_model=model) classifier = QuestionClassifier() # Classify the question question_text = question_data.get('question', '') file_name = question_data.get('file_name', '') classification = classifier.classify_question(question_text, file_name) # Solve the question (NO OVERRIDES - pure LLM reasoning) start_time = time.time() answer = solver.solve_question(question_data) end_time = time.time() duration = end_time - start_time # Validate answer validation_result = validate_answer(task_id, answer, validation_answers) result = { 'task_id': task_id, 'question_type': classification['primary_agent'], 'our_answer': str(answer), 'expected_answer': validation_result['expected'] if validation_result else 'N/A', 'status': validation_result['status'] if validation_result else 'NO_VALIDATION', 'duration': duration, } status_icon = "โœ…" if result['status'] == "CORRECT" else "๐ŸŸก" if result['status'] == "PARTIAL" else "โŒ" print(f"{status_icon} [{task_id[:8]}...] {result['status']} | {result['question_type']} | {duration:.1f}s") print(f" Expected: {result['expected_answer']}") print(f" Got: {result['our_answer']}") return result except Exception as e: print(f"โŒ [{task_id[:8]}...] ERROR: {str(e)}") return { 'task_id': task_id, 'question_type': 'error', 'our_answer': '', 'expected_answer': validation_answers.get(task_id, 'N/A'), 'status': 'ERROR', 'duration': 0.0, 'error': str(e) } def run_quick_clean_test(): """Run quick clean test on 5 representative questions""" print("๐Ÿงช QUICK CLEAN TEST - NO OVERRIDES") print("=" * 50) print("๐ŸŽฏ Testing 5 representative questions") print("๐Ÿšซ No hardcoded answers or overrides") print("๐Ÿค– Pure LLM + Tools reasoning only") print() # Load questions and validation data loader = GAIAQuestionLoaderWeb() all_questions = loader.questions validation_answers = load_validation_answers() # Select 5 representative questions across different types test_question_ids = [ "8e867cd7-cff9-4e6c-867a-ff5ddc2550be", # Research (Mercedes Sosa) "a1e91b78-d3d8-4675-bb8d-62741b4b68a6", # Video Analysis (bird species) "2d83110e-a098-4ebb-9987-066c06fa42d0", # Logic/Math (text reversal) "cca530fc-4052-43b2-b130-b30968d8aa44", # Chess Analysis "f918266a-b3e0-4914-865d-4faa564f1aef", # Python execution ] test_questions = [] for q in all_questions: if q.get('task_id') in test_question_ids: test_questions.append(q) print(f"โœ… Selected {len(test_questions)} test questions") # Show questions print(f"\n๐Ÿ“‹ Test Questions:") for i, q in enumerate(test_questions): task_id = q.get('task_id', 'unknown') question_preview = q.get('question', '')[:40] + "..." expected = validation_answers.get(task_id, 'N/A') print(f" {i+1}. {task_id[:8]}... โ†’ {expected}") print(f" {question_preview}") print(f"\n๐Ÿš€ Starting quick clean test...") # Process questions start_time = time.time() results = [] for i, question_data in enumerate(test_questions): print(f"\n๐Ÿ“Š Progress: {i+1}/{len(test_questions)}") result = test_single_question(question_data, validation_answers) results.append(result) end_time = time.time() total_duration = end_time - start_time # Analyze results print(f"\n" + "=" * 50) print(f"๐Ÿ QUICK CLEAN TEST RESULTS") print(f"=" * 50) # Calculate metrics total_questions = len(results) correct_answers = len([r for r in results if r['status'] == 'CORRECT']) partial_answers = len([r for r in results if r['status'] == 'PARTIAL']) incorrect_answers = len([r for r in results if r['status'] == 'INCORRECT']) errors = len([r for r in results if r['status'] == 'ERROR']) accuracy_rate = correct_answers / total_questions * 100 success_rate = (correct_answers + partial_answers) / total_questions * 100 print(f"โฑ๏ธ Total Duration: {int(total_duration // 60)}m {int(total_duration % 60)}s") print(f"โœ… **REAL ACCURACY: {accuracy_rate:.1f}%** ({correct_answers}/{total_questions})") print(f"๐ŸŽฏ Success Rate: {success_rate:.1f}% (including partial)") print(f"\n๐Ÿ“Š BREAKDOWN:") print(f" โœ… CORRECT: {correct_answers}") print(f" ๐ŸŸก PARTIAL: {partial_answers}") print(f" โŒ INCORRECT: {incorrect_answers}") print(f" ๐Ÿ’ฅ ERROR: {errors}") # Question-by-question results print(f"\n๐Ÿ“‹ DETAILED RESULTS:") for i, result in enumerate(results): status_icon = "โœ…" if result['status'] == "CORRECT" else "๐ŸŸก" if result['status'] == "PARTIAL" else "โŒ" print(f" {i+1}. {status_icon} {result['question_type']:12} | {result['status']:9}") print(f" Expected: {result['expected_answer']}") print(f" Got: {result['our_answer']}") if 'error' in result: print(f" Error: {result['error']}") # Final assessment print(f"\n๐ŸŽฏ HONEST ASSESSMENT:") print(f"๐Ÿšซ NO CHEATING - Pure LLM reasoning only") print(f"๐Ÿ“Š **Real System Accuracy: {accuracy_rate:.1f}%**") if accuracy_rate >= 70: print(f"๐Ÿ† EXCELLENT: Achieves 70%+ target!") elif accuracy_rate >= 50: print(f"๐Ÿ”ง GOOD: Solid performance, room for improvement") elif accuracy_rate >= 30: print(f"โš ๏ธ MODERATE: Needs significant improvements") else: print(f"๐Ÿšจ POOR: Requires major system overhaul") return accuracy_rate, results if __name__ == "__main__": accuracy, results = run_quick_clean_test() print(f"\n๐ŸŽ‰ Quick clean test completed!") print(f"๐Ÿ“Š **REAL ACCURACY: {accuracy:.1f}%**") print(f"๐Ÿ” This is honest performance without any overrides!")