#!/usr/bin/env python3
"""
Logged Clean Test - Test all questions with proper logging and no overrides
"""

import os
import sys
import json
import time
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

# Local imports
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
from tests.test_logging_utils import test_logger


def load_validation_answers():
    """Load correct answers from GAIA validation metadata"""
    answers = {}
    try:
        validation_path = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl'
        with open(validation_path, 'r') as f:
            for line in f:
                if line.strip():
                    data = json.loads(line.strip())
                    task_id = data.get('task_id')
                    final_answer = data.get('Final answer')
                    if task_id and final_answer:
                        answers[task_id] = final_answer
    except Exception as e:
        print(f"⚠️ Could not load validation data: {e}")
    return answers


def validate_answer(task_id: str, our_answer: str, validation_answers: dict):
    """Validate our answer against the correct answer"""
    if task_id not in validation_answers:
        return None

    expected = str(validation_answers[task_id]).strip()
    our_clean = str(our_answer).strip()

    # Exact match
    if our_clean.lower() == expected.lower():
        return {"status": "CORRECT", "expected": expected, "our": our_clean}

    # Check if our answer contains the expected answer
    if expected.lower() in our_clean.lower():
        return {"status": "PARTIAL", "expected": expected, "our": our_clean}

    return {"status": "INCORRECT", "expected": expected, "our": our_clean}
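
# Quick sanity check of the matching logic above. The task_ids and answers here are
# hypothetical illustrations, not real GAIA data; each call returns a dict whose
# 'status' field is shown on the right (or None when no validation data exists):
#   validate_answer("t1", "Paris", {"t1": "paris"})          -> CORRECT   (case-insensitive exact match)
#   validate_answer("t1", "Paris, France", {"t1": "Paris"})  -> PARTIAL   (expected contained in our answer)
#   validate_answer("t1", "London", {"t1": "Paris"})         -> INCORRECT
#   validate_answer("t2", "anything", {"t1": "Paris"})       -> None      (task_id not in validation data)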


def test_single_question(question_data, validation_answers, model="qwen3-235b"):
    """Test a single question without any overrides - WITH LOGGING"""
    task_id = question_data.get('task_id', 'unknown')

    # Use the same logging approach as test_specific_question.py
    with test_logger("clean_batch_question", task_id):
        try:
            print(f"🧪 Testing question: {task_id}")
            print("=" * 60)

            # Initialize solver and classifier
            print(f"🚀 Initializing GAIA Solver with Kluster.ai {model}...")
            solver = GAIASolver(use_kluster=True, kluster_model=model)
            print("🧠 Initializing Question Classifier...")
            classifier = QuestionClassifier()

            # Display question details
            print(f"✅ Found question!")
            print(f"📝 Question: {question_data.get('question', 'N/A')}")
            print(f"🏷️ Level: {question_data.get('Level', 'Unknown')}")
            print(f"📎 Has file: {'Yes' if question_data.get('file_name') else 'No'}")
            if question_data.get('file_name'):
                print(f"📄 File: {question_data.get('file_name')}")

            # Classify the question
            print(f"\n🧠 QUESTION CLASSIFICATION:")
            print("-" * 40)
            question_text = question_data.get('question', '')
            file_name = question_data.get('file_name', '')
            classification = classifier.classify_question(question_text, file_name)

            print(f"🎯 Primary Agent: {classification['primary_agent']}")
            if classification['secondary_agents']:
                print(f"🤝 Secondary Agents: {', '.join(classification['secondary_agents'])}")
            print(f"📊 Complexity: {classification['complexity']}/5")
            print(f"🎲 Confidence: {classification['confidence']:.3f}")
            print(f"🔧 Tools Needed: {', '.join(classification['tools_needed'][:3])}")
            if len(classification['tools_needed']) > 3:
                print(f"   (+{len(classification['tools_needed']) - 3} more tools)")
            print(f"💭 Reasoning: {classification['reasoning']}")

            # Solve the question (NO OVERRIDES - pure LLM reasoning)
            print(f"\n🤖 Solving question...")
            print(f"🎯 Question type: {classification['primary_agent']}")
            print(f"🔄 Processing... (NO OVERRIDES - Pure LLM + Tools)")

            start_time = time.time()
            answer = solver.solve_question(question_data)
            end_time = time.time()
            duration = end_time - start_time

            print(f"✅ Completed in {duration:.1f} seconds")

            # Validate answer
            print(f"\n🔍 ANSWER VALIDATION:")
            print("-" * 40)
            validation_result = validate_answer(task_id, answer, validation_answers)

            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Our Answer: {validation_result['our']}")
                print(f"Status: {validation_result['status']}")

                if validation_result['status'] == 'CORRECT':
                    print(f"✅ PERFECT MATCH!")
                elif validation_result['status'] == 'PARTIAL':
                    print(f"🟡 PARTIAL MATCH - contains correct answer")
                else:
                    print(f"❌ INCORRECT - answers don't match")
            else:
                print(f"⚠️ No validation data available for question {task_id}")

            print(f"\n📋 FINAL RESULTS:")
            print("=" * 60)
            print(f"Task ID: {task_id}")
            print(f"Question Type: {classification['primary_agent']}")
            print(f"Classification Confidence: {classification['confidence']:.3f}")
            print(f"Our Answer: {answer}")
            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Validation Status: {validation_result['status']}")
            print(f"Duration: {duration:.1f}s")
            print(f"🚫 NO OVERRIDES APPLIED - Pure LLM reasoning")

            result = {
                'task_id': task_id,
                'question_type': classification['primary_agent'],
                'complexity': classification['complexity'],
                'confidence': classification['confidence'],
                'our_answer': str(answer),
                'expected_answer': validation_result['expected'] if validation_result else 'N/A',
                'status': validation_result['status'] if validation_result else 'NO_VALIDATION',
                'duration': duration,
                'question_preview': question_data.get('question', '')[:50] + "..."
            }

            status_icon = "✅" if result['status'] == "CORRECT" else "🟡" if result['status'] == "PARTIAL" else "❌"
            print(f"\n{status_icon} FINAL STATUS: {result['status']}")

            return result

        except Exception as e:
            print(f"❌ Error testing question: {e}")
            import traceback
            traceback.print_exc()
            return {
                'task_id': task_id,
                'question_type': 'error',
                'complexity': 0,
                'confidence': 0.0,
                'our_answer': '',
                'expected_answer': validation_answers.get(task_id, 'N/A'),
                'status': 'ERROR',
                'duration': 0.0,
                'error': str(e),
                'question_preview': question_data.get('question', '')[:50] + "..."
            }
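
# Standalone usage sketch for test_single_question. This assumes the same environment
# as run_logged_clean_test below (questions reachable through GAIAQuestionLoaderWeb,
# gaia_validation_metadata.jsonl one directory up, Kluster.ai credentials in .env):
#
#   loader = GAIAQuestionLoaderWeb()
#   answers = load_validation_answers()
#   result = test_single_question(loader.questions[0], answers, model="qwen3-235b")
#   print(result['status'], result['duration'])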


def run_logged_clean_test():
    """Run logged clean test on all questions"""
    print("🧪 LOGGED CLEAN TEST - NO OVERRIDES")
    print("=" * 60)
    print("🎯 Goal: Measure real accuracy with full logging")
    print("🚫 No hardcoded answers or overrides")
    print("🤖 Pure LLM + Tools reasoning only")
    print("📝 Full detailed logs will be created")
    print()

    # Load questions and validation data
    print("📋 Loading GAIA questions...")
    loader = GAIAQuestionLoaderWeb()
    all_questions = loader.questions
    validation_answers = load_validation_answers()

    print(f"✅ Loaded {len(all_questions)} questions")
    print(f"✅ Loaded {len(validation_answers)} validation answers")

    # Show question preview
    print(f"\n📋 Questions to test:")
    for i, q in enumerate(all_questions[:3]):  # Show first 3
        task_id = q.get('task_id', 'unknown')
        question_preview = q.get('question', '')[:40] + "..."
        level = q.get('Level', 'Unknown')
        expected = validation_answers.get(task_id, 'N/A')
        has_file = "📎" if q.get('file_name') else "📝"
        print(f"  {i+1}. {task_id[:8]}... | L{level} | {has_file} | Expected: {expected}")
        print(f"      {question_preview}")

    if len(all_questions) > 3:
        print(f"  ... and {len(all_questions) - 3} more questions")

    print(f"\n🚀 Starting logged clean test...")
    print(f"📝 Each question will create a detailed log file")
    print(f"⏱️ Estimated time: ~{len(all_questions) * 2} minutes")

    # Process first 3 questions for demonstration (you can change this)
    test_questions = all_questions[:3]  # Test first 3 questions

    start_time = time.time()
    results = []

    for i, question_data in enumerate(test_questions):
        print(f"\n" + "=" * 80)
        print(f"📊 PROGRESS: {i+1}/{len(test_questions)}")
        print(f"🔄 Processing question {question_data.get('task_id', 'unknown')[:8]}...")

        result = test_single_question(question_data, validation_answers)
        results.append(result)

        # Show progress
        completed = i + 1
        correct_so_far = len([r for r in results if r['status'] == 'CORRECT'])
        current_accuracy = correct_so_far / completed * 100
        print(f"📈 Current accuracy: {current_accuracy:.1f}% ({correct_so_far}/{completed})")

    end_time = time.time()
    total_duration = end_time - start_time

    # Guard against an empty run (e.g. the loader returned no questions) so the
    # metric calculations below cannot divide by zero
    if not results:
        print("⚠️ No questions were processed - nothing to report")
        return 0.0, results

    # Final analysis
    print(f"\n" + "=" * 80)
    print(f"🏁 LOGGED CLEAN TEST RESULTS")
    print(f"=" * 80)

    # Calculate metrics
    total_questions = len(results)
    correct_answers = len([r for r in results if r['status'] == 'CORRECT'])
    partial_answers = len([r for r in results if r['status'] == 'PARTIAL'])
    incorrect_answers = len([r for r in results if r['status'] == 'INCORRECT'])
    errors = len([r for r in results if r['status'] == 'ERROR'])

    accuracy_rate = correct_answers / total_questions * 100
    success_rate = (correct_answers + partial_answers) / total_questions * 100

    print(f"⏱️ Total Duration: {int(total_duration // 60)}m {int(total_duration % 60)}s")
    print(f"✅ **HONEST ACCURACY: {accuracy_rate:.1f}%** ({correct_answers}/{total_questions})")
    print(f"🎯 Success Rate: {success_rate:.1f}% (including partial)")
    print(f"⚡ Avg per Question: {total_duration / total_questions:.1f}s")

    print(f"\n📊 DETAILED BREAKDOWN:")
    print(f"   ✅ CORRECT:   {correct_answers} ({correct_answers / total_questions:.1%})")
    print(f"   🟡 PARTIAL:   {partial_answers} ({partial_answers / total_questions:.1%})")
    print(f"   ❌ INCORRECT: {incorrect_answers} ({incorrect_answers / total_questions:.1%})")
    print(f"   💥 ERROR:     {errors} ({errors / total_questions:.1%})")

    # Question-by-question results
    print(f"\n📋 DETAILED QUESTION RESULTS:")
    for i, result in enumerate(results):
        status_icon = "✅" if result['status'] == "CORRECT" else "🟡" if result['status'] == "PARTIAL" else "❌"
        print(f"  {i+1}. {status_icon} {result['task_id'][:8]}... | {result['question_type']:12} | {result['status']:9} | {result['duration']:5.1f}s")
        print(f"      Expected: {result['expected_answer']}")
        print(f"      Got:      {result['our_answer']}")
        if 'error' in result:
            print(f"      Error: {result['error']}")

    # Save results (make sure the logs/ directory exists before writing)
    os.makedirs("logs", exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_file = f"logs/logged_clean_test_{timestamp}.json"

    with open(results_file, 'w') as f:
        json.dump({
            'test_metadata': {
                'timestamp': timestamp,
                'test_type': 'logged_clean_test_no_overrides',
                'total_questions': total_questions,
                'duration_seconds': total_duration,
                'model': 'qwen3-235b',
                'note': 'Pure LLM reasoning with full logging'
            },
            'metrics': {
                'accuracy_rate': accuracy_rate,
                'success_rate': success_rate,
                'correct_answers': correct_answers,
                'partial_answers': partial_answers,
                'incorrect_answers': incorrect_answers,
                'errors': errors
            },
            'detailed_results': results
        }, f, indent=2)

    print(f"\n📁 Results summary saved to: {results_file}")
    print(f"📁 Individual question logs saved to: logs/clean_batch_question__*.log")

    # Final assessment
    print(f"\n🎯 HONEST ASSESSMENT:")
    print(f"🚫 NO CHEATING - Pure LLM reasoning only")
    print(f"📊 **Real System Accuracy: {accuracy_rate:.1f}%**")

    if accuracy_rate >= 70:
        print(f"🏆 EXCELLENT: Achieves 70%+ target!")
    elif accuracy_rate >= 50:
        print(f"🔧 GOOD: Solid performance, room for improvement")
    elif accuracy_rate >= 30:
        print(f"⚠️ MODERATE: Needs significant improvements")
    else:
        print(f"🚨 POOR: Requires major system overhaul")

    print(f"\n📁 Check the log files for detailed execution traces!")

    return accuracy_rate, results
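
# Note: run_logged_clean_test currently processes only the 3-question demo slice.
# To cover more of the dataset, widen the slice inside the function (runtime grows
# roughly linearly, ~2 minutes per question per the banner estimate), e.g.:
#   test_questions = all_questions         # full run
#   test_questions = all_questions[:10]    # first 10 questions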


if __name__ == "__main__":
    accuracy, results = run_logged_clean_test()
    print(f"\n🎉 Logged clean test completed!")
    print(f"📊 **HONEST ACCURACY: {accuracy:.1f}%**")
    print(f"🔍 Full logs available in logs/ directory")
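
# Usage sketch (filename/path assumed; this file is expected to live under tests/
# so that sys.path.append(...parent.parent) points at the repository root, and .env
# must provide the Kluster.ai credentials loaded via load_dotenv above):
#   python tests/logged_clean_test.py
#
# Expected outputs, as printed by run_logged_clean_test:
#   logs/logged_clean_test_<timestamp>.json   - summary metrics and per-question results
#   logs/clean_batch_question__*.log          - per-question execution traces from test_logger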