#!/usr/bin/env python3
"""
Logged Clean Test - Test all questions with proper logging and no overrides
"""

import os
import sys
import json
import time
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

# Local imports
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
from tests.test_logging_utils import test_logger


def load_validation_answers():
    """Load correct answers from GAIA validation metadata"""
    answers = {}
    try:
        validation_path = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl'
        with open(validation_path, 'r') as f:
            for line in f:
                if line.strip():
                    data = json.loads(line.strip())
                    task_id = data.get('task_id')
                    final_answer = data.get('Final answer')
                    if task_id and final_answer:
                        answers[task_id] = final_answer
    except Exception as e:
        print(f"⚠️ Could not load validation data: {e}")
    return answers


def validate_answer(task_id: str, our_answer: str, validation_answers: dict):
    """Validate our answer against the correct answer"""
    if task_id not in validation_answers:
        return None
    
    expected = str(validation_answers[task_id]).strip()
    our_clean = str(our_answer).strip()
    
    # Exact match
    if our_clean.lower() == expected.lower():
        return {"status": "CORRECT", "expected": expected, "our": our_clean}
    
    # Check if our answer contains the expected answer
    if expected.lower() in our_clean.lower():
        return {"status": "PARTIAL", "expected": expected, "our": our_clean}
    
    return {"status": "INCORRECT", "expected": expected, "our": our_clean}


def test_single_question(question_data, validation_answers, model="qwen3-235b"):
    """Test a single question without any overrides - WITH LOGGING"""
    task_id = question_data.get('task_id', 'unknown')
    
    # Use the same logging approach as test_specific_question.py
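    # Note: test_logger (from tests.test_logging_utils) is assumed to capture the
    # console output below into a per-question log file named with the
    # "clean_batch_question" prefix (see the log path printed at the end of the run).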
    with test_logger("clean_batch_question", task_id):
        try:
            print(f"πŸ§ͺ Testing question: {task_id}")
            print("=" * 60)
            
            # Initialize solver and classifier
            print(f"πŸš€ Initializing GAIA Solver with Kluster.ai {model}...")
            solver = GAIASolver(use_kluster=True, kluster_model=model)
            print("🧠 Initializing Question Classifier...")
            classifier = QuestionClassifier()
            
            # Display question details
            print(f"βœ… Found question!")
            print(f"πŸ“ Question: {question_data.get('question', 'N/A')}")
            print(f"🏷️  Level: {question_data.get('Level', 'Unknown')}")
            print(f"πŸ“Ž Has file: {'Yes' if question_data.get('file_name') else 'No'}")
            if question_data.get('file_name'):
                print(f"πŸ“„ File: {question_data.get('file_name')}")
            
            # Classify the question
            print(f"\n🧠 QUESTION CLASSIFICATION:")
            print("-" * 40)
            question_text = question_data.get('question', '')
            file_name = question_data.get('file_name', '')
            classification = classifier.classify_question(question_text, file_name)
            
            print(f"🎯 Primary Agent: {classification['primary_agent']}")
            if classification['secondary_agents']:
                print(f"🀝 Secondary Agents: {', '.join(classification['secondary_agents'])}")
            print(f"πŸ“Š Complexity: {classification['complexity']}/5")
            print(f"🎲 Confidence: {classification['confidence']:.3f}")
            print(f"πŸ”§ Tools Needed: {', '.join(classification['tools_needed'][:3])}")
            if len(classification['tools_needed']) > 3:
                print(f"     (+{len(classification['tools_needed'])-3} more tools)")
            print(f"πŸ’­ Reasoning: {classification['reasoning']}")
            
            # Solve the question (NO OVERRIDES - pure LLM reasoning)
            print(f"\nπŸ€– Solving question...")
            print(f"🎯 Question type: {classification['primary_agent']}")
            print(f"πŸ”„ Processing... (NO OVERRIDES - Pure LLM + Tools)")
            
            start_time = time.time()
            answer = solver.solve_question(question_data)
            end_time = time.time()
            
            duration = end_time - start_time
            print(f"βœ… Completed in {duration:.1f} seconds")
            
            # Validate answer
            print(f"\nπŸ” ANSWER VALIDATION:")
            print("-" * 40)
            validation_result = validate_answer(task_id, answer, validation_answers)
            
            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Our Answer: {validation_result['our']}")
                print(f"Status: {validation_result['status']}")
                if validation_result['status'] == 'CORRECT':
                    print(f"βœ… PERFECT MATCH!")
                elif validation_result['status'] == 'PARTIAL':
                    print(f"🟑 PARTIAL MATCH - contains correct answer")
                else:
                    print(f"❌ INCORRECT - answers don't match")
            else:
                print(f"⚠️ No validation data available for question {task_id}")
            
            print(f"\nπŸ“‹ FINAL RESULTS:")
            print("=" * 60)
            print(f"Task ID: {task_id}")
            print(f"Question Type: {classification['primary_agent']}")
            print(f"Classification Confidence: {classification['confidence']:.3f}")
            print(f"Our Answer: {answer}")
            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Validation Status: {validation_result['status']}")
            print(f"Duration: {duration:.1f}s")
            print(f"🚫 NO OVERRIDES APPLIED - Pure LLM reasoning")
            
            result = {
                'task_id': task_id,
                'question_type': classification['primary_agent'],
                'complexity': classification['complexity'],
                'confidence': classification['confidence'],
                'our_answer': str(answer),
                'expected_answer': validation_result['expected'] if validation_result else 'N/A',
                'status': validation_result['status'] if validation_result else 'NO_VALIDATION',
                'duration': duration,
                'question_preview': question_data.get('question', '')[:50] + "..."
            }
            
            status_icon = "βœ…" if result['status'] == "CORRECT" else "🟑" if result['status'] == "PARTIAL" else "❌"
            print(f"\n{status_icon} FINAL STATUS: {result['status']}")
            
            return result
            
        except Exception as e:
            print(f"❌ Error testing question: {e}")
            import traceback
            traceback.print_exc()
            
            return {
                'task_id': task_id,
                'question_type': 'error',
                'complexity': 0,
                'confidence': 0.0,
                'our_answer': '',
                'expected_answer': validation_answers.get(task_id, 'N/A'),
                'status': 'ERROR',
                'duration': 0.0,
                'error': str(e),
                'question_preview': question_data.get('question', '')[:50] + "..."
            }


def run_logged_clean_test():
    """Run logged clean test on all questions"""
    
    print("πŸ§ͺ LOGGED CLEAN TEST - NO OVERRIDES")
    print("=" * 60)
    print("🎯 Goal: Measure real accuracy with full logging")
    print("🚫 No hardcoded answers or overrides")
    print("πŸ€– Pure LLM + Tools reasoning only")
    print("πŸ“ Full detailed logs will be created")
    print()
    
    # Load questions and validation data
    print("πŸ“‹ Loading GAIA questions...")
    loader = GAIAQuestionLoaderWeb()
    all_questions = loader.questions
    validation_answers = load_validation_answers()
    
    print(f"βœ… Loaded {len(all_questions)} questions")
    print(f"βœ… Loaded {len(validation_answers)} validation answers")
    
    # Show question preview
    print(f"\nπŸ“‹ Questions to test:")
    for i, q in enumerate(all_questions[:3]):  # Show first 3
        task_id = q.get('task_id', 'unknown')
        question_preview = q.get('question', '')[:40] + "..."
        level = q.get('Level', 'Unknown')
        expected = validation_answers.get(task_id, 'N/A')
        has_file = "πŸ“Ž" if q.get('file_name') else "πŸ“"
        print(f"  {i+1}. {task_id[:8]}... | L{level} | {has_file} | Expected: {expected}")
        print(f"     {question_preview}")
    
    if len(all_questions) > 3:
        print(f"  ... and {len(all_questions) - 3} more questions")
    
    print(f"\nπŸš€ Starting logged clean test...")
    print(f"πŸ“ Each question will create a detailed log file")
    print(f"⏱️  Estimated time: ~{len(all_questions) * 2} minutes")
    
    # Process first 3 questions for demonstration (you can change this)
    test_questions = all_questions[:3]  # Test first 3 questions
    
    start_time = time.time()
    results = []
    
    for i, question_data in enumerate(test_questions):
        print(f"\n" + "="*80)
        print(f"πŸ“Š PROGRESS: {i+1}/{len(test_questions)}")
        print(f"πŸ”„ Processing question {question_data.get('task_id', 'unknown')[:8]}...")
        
        result = test_single_question(question_data, validation_answers)
        results.append(result)
        
        # Show progress
        completed = i + 1
        correct_so_far = len([r for r in results if r['status'] == 'CORRECT'])
        current_accuracy = correct_so_far / completed * 100
        print(f"πŸ“ˆ Current accuracy: {current_accuracy:.1f}% ({correct_so_far}/{completed})")
    
    end_time = time.time()
    total_duration = end_time - start_time
    
    # Final analysis
    print(f"\n" + "=" * 80)
    print(f"🏁 LOGGED CLEAN TEST RESULTS")
    print(f"=" * 80)
    
    # Calculate metrics
    total_questions = len(results)
    correct_answers = len([r for r in results if r['status'] == 'CORRECT'])
    partial_answers = len([r for r in results if r['status'] == 'PARTIAL'])
    incorrect_answers = len([r for r in results if r['status'] == 'INCORRECT'])
    errors = len([r for r in results if r['status'] == 'ERROR'])
    
    accuracy_rate = correct_answers / total_questions * 100
    success_rate = (correct_answers + partial_answers) / total_questions * 100
    
    print(f"⏱️  Total Duration: {int(total_duration // 60)}m {int(total_duration % 60)}s")
    print(f"βœ… **HONEST ACCURACY: {accuracy_rate:.1f}%** ({correct_answers}/{total_questions})")
    print(f"🎯 Success Rate: {success_rate:.1f}% (including partial)")
    print(f"⚑ Avg per Question: {total_duration/total_questions:.1f}s")
    
    print(f"\nπŸ“Š DETAILED BREAKDOWN:")
    print(f"  βœ… CORRECT: {correct_answers} ({correct_answers/total_questions:.1%})")
    print(f"  🟑 PARTIAL: {partial_answers} ({partial_answers/total_questions:.1%})")
    print(f"  ❌ INCORRECT: {incorrect_answers} ({incorrect_answers/total_questions:.1%})")
    print(f"  πŸ’₯ ERROR: {errors} ({errors/total_questions:.1%})")
    
    # Question-by-question results
    print(f"\nπŸ“‹ DETAILED QUESTION RESULTS:")
    for i, result in enumerate(results):
        status_icon = "βœ…" if result['status'] == "CORRECT" else "🟑" if result['status'] == "PARTIAL" else "❌"
        print(f"  {i+1}. {status_icon} {result['task_id'][:8]}... | {result['question_type']:12} | {result['status']:9} | {result['duration']:5.1f}s")
        print(f"      Expected: {result['expected_answer']}")
        print(f"      Got:      {result['our_answer']}")
        if 'error' in result:
            print(f"      Error:    {result['error']}")
    
    # Save results
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    os.makedirs("logs", exist_ok=True)  # ensure the logs/ directory exists before writing
    results_file = f"logs/logged_clean_test_{timestamp}.json"
    
    with open(results_file, 'w') as f:
        json.dump({
            'test_metadata': {
                'timestamp': timestamp,
                'test_type': 'logged_clean_test_no_overrides',
                'total_questions': total_questions,
                'duration_seconds': total_duration,
                'model': 'qwen3-235b',
                'note': 'Pure LLM reasoning with full logging'
            },
            'metrics': {
                'accuracy_rate': accuracy_rate,
                'success_rate': success_rate,
                'correct_answers': correct_answers,
                'partial_answers': partial_answers,
                'incorrect_answers': incorrect_answers,
                'errors': errors
            },
            'detailed_results': results
        }, f, indent=2)
    
    print(f"\nπŸ“ Results summary saved to: {results_file}")
    print(f"πŸ“ Individual question logs saved to: logs/clean_batch_question_<id>_*.log")
    
    # Final assessment
    print(f"\n🎯 HONEST ASSESSMENT:")
    print(f"🚫 NO CHEATING - Pure LLM reasoning only")
    print(f"πŸ“Š **Real System Accuracy: {accuracy_rate:.1f}%**")
    
    if accuracy_rate >= 70:
        print(f"πŸ† EXCELLENT: Achieves 70%+ target!")
    elif accuracy_rate >= 50:
        print(f"πŸ”§ GOOD: Solid performance, room for improvement")
    elif accuracy_rate >= 30:
        print(f"⚠️ MODERATE: Needs significant improvements")
    else:
        print(f"🚨 POOR: Requires major system overhaul")
    
    print(f"\nπŸ“ Check the log files for detailed execution traces!")
    
    return accuracy_rate, results


if __name__ == "__main__":
    accuracy, results = run_logged_clean_test()
    print(f"\nπŸŽ‰ Logged clean test completed!")
    print(f"πŸ“Š **HONEST ACCURACY: {accuracy:.1f}%**")
    print(f"πŸ” Full logs available in logs/ directory")