#!/usr/bin/env python3
"""
Logged Clean Test - Test all questions with proper logging and no overrides
"""

import os
import sys
import json
import time
from pathlib import Path

from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))

# Local imports
from gaia_web_loader import GAIAQuestionLoaderWeb
from main import GAIASolver
from question_classifier import QuestionClassifier
from tests.test_logging_utils import test_logger

def load_validation_answers():
    """Load correct answers from GAIA validation metadata"""
    answers = {}
    try:
        validation_path = Path(__file__).parent.parent / 'gaia_validation_metadata.jsonl'
        with open(validation_path, 'r') as f:
            for line in f:
                if line.strip():
                    data = json.loads(line.strip())
                    task_id = data.get('task_id')
                    final_answer = data.get('Final answer')
                    if task_id and final_answer:
                        answers[task_id] = final_answer
    except Exception as e:
        print(f"⚠️ Could not load validation data: {e}")
    return answers

def validate_answer(task_id: str, our_answer: str, validation_answers: dict):
    """Validate our answer against the correct answer"""
    if task_id not in validation_answers:
        return None

    expected = str(validation_answers[task_id]).strip()
    our_clean = str(our_answer).strip()

    # Exact match
    if our_clean.lower() == expected.lower():
        return {"status": "CORRECT", "expected": expected, "our": our_clean}

    # Check if our answer contains the expected answer
    if expected.lower() in our_clean.lower():
        return {"status": "PARTIAL", "expected": expected, "our": our_clean}

    return {"status": "INCORRECT", "expected": expected, "our": our_clean}

def test_single_question(question_data, validation_answers, model="qwen3-235b"):
    """Test a single question without any overrides - WITH LOGGING"""
    task_id = question_data.get('task_id', 'unknown')

    # Use the same logging approach as test_specific_question.py
    with test_logger("clean_batch_question", task_id):
        try:
            print(f"🧪 Testing question: {task_id}")
            print("=" * 60)

            # Initialize solver and classifier
            print(f"🚀 Initializing GAIA Solver with Kluster.ai {model}...")
            solver = GAIASolver(use_kluster=True, kluster_model=model)
            print("🧠 Initializing Question Classifier...")
            classifier = QuestionClassifier()

            # Display question details
            print(f"✅ Found question!")
            print(f"📝 Question: {question_data.get('question', 'N/A')}")
            print(f"🏷️ Level: {question_data.get('Level', 'Unknown')}")
            print(f"📁 Has file: {'Yes' if question_data.get('file_name') else 'No'}")
            if question_data.get('file_name'):
                print(f"📄 File: {question_data.get('file_name')}")

            # Classify the question
            print(f"\n🧠 QUESTION CLASSIFICATION:")
            print("-" * 40)
            question_text = question_data.get('question', '')
            file_name = question_data.get('file_name', '')
            classification = classifier.classify_question(question_text, file_name)

            print(f"🎯 Primary Agent: {classification['primary_agent']}")
            if classification['secondary_agents']:
                print(f"🤝 Secondary Agents: {', '.join(classification['secondary_agents'])}")
            print(f"📊 Complexity: {classification['complexity']}/5")
            print(f"🎲 Confidence: {classification['confidence']:.3f}")
            print(f"🔧 Tools Needed: {', '.join(classification['tools_needed'][:3])}")
            if len(classification['tools_needed']) > 3:
                print(f"   (+{len(classification['tools_needed']) - 3} more tools)")
            print(f"💭 Reasoning: {classification['reasoning']}")

            # Solve the question (NO OVERRIDES - pure LLM reasoning)
            print(f"\n🤖 Solving question...")
            print(f"🎯 Question type: {classification['primary_agent']}")
            print(f"🔄 Processing... (NO OVERRIDES - Pure LLM + Tools)")

            start_time = time.time()
            answer = solver.solve_question(question_data)
            end_time = time.time()
            duration = end_time - start_time

            print(f"✅ Completed in {duration:.1f} seconds")

            # Validate answer
            print(f"\n📊 ANSWER VALIDATION:")
            print("-" * 40)
            validation_result = validate_answer(task_id, answer, validation_answers)

            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Our Answer: {validation_result['our']}")
                print(f"Status: {validation_result['status']}")
                if validation_result['status'] == 'CORRECT':
                    print(f"✅ PERFECT MATCH!")
                elif validation_result['status'] == 'PARTIAL':
                    print(f"🟡 PARTIAL MATCH - contains correct answer")
                else:
                    print(f"❌ INCORRECT - answers don't match")
            else:
                print(f"⚠️ No validation data available for question {task_id}")

            print(f"\n🏆 FINAL RESULTS:")
            print("=" * 60)
            print(f"Task ID: {task_id}")
            print(f"Question Type: {classification['primary_agent']}")
            print(f"Classification Confidence: {classification['confidence']:.3f}")
            print(f"Our Answer: {answer}")
            if validation_result:
                print(f"Expected Answer: {validation_result['expected']}")
                print(f"Validation Status: {validation_result['status']}")
            print(f"Duration: {duration:.1f}s")
            print(f"🚫 NO OVERRIDES APPLIED - Pure LLM reasoning")

            result = {
                'task_id': task_id,
                'question_type': classification['primary_agent'],
                'complexity': classification['complexity'],
                'confidence': classification['confidence'],
                'our_answer': str(answer),
                'expected_answer': validation_result['expected'] if validation_result else 'N/A',
                'status': validation_result['status'] if validation_result else 'NO_VALIDATION',
                'duration': duration,
                'question_preview': question_data.get('question', '')[:50] + "..."
            }

            status_icon = "✅" if result['status'] == "CORRECT" else "🟡" if result['status'] == "PARTIAL" else "❌"
            print(f"\n{status_icon} FINAL STATUS: {result['status']}")

            return result

        except Exception as e:
            print(f"❌ Error testing question: {e}")
            import traceback
            traceback.print_exc()
            return {
                'task_id': task_id,
                'question_type': 'error',
                'complexity': 0,
                'confidence': 0.0,
                'our_answer': '',
                'expected_answer': validation_answers.get(task_id, 'N/A'),
                'status': 'ERROR',
                'duration': 0.0,
                'error': str(e),
                'question_preview': question_data.get('question', '')[:50] + "..."
            }

def run_logged_clean_test():
    """Run logged clean test on all questions"""
    print("🧪 LOGGED CLEAN TEST - NO OVERRIDES")
    print("=" * 60)
    print("🎯 Goal: Measure real accuracy with full logging")
    print("🚫 No hardcoded answers or overrides")
    print("🤖 Pure LLM + Tools reasoning only")
    print("📝 Full detailed logs will be created")
    print()

    # Load questions and validation data
    print("📋 Loading GAIA questions...")
    loader = GAIAQuestionLoaderWeb()
    all_questions = loader.questions
    validation_answers = load_validation_answers()

    print(f"✅ Loaded {len(all_questions)} questions")
    print(f"✅ Loaded {len(validation_answers)} validation answers")

    # Show question preview
    print(f"\n📋 Questions to test:")
    for i, q in enumerate(all_questions[:3]):  # Show first 3
        task_id = q.get('task_id', 'unknown')
        question_preview = q.get('question', '')[:40] + "..."
        level = q.get('Level', 'Unknown')
        expected = validation_answers.get(task_id, 'N/A')
        has_file = "📎" if q.get('file_name') else "📄"
        print(f"  {i + 1}. {task_id[:8]}... | L{level} | {has_file} | Expected: {expected}")
        print(f"     {question_preview}")

    if len(all_questions) > 3:
        print(f"  ... and {len(all_questions) - 3} more questions")

    print(f"\n🚀 Starting logged clean test...")
    print(f"📝 Each question will create a detailed log file")
    print(f"⏱️ Estimated time: ~{len(all_questions) * 2} minutes")

    # Process first 3 questions for demonstration (you can change this)
    test_questions = all_questions[:3]  # Test first 3 questions

    start_time = time.time()
    results = []

    for i, question_data in enumerate(test_questions):
        print(f"\n" + "=" * 80)
        print(f"📊 PROGRESS: {i + 1}/{len(test_questions)}")
        print(f"🔄 Processing question {question_data.get('task_id', 'unknown')[:8]}...")

        result = test_single_question(question_data, validation_answers)
        results.append(result)

        # Show progress
        completed = i + 1
        correct_so_far = len([r for r in results if r['status'] == 'CORRECT'])
        current_accuracy = correct_so_far / completed * 100
        print(f"📊 Current accuracy: {current_accuracy:.1f}% ({correct_so_far}/{completed})")

    end_time = time.time()
    total_duration = end_time - start_time

    # Final analysis
    print(f"\n" + "=" * 80)
    print(f"🏆 LOGGED CLEAN TEST RESULTS")
    print(f"=" * 80)

    # Calculate metrics
    total_questions = len(results)
    correct_answers = len([r for r in results if r['status'] == 'CORRECT'])
    partial_answers = len([r for r in results if r['status'] == 'PARTIAL'])
    incorrect_answers = len([r for r in results if r['status'] == 'INCORRECT'])
    errors = len([r for r in results if r['status'] == 'ERROR'])
    accuracy_rate = correct_answers / total_questions * 100
    success_rate = (correct_answers + partial_answers) / total_questions * 100

    print(f"⏱️ Total Duration: {int(total_duration // 60)}m {int(total_duration % 60)}s")
    print(f"✅ **HONEST ACCURACY: {accuracy_rate:.1f}%** ({correct_answers}/{total_questions})")
    print(f"🎯 Success Rate: {success_rate:.1f}% (including partial)")
    print(f"⚡ Avg per Question: {total_duration / total_questions:.1f}s")

    print(f"\n📊 DETAILED BREAKDOWN:")
    print(f"  ✅ CORRECT: {correct_answers} ({correct_answers / total_questions:.1%})")
    print(f"  🟡 PARTIAL: {partial_answers} ({partial_answers / total_questions:.1%})")
    print(f"  ❌ INCORRECT: {incorrect_answers} ({incorrect_answers / total_questions:.1%})")
    print(f"  💥 ERROR: {errors} ({errors / total_questions:.1%})")

    # Question-by-question results
    print(f"\n📋 DETAILED QUESTION RESULTS:")
    for i, result in enumerate(results):
        status_icon = "✅" if result['status'] == "CORRECT" else "🟡" if result['status'] == "PARTIAL" else "❌"
        print(f"  {i + 1}. {status_icon} {result['task_id'][:8]}... | {result['question_type']:12} | {result['status']:9} | {result['duration']:5.1f}s")
        print(f"     Expected: {result['expected_answer']}")
        print(f"     Got: {result['our_answer']}")
        if 'error' in result:
            print(f"     Error: {result['error']}")

    # Save results (ensure the logs/ directory exists before writing)
    os.makedirs("logs", exist_ok=True)
    timestamp = time.strftime("%Y%m%d_%H%M%S")
    results_file = f"logs/logged_clean_test_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            'test_metadata': {
                'timestamp': timestamp,
                'test_type': 'logged_clean_test_no_overrides',
                'total_questions': total_questions,
                'duration_seconds': total_duration,
                'model': 'qwen3-235b',
                'note': 'Pure LLM reasoning with full logging'
            },
            'metrics': {
                'accuracy_rate': accuracy_rate,
                'success_rate': success_rate,
                'correct_answers': correct_answers,
                'partial_answers': partial_answers,
                'incorrect_answers': incorrect_answers,
                'errors': errors
            },
            'detailed_results': results
        }, f, indent=2)

    print(f"\n📄 Results summary saved to: {results_file}")
    print(f"📄 Individual question logs saved to: logs/clean_batch_question_<id>_*.log")

    # Final assessment
    print(f"\n🎯 HONEST ASSESSMENT:")
    print(f"🚫 NO CHEATING - Pure LLM reasoning only")
    print(f"📊 **Real System Accuracy: {accuracy_rate:.1f}%**")

    if accuracy_rate >= 70:
        print(f"🏆 EXCELLENT: Achieves 70%+ target!")
    elif accuracy_rate >= 50:
        print(f"🔧 GOOD: Solid performance, room for improvement")
    elif accuracy_rate >= 30:
        print(f"⚠️ MODERATE: Needs significant improvements")
    else:
        print(f"🚨 POOR: Requires major system overhaul")

    print(f"\n📝 Check the log files for detailed execution traces!")

    return accuracy_rate, results

if __name__ == "__main__":
    accuracy, results = run_logged_clean_test()
    print(f"\n🏁 Logged clean test completed!")
    print(f"📊 **HONEST ACCURACY: {accuracy:.1f}%**")
    print(f"📁 Full logs available in logs/ directory")