# Spaces: schoolkithub/GAIA_AGE
# Status: Sleeping
# File size: 7,931 Bytes
# Revision: 945d0d0

import json
import os
from typing import List, Dict
from agent import GAIAAgent

def normalize_answer(answer: str) -> str:
    """Normalize an answer so that two answers can be compared for equality.

    Surrounding whitespace is stripped, one pair of matching quotes that
    wraps the whole answer is removed, and the result is lower-cased and
    stripped again.

    Args:
        answer: The answer to normalize. Non-string values (for example a
            number decoded from a JSON dataset) are converted with ``str()``
            instead of raising.

    Returns:
        The normalized answer, or ``""`` when ``answer`` is ``None`` or empty.
    """
    if answer is None:
        return ""

    # Answers decoded from JSON may be numbers: comparing them must not
    # raise, and a numeric 0 must not collapse to the empty string.
    text = str(answer).strip()

    # Drop one pair of quotes only when it really wraps the answer. A lone
    # quote character both starts and ends the string but wraps nothing.
    if len(text) >= 2 and text[0] == text[-1] and text[0] in "\"'":
        text = text[1:-1]

    # Convert to lowercase for comparison
    return text.lower().strip()

def extract_final_answer(response: str) -> str:
    """Extract the final answer from the model response.

    When the response contains the ``FINAL ANSWER:`` marker, the answer is the
    first line of text that follows the LAST occurrence of the marker, so a
    response that repeats the answer template before giving the real answer
    still yields the real answer. Without the marker, the last line of the
    response is used.

    Args:
        response: Full text returned by the model.

    Returns:
        The extracted answer with surrounding whitespace removed; ``""`` when
        nothing follows the marker or the response is empty.
    """
    marker = "FINAL ANSWER:"

    # rpartition splits on the last marker; `found` is empty when absent.
    _, found, tail = response.rpartition(marker)
    if found:
        # Strip first so an answer placed on the line after the marker is
        # kept, then cut any trailing explanation on later lines.
        return tail.strip().split('\n')[0].strip()

    # If no FINAL ANSWER format, fall back to the last line of the response
    return response.strip().split('\n')[-1].strip()

def load_gaia_dataset(dataset_path: str) -> List[Dict]:
    """Read GAIA tasks from a ``.jsonl`` or JSON file.

    A path ending in ``.jsonl`` is read line by line: blank lines are
    skipped and lines that fail to parse are reported and skipped. Any other
    path is parsed as a single JSON document that must be either a list of
    tasks or a dict holding the list under the ``tasks`` key.

    Problems are printed rather than raised; whatever was loaded before the
    problem occurred is still returned.

    Args:
        dataset_path: Location of the dataset file.

    Returns:
        The loaded tasks, or an empty list when the file is missing,
        unreadable or has an unexpected layout.
    """
    loaded = []

    if not os.path.exists(dataset_path):
        print(f"Dataset file not found: {dataset_path}")
        return loaded

    try:
        with open(dataset_path, "r", encoding="utf-8") as handle:
            if not dataset_path.endswith('.jsonl'):
                # Whole file is one JSON document
                payload = json.load(handle)
                if isinstance(payload, list):
                    loaded = payload
                elif isinstance(payload, dict) and 'tasks' in payload:
                    loaded = payload['tasks']
                else:
                    print("Unexpected JSON format")
            else:
                # One JSON object per non-blank line
                for number, raw in enumerate(handle, 1):
                    record = raw.strip()
                    if not record:
                        continue
                    try:
                        loaded.append(json.loads(record))
                    except json.JSONDecodeError as e:
                        print(f"Error parsing line {number}: {e}")
    except Exception as e:
        print(f"Error loading dataset: {e}")

    print(f"Loaded {len(loaded)} tasks from {dataset_path}")
    return loaded

def create_sample_dataset() -> List[Dict]:
    """Build the built-in fallback tasks used when no GAIA dataset is available.

    Returns:
        Five level-1 tasks, each a dict with the keys ``task_id``,
        ``question``, ``answer``, ``level`` and ``file_name`` (always ``None``).
    """
    # (task_id, question, answer) for every built-in task
    rows = (
        ("sample_1", "What is 15 + 27?", "42"),
        ("sample_2", "What is the capital of France?", "Paris"),
        ("sample_3", "How many days are in a leap year?", "366"),
        ("sample_4", "What is 2 * 6 * 7?", "84"),
        ("sample_5", "What year did World War II end?", "1945"),
    )

    samples = [
        {
            "task_id": identifier,
            "question": prompt,
            "answer": expected,
            "level": 1,
            "file_name": None,
        }
        for identifier, prompt, expected in rows
    ]

    print("Using sample dataset for testing")
    return samples

def evaluate_agent(dataset_path: str = None, max_tasks: int = None) -> float:
    """Evaluate the GAIA agent on the dataset and write ``submission.jsonl``.

    Steps:
      1. Load tasks from ``dataset_path``; when the path is empty or does not
         exist, the built-in sample tasks are used instead.
      2. Create a ``GAIAAgent`` and call its ``test_grok()``; if the reply
         contains the word "error" (any case) the evaluation is abandoned.
      3. For every task, call ``agent.process_task(task)``, extract the final
         answer and compare it with the task's ``answer`` after normalization.
         A task that raises is recorded with the answer ``"ERROR"`` and counts
         as incorrect.
      4. Write one JSON object per task to ``submission.jsonl`` in the current
         working directory and print a summary.

    Args:
        dataset_path: Path to a JSON/JSONL dataset file, or ``None``.
        max_tasks: Evaluate only the first ``max_tasks`` tasks. ``None``, zero
            and negative values mean "no limit".

    Returns:
        Percentage (0-100) of tasks answered correctly; ``0.0`` when there are
        no tasks or the API test fails.
    """
    # Load dataset
    if dataset_path and os.path.exists(dataset_path):
        tasks = load_gaia_dataset(dataset_path)
    else:
        print("No dataset file found, using sample tasks for testing")
        tasks = create_sample_dataset()

    if not tasks:
        print("No tasks to evaluate")
        return 0.0

    # Limit number of tasks if specified. Only a positive limit is applied:
    # a negative value would slice tasks off the END of the list and could
    # leave it empty, making the final score computation divide by zero.
    if max_tasks is not None and max_tasks > 0:
        tasks = tasks[:max_tasks]
        print(f"Evaluating on first {len(tasks)} tasks")

    # Initialize agent
    print("Initializing GAIA agent...")
    agent = GAIAAgent()

    # Test API connection first
    print("Testing API connection...")
    test_response = agent.test_grok()
    if "error" in test_response.lower():
        print(f"API test failed: {test_response}")
        return 0.0
    else:
        print("API connection successful!")

    # Process tasks
    correct = 0
    total = len(tasks)
    submission_entries = []

    print(f"\nStarting evaluation on {total} tasks...")
    print("=" * 50)

    for i, task in enumerate(tasks, 1):
        task_id = task.get("task_id", f"task_{i}")
        # A JSON null question must not abort the whole run when it is
        # sliced for display below, so fall back to an empty string.
        question = task.get("question") or ""
        expected_answer = task.get("answer", "")

        print(f"\nTask {i}/{total}: {task_id}")
        print(f"Question: {question[:100]}{'...' if len(question) > 100 else ''}")

        try:
            # Process task with agent
            response = agent.process_task(task)
            predicted_answer = extract_final_answer(response)

            print(f"Expected: {expected_answer}")
            print(f"Predicted: {predicted_answer}")

            # Compare answers (normalized)
            is_correct = normalize_answer(predicted_answer) == normalize_answer(expected_answer)

            if is_correct:
                correct += 1
                print("✅ CORRECT")
            else:
                print("❌ INCORRECT")

            # Store submission entry
            submission_entries.append({
                "task_id": task_id,
                "model_answer": predicted_answer,
                "reasoning_trace": response
            })

        except Exception as e:
            # Best effort: one failing task is recorded and the run continues.
            print(f"Error processing task {task_id}: {e}")
            submission_entries.append({
                "task_id": task_id,
                "model_answer": "ERROR",
                "reasoning_trace": f"Error: {str(e)}"
            })

        # Progress update
        current_score = (correct / i) * 100
        print(f"Current score: {correct}/{i} = {current_score:.1f}%")
        print("-" * 30)

    # Final score (total is at least 1 here: tasks was non-empty and only a
    # positive limit may have shortened it)
    final_score = (correct / total) * 100

    # Save submission file; a write failure is reported but does not discard
    # the score that was already computed.
    try:
        with open("submission.jsonl", "w", encoding="utf-8") as f:
            for entry in submission_entries:
                f.write(json.dumps(entry) + "\n")
        print(f"\nSubmission saved to submission.jsonl")
    except Exception as e:
        print(f"Error saving submission: {e}")

    # Print final results
    print("=" * 50)
    print("FINAL RESULTS")
    print("=" * 50)
    print(f"Total tasks: {total}")
    print(f"Correct answers: {correct}")
    print(f"Final score: {final_score:.2f}%")

    if final_score >= 30:
        print("🎉 CONGRATULATIONS! Score ≥30% - Certificate achieved!")
    else:
        print(f"📈 Score below 30%. Need {30 - final_score:.2f}% more for certificate.")

    return final_score

def main():
    """Command-line entry point: parse options, run the evaluation, report the score.

    Options:
        --dataset    Path to the dataset file (default ``gaia_test.json``).
        --max-tasks  Upper bound on the number of tasks to evaluate.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Evaluate GAIA agent")
    cli.add_argument(
        "--dataset",
        type=str,
        default="gaia_test.json",
        help="Path to GAIA dataset file",
    )
    cli.add_argument(
        "--max-tasks",
        type=int,
        default=None,
        help="Maximum number of tasks to evaluate",
    )
    options = cli.parse_args()

    score = evaluate_agent(options.dataset, options.max_tasks)
    print(f"\nFinal evaluation score: {score:.2f}%")

    # 30% is the certificate threshold reported to the user
    verdict = (
        "Certificate requirements met! 🎉"
        if score >= 30
        else "Keep working to reach 30% for the certificate! 💪"
    )
    print(verdict)

# Run the evaluation only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()