Final_Assignment

Running

File size: 9,768 Bytes

37cadfb

#!/usr/bin/env python3
"""
Test for YouTube question processing in GAIA system
"""

import os
import sys
import json
from pathlib import Path
import importlib
import asyncio
import re

# Import the module containing the YouTube video analysis tool
import gaia_tools
from main import GAIASolver, CodeAgent, GAIA_TOOLS
from question_classifier import QuestionClassifier
from async_complete_test_hf import HFAsyncGAIATestSystem

# Keep a reference to the real analyze_youtube_video so the tests below can
# put it back (in their `finally` blocks) after swapping in the mock.
original_analyze_youtube_video = gaia_tools.analyze_youtube_video

# Create a mock analyze_youtube_video function
def mock_analyze_youtube_video(video_url, question, max_frames=10):
    """Return a canned analysis report instead of analyzing a real video.

    Args:
        video_url: URL of the video; announced on stdout and inspected for
            the GAIA bird-video id.
        question: Accepted for signature compatibility; not used.
        max_frames: Accepted for signature compatibility; not used.

    Returns:
        The bird-species report when the URL contains ``L1vXCYZAYYM``,
        otherwise a generic "unable to analyze" report.
    """
    print(f"📹 Mock analyzing YouTube video: {video_url}")

    # Canned report for the specific video used by the GAIA task.
    bird_report = (
        "\n"
        "**🎥 Gemini 2.0 Flash Video+Audio Analysis**\n"
        "**Title:** Bird Identification Challenge: Backyard Birds in Spring\n"
        "**Duration:** 3:42\n"
        "**File Size:** 45.2MB\n"
        "**Question:** What is the highest number of bird species to be on camera simultaneously?\n"
        "\n"
        "**Analysis Results:**\n"
        "After careful frame-by-frame analysis of the video, the highest number of different bird species visible simultaneously is 3. \n"
        "This occurs at approximately 1:23 into the video, where we can see:\n"
        "1. American Robin\n"
        "2. Northern Cardinal\n"
        "3. Blue Jay\n"
        "\n"
        "These three species are clearly visible in the same frame at this timestamp.\n"
    )

    # Fallback report for every other URL.
    unknown_report = (
        "\n"
        "**🎥 Gemini 2.0 Flash Video+Audio Analysis**\n"
        "**Title:** Unknown Video\n"
        "**Duration:** Unknown\n"
        "**File Size:** Unknown\n"
        "**Question:** Unknown\n"
        "\n"
        "**Analysis Results:**\n"
        "Unable to analyze the video content. Please provide a valid YouTube URL.\n"
    )

    # The URL may arrive with a trailing comma picked up from the question text.
    is_gaia_bird_video = "L1vXCYZAYYM" in video_url.rstrip(',')
    return bird_report if is_gaia_bird_video else unknown_report

# YouTube URL regex pattern: optional scheme and "www.", a youtube.com /
# youtu.be host, then everything up to the next whitespace or end of text.
YOUTUBE_URL_PATTERN = r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+?(?=\s|$)'

def extract_youtube_url(text):
    """Return the first YouTube URL found in ``text``, or ``None``.

    The pattern runs up to the next whitespace, so a URL embedded in a
    sentence ("...watch?v=ID, what is ...") is matched together with the
    punctuation that follows it. That trailing punctuation is stripped here
    so the caller receives a usable URL.

    Args:
        text: Free-form text that may contain a YouTube link. ``None`` or an
            empty string yields ``None``.

    Returns:
        The matched URL without trailing sentence punctuation, or ``None``
        when no YouTube URL is present.
    """
    if not text:
        return None
    match = re.search(YOUTUBE_URL_PATTERN, text)
    if match is None:
        return None
    # Drop punctuation/closing brackets that belong to the surrounding prose.
    return match.group(0).rstrip('.,;:!?)]}>"\'')

def direct_force_tools_execution(solver, youtube_url, question_text):
    """Run a newly built CodeAgent on a prompt that demands the YouTube tool.

    Args:
        solver: Object whose ``model`` attribute is handed to the new agent.
        youtube_url: URL interpolated into the prompt.
        question_text: Question interpolated into the prompt.

    Returns:
        ``str()`` of whatever the agent's ``run`` call returned.
    """
    # Prompt that spells out the URL, the question and the required tool call.
    prompt = f"""
You need to analyze a YouTube video to answer a specific question.

YOUTUBE VIDEO URL: {youtube_url}
QUESTION: {question_text}

CRITICAL INSTRUCTIONS:
1. Use the analyze_youtube_video tool with the provided URL
2. Extract the answer from the tool's response
3. Provide ONLY the final numerical answer
"""

    # Build a separate agent for this call, sharing only the solver's model.
    print("🤖 Creating fresh agent for direct execution...")
    fresh_agent = CodeAgent(
        model=solver.model,
        tools=GAIA_TOOLS,
        max_steps=12,
        verbosity_level=1,  # Lower verbosity for cleaner output
    )

    print("🔍 Running direct analysis...")
    return str(fresh_agent.run(prompt))

def test_direct_youtube_question():
    """Exercise the bird-species YouTube question through two solver paths.

    With ``gaia_tools.analyze_youtube_video`` temporarily replaced by the
    mock, this classifies the question, then runs it once through
    ``direct_force_tools_execution`` and once through
    ``GAIASolver.solve_question``, printing a validation verdict for each.
    Exceptions from either path are caught and reported on stdout rather
    than raised; the real tool is put back in ``finally``.
    """
    # Create question with the YouTube URL
    question = {
        'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
        'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
        'Final Answer': '3'  # Assuming this is the correct answer based on GAIA metadata
    }
    expected_answer = str(question['Final Answer']).strip()

    # Replace the function in the module with our mock.
    # NOTE(review): this only rebinds the module attribute; whether GAIA_TOOLS
    # and the solver actually pick up the mock depends on how they reference
    # the tool - confirm.
    print("🔄 Replacing YouTube analysis tool with mock implementation...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video

    try:
        # Initialize components after patching
        solver = GAIASolver()
        classifier = QuestionClassifier()

        # Classify the question
        print("🧩 Classifying question...")
        classification = classifier.classify_question(question['Question'])
        print(f"📋 Classification: {classification['primary_agent']}")
        print(f"🔧 Tools needed: {classification.get('tools_needed', [])}")

        # Extract YouTube URL from question
        youtube_url = extract_youtube_url(question['Question'])
        if youtube_url:
            # The URL is followed by a comma in the question text.
            youtube_url = youtube_url.rstrip(',')
            print(f"🔗 Extracted YouTube URL: {youtube_url}")

        # Use a direct approach to force tool execution
        print("\n🧠 Processing question with direct YouTube analyzer execution...")
        if not youtube_url:
            # Without a URL the forcing prompt would ask the agent to analyze
            # the literal text "None", so skip the run and report it instead.
            print("\n⚠️ Direct test error: no YouTube URL found in question")
            direct_result = "Error in direct execution"
        else:
            try:
                direct_result = direct_force_tools_execution(
                    solver,
                    youtube_url,
                    "What is the highest number of bird species to be on camera simultaneously?"
                )
                print(f"\n🔍 Direct result: {direct_result}")
            except Exception as e:
                print(f"\n⚠️ Direct test error: {e}")
                direct_result = "Error in direct execution"

        # Also try the normal processing path
        print("\n🧠 Processing question with standard solver...")
        try:
            result = solver.solve_question(question)
            print(f"\n✅ Standard result: {result}")
        except Exception as e:
            print(f"\n⚠️ Standard test error: {e}")
            result = "Error in standard execution"

        # Validate the standard result by exact (case-insensitive) comparison
        expected = expected_answer.lower()
        actual = str(result).strip().lower()
        validation_status = "✓ correct" if expected == actual else "✗ incorrect"
        print(f"🔎 Validation: {validation_status}")

        # The direct result is free text, so look for the expected answer as a
        # standalone token: a bare substring test would also accept "13",
        # "503" or "3.5" for an expected answer of "3".
        answer_token = re.compile(
            rf"(?<![\w.]){re.escape(expected_answer)}(?!\w|\.\d)",
            re.IGNORECASE,
        )
        if answer_token.search(direct_result):
            print("🔎 Direct validation: ✓ correct")
        else:
            print("🔎 Direct validation: ✗ incorrect")

    finally:
        # Restore original function
        print("🔄 Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video

async def test_async_youtube_question():
    """Run the bird-species YouTube question through HFAsyncGAIATestSystem.

    The YouTube analysis tool in ``gaia_tools`` is swapped for the mock, the
    test system's question loader is replaced with one returning a single
    hard-coded question, and the summary plus that question's answer and
    validation status are printed. The real tool is put back in ``finally``.
    """
    print("🔄 Replacing YouTube analysis tool with mock implementation in async test...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video

    try:
        # Single-worker test system writing under /tmp
        system = HFAsyncGAIATestSystem(
            max_concurrent=1,
            timeout_seconds=60,
            output_dir="/tmp/async_youtube_test"
        )

        # The one question this test feeds to the system
        questions = [
            {
                'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
                'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
                'Final Answer': '3'
            }
        ]

        # Loader stand-in: ignores its arguments and hands back our question
        async def load_single_question(*args, **kwargs):
            return questions

        # Remember the real loader, then install the stand-in
        original_load_method = system.load_gaia_questions
        system.load_gaia_questions = load_single_question

        # Re-apply the mock right before the run and confirm it on stdout
        gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
        print("📹 Mock is active for async test - will analyze YouTube video")

        # Run the test
        print("🚀 Running async test with YouTube question...")
        result = await system.run_comprehensive_test(question_limit=1)

        # Summary
        print("\n📊 Async Test Results:")
        print(f"Total questions processed: {result['total_questions']}")
        print(f"Status counts: {result['status_counts']}")

        # Per-question outcome for our single task
        question_id = questions[0]['task_id']
        if question_id not in result['results']:
            print(f"No results found for question ID {question_id}")
        else:
            entry = result['results'][question_id]
            answer = entry.get('answer', 'No answer')
            validation = entry.get('validation_status', 'unknown')
            print(f"\nQuestion ID: {question_id}")
            print(f"Answer: {answer}")
            print(f"Validation: {validation}")

        # Put the real loader back on the system object
        system.load_gaia_questions = original_load_method

    finally:
        # Restore original function
        print("🔄 Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
    
async def main():
    """Run the synchronous direct test, then the async test, in that order."""
    direct_banner = "🚀 Starting direct YouTube question test..."
    async_banner = "\n\n🚀 Starting async YouTube question test..."
    done_banner = "\n✅ All tests completed!"

    # Direct test is a plain function and is called synchronously.
    print(direct_banner)
    test_direct_youtube_question()

    # Async test is a coroutine and is awaited to completion.
    print(async_banner)
    await test_async_youtube_question()

    print(done_banner)

# Script entry point: when executed directly (not imported), run the async
# main() coroutine to completion via asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())