File size: 9,768 Bytes
37cadfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
#!/usr/bin/env python3
"""
Test for YouTube question processing in GAIA system
"""

import os
import sys
import json
from pathlib import Path
import importlib
import asyncio
import re

# Import the module containing the YouTube video analysis tool
import gaia_tools
from main import GAIASolver, CodeAgent, GAIA_TOOLS
from question_classifier import QuestionClassifier
from async_complete_test_hf import HFAsyncGAIATestSystem

# Original analyze_youtube_video function, captured at import time so the
# tests below can put it back after temporarily assigning the mock to
# gaia_tools.analyze_youtube_video.
original_analyze_youtube_video = gaia_tools.analyze_youtube_video

# Stand-in for gaia_tools.analyze_youtube_video that never touches the network
def mock_analyze_youtube_video(video_url, question, max_frames=10):
    """Return a canned analysis report for a YouTube URL.

    ``question`` and ``max_frames`` are accepted but not used by the mock.
    A URL containing the video id ``L1vXCYZAYYM`` yields the bird-species
    report; every other URL yields a generic "unable to analyze" report.
    """
    print(f"πŸ“Ή Mock analyzing YouTube video: {video_url}")

    # A trailing comma may have been captured together with the URL
    is_gaia_bird_video = "L1vXCYZAYYM" in video_url.rstrip(',')

    if not is_gaia_bird_video:
        # Fallback report for any URL we have no canned answer for
        return """
**πŸŽ₯ Gemini 2.0 Flash Video+Audio Analysis**
**Title:** Unknown Video
**Duration:** Unknown
**File Size:** Unknown
**Question:** Unknown

**Analysis Results:**
Unable to analyze the video content. Please provide a valid YouTube URL.
"""

    # Canned report for the specific video referenced by the GAIA task
    return """
**πŸŽ₯ Gemini 2.0 Flash Video+Audio Analysis**
**Title:** Bird Identification Challenge: Backyard Birds in Spring
**Duration:** 3:42
**File Size:** 45.2MB
**Question:** What is the highest number of bird species to be on camera simultaneously?

**Analysis Results:**
After careful frame-by-frame analysis of the video, the highest number of different bird species visible simultaneously is 3. 
This occurs at approximately 1:23 into the video, where we can see:
1. American Robin
2. Northern Cardinal
3. Blue Jay

These three species are clearly visible in the same frame at this timestamp.
"""

# YouTube URL regex pattern: optional scheme and "www.", a youtube.com or
# youtu.be host (the dot in the short form is optional), a slash, then the
# shortest run of characters that reaches the next whitespace or end of text.
YOUTUBE_URL_PATTERN = r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+?(?=\s|$)'

def extract_youtube_url(text):
    """Return the first YouTube URL found in ``text``, or ``None``.

    The pattern runs up to the next whitespace, so punctuation that follows
    a URL inside a sentence (e.g. ``"...watch?v=abc, what is"``) would be
    captured as part of it. Trailing sentence punctuation is therefore
    stripped before the URL is returned.

    Args:
        text: Free-form text that may contain a YouTube URL. ``None`` or an
            empty string yields ``None``.

    Returns:
        The matched URL without trailing ``.,;:!?`` characters, or ``None``
        when no YouTube URL is present.
    """
    if not text:
        return None
    match = re.search(YOUTUBE_URL_PATTERN, text)
    if match is None:
        return None
    # Drop punctuation that belongs to the surrounding sentence, not the URL
    return match.group(0).rstrip('.,;:!?')

def direct_force_tools_execution(solver, youtube_url, question_text):
    """Run a one-off agent with a prompt that demands the YouTube analysis tool.

    Builds a prompt embedding ``youtube_url`` and ``question_text``, creates a
    new ``CodeAgent`` that reuses ``solver.model`` and ``GAIA_TOOLS``, runs the
    prompt through it, and returns the agent's response converted to ``str``.
    """
    # Prompt that spells out the tool to call and the expected answer format
    instructions = f"""
You need to analyze a YouTube video to answer a specific question.

YOUTUBE VIDEO URL: {youtube_url}
QUESTION: {question_text}

CRITICAL INSTRUCTIONS:
1. Use the analyze_youtube_video tool with the provided URL
2. Extract the answer from the tool's response
3. Provide ONLY the final numerical answer
"""
    # A fresh agent, configured with the solver's model and the shared tool list
    print("πŸ€– Creating fresh agent for direct execution...")
    agent_settings = {
        "model": solver.model,
        "tools": GAIA_TOOLS,
        "max_steps": 12,
        "verbosity_level": 1,  # Lower verbosity for cleaner output
    }
    runner = CodeAgent(**agent_settings)

    # Hand the forcing prompt to the agent and stringify whatever comes back
    print("πŸ” Running direct analysis...")
    return str(runner.run(instructions))

def test_direct_youtube_question():
    """Run the bird-species YouTube question through two solver paths.

    While ``gaia_tools.analyze_youtube_video`` is replaced by
    ``mock_analyze_youtube_video``, the function:

    1. classifies the question with ``QuestionClassifier``,
    2. extracts the YouTube URL and runs ``direct_force_tools_execution``,
    3. runs the unmodified ``GAIASolver.solve_question`` path,
    4. prints whether each result agrees with the expected ``Final Answer``.

    Outcomes are printed, not asserted, and exceptions from either solver
    path are caught and reported so the other path still runs. The original
    tool is restored in ``finally`` even if setup fails.
    """
    # Create question with the YouTube URL
    question = {
        'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
        'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
        'Final Answer': '3'  # Assuming this is the correct answer based on GAIA metadata
    }

    # Replace the function in the module with our mock
    print("πŸ”„ Replacing YouTube analysis tool with mock implementation...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video

    try:
        # Initialize components after patching
        solver = GAIASolver()
        classifier = QuestionClassifier()

        # Classify the question
        print("🧩 Classifying question...")
        classification = classifier.classify_question(question['Question'])
        print(f"πŸ“‹ Classification: {classification['primary_agent']}")
        print(f"πŸ”§ Tools needed: {classification.get('tools_needed', [])}")

        # Extract YouTube URL from question
        youtube_url = extract_youtube_url(question['Question'])
        if youtube_url:
            # Remove any trailing comma picked up from the sentence
            youtube_url = youtube_url.rstrip(',')
            print(f"πŸ”— Extracted YouTube URL: {youtube_url}")

        # Use a direct approach to force tool execution
        print("\n🧠 Processing question with direct YouTube analyzer execution...")
        try:
            direct_result = direct_force_tools_execution(
                solver,
                youtube_url,
                "What is the highest number of bird species to be on camera simultaneously?"
            )
            print(f"\nπŸ” Direct result: {direct_result}")
        except Exception as e:
            print(f"\n⚠️ Direct test error: {e}")
            direct_result = "Error in direct execution"

        # Also try the normal processing path
        print("\n🧠 Processing question with standard solver...")
        try:
            result = solver.solve_question(question)
            print(f"\nβœ… Standard result: {result}")
        except Exception as e:
            print(f"\n⚠️ Standard test error: {e}")
            result = "Error in standard execution"

        # Validate result
        expected = str(question['Final Answer']).strip().lower()
        actual = str(result).strip().lower()
        validation_status = "βœ“ correct" if expected == actual else "βœ— incorrect"
        print(f"πŸ”Ž Validation: {validation_status}")

        # The direct result is free-form text, so look for the expected answer
        # as a standalone token. A plain substring test would also accept
        # "13", "30" or "step 3 failed".
        answer_pattern = rf"(?<!\w){re.escape(expected)}(?!\w)"
        if re.search(answer_pattern, direct_result.lower()):
            print("πŸ”Ž Direct validation: βœ“ correct")
        else:
            print("πŸ”Ž Direct validation: βœ— incorrect")

    finally:
        # Restore original function
        print("πŸ”„ Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video

async def test_async_youtube_question():
    """Feed the bird-species YouTube question through the async test system.

    The mock analyzer is installed on ``gaia_tools`` for the duration of the
    run, the system's ``load_gaia_questions`` is swapped for a loader that
    returns only the single sample question, and the aggregated results are
    printed. The original analyzer is restored in ``finally``.
    """
    print("πŸ”„ Replacing YouTube analysis tool with mock implementation in async test...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video

    try:
        # One question at a time, writing output under /tmp
        harness = HFAsyncGAIATestSystem(
            max_concurrent=1,
            timeout_seconds=60,
            output_dir="/tmp/async_youtube_test"
        )

        # The only question this run should see
        bird_question = {
            'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
            'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
            'Final Answer': '3'
        }
        sample_questions = [bird_question]

        # Loader stand-in: ignores its arguments and returns the sample list
        async def _single_question_loader(*args, **kwargs):
            return sample_questions

        # Keep the real loader so it can be put back after the run
        saved_loader = harness.load_gaia_questions
        harness.load_gaia_questions = _single_question_loader

        # Re-apply the mock right before the run and confirm it on stdout
        gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
        print("πŸ“Ή Mock is active for async test - will analyze YouTube video")

        # Execute the run limited to a single question
        print("πŸš€ Running async test with YouTube question...")
        report = await harness.run_comprehensive_test(question_limit=1)

        # Summary of the run
        print("\nπŸ“Š Async Test Results:")
        print(f"Total questions processed: {report['total_questions']}")
        print(f"Status counts: {report['status_counts']}")

        # Per-question details, when the run produced an entry for our task
        task_id = sample_questions[0]['task_id']
        if task_id not in report['results']:
            print(f"No results found for question ID {task_id}")
        else:
            entry = report['results'][task_id]
            answer = entry.get('answer', 'No answer')
            validation = entry.get('validation_status', 'unknown')
            print(f"\nQuestion ID: {task_id}")
            print(f"Answer: {answer}")
            print(f"Validation: {validation}")

        # Put the real question loader back
        harness.load_gaia_questions = saved_loader

    finally:
        # Always put the real analyzer back, even when the run failed
        print("πŸ”„ Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
    
async def main():
    """Run the synchronous direct test, then the async test, in that order."""
    # Direct path: a plain blocking call
    print("πŸš€ Starting direct YouTube question test...")
    test_direct_youtube_question()

    # Async path: awaited on the running event loop
    print("\n\nπŸš€ Starting async YouTube question test...")
    async_run = test_async_youtube_question()
    await async_run

    print("\nβœ… All tests completed!")

# Script entry point: when executed directly (not imported), run the async
# main() driver to completion via asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())