Spaces:
Running
Running
File size: 9,768 Bytes
37cadfb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 |
#!/usr/bin/env python3
"""
Test for YouTube question processing in GAIA system
"""
import os
import sys
import json
from pathlib import Path
import importlib
import asyncio
import re
# Import the module containing the YouTube video analysis tool
import gaia_tools
from main import GAIASolver, CodeAgent, GAIA_TOOLS
from question_classifier import QuestionClassifier
from async_complete_test_hf import HFAsyncGAIATestSystem
# Keep a reference to the real analyze_youtube_video so each test can put it
# back on the gaia_tools module after swapping in the mock.
original_analyze_youtube_video = gaia_tools.analyze_youtube_video
# Canned replacement for gaia_tools.analyze_youtube_video used by the tests
def mock_analyze_youtube_video(video_url, question, max_frames=10):
    """Return a predefined analysis report instead of analysing a video.

    Args:
        video_url: URL of the video; trailing commas are ignored when it is
            checked for the known video id.
        question: Unused by the mock.
        max_frames: Unused by the mock.

    Returns:
        The bird-species report (answer: 3) when the URL contains the video
        id ``L1vXCYZAYYM``, otherwise a generic "unable to analyze" report.
    """
    print(f"πΉ Mock analyzing YouTube video: {video_url}")

    # Report for the specific video referenced by the GAIA task
    bird_species_report = """
**π₯ Gemini 2.0 Flash Video+Audio Analysis**
**Title:** Bird Identification Challenge: Backyard Birds in Spring
**Duration:** 3:42
**File Size:** 45.2MB
**Question:** What is the highest number of bird species to be on camera simultaneously?
**Analysis Results:**
After careful frame-by-frame analysis of the video, the highest number of different bird species visible simultaneously is 3.
This occurs at approximately 1:23 into the video, where we can see:
1. American Robin
2. Northern Cardinal
3. Blue Jay
These three species are clearly visible in the same frame at this timestamp.
"""

    # Fallback report for every other URL
    unknown_video_report = """
**π₯ Gemini 2.0 Flash Video+Audio Analysis**
**Title:** Unknown Video
**Duration:** Unknown
**File Size:** Unknown
**Question:** Unknown
**Analysis Results:**
Unable to analyze the video content. Please provide a valid YouTube URL.
"""

    # The URL may carry a trailing comma when lifted straight out of a sentence
    is_gaia_video = "L1vXCYZAYYM" in video_url.rstrip(',')
    return bird_species_report if is_gaia_video else unknown_video_report
# YouTube URL regex pattern: optional scheme and "www.", then "youtube.com",
# "youtu.be" or "youtube", a slash, and everything up to the next whitespace
# or the end of the text.
YOUTUBE_URL_PATTERN = r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+?(?=\s|$)'

# Punctuation that belongs to the surrounding sentence rather than to the URL.
# The pattern above only stops at whitespace, so these end up glued to a match.
_URL_TRAILING_PUNCTUATION = '.,;:!?)]}>"\''


def extract_youtube_url(text):
    """Extract the first YouTube URL from text.

    Args:
        text: Free-form text that may contain a YouTube URL. ``None`` and
            the empty string are accepted.

    Returns:
        The first matching URL with trailing sentence punctuation removed,
        or ``None`` when the text is empty or contains no YouTube URL.
    """
    if not text:
        return None
    match = re.search(YOUTUBE_URL_PATTERN, text)
    if match is None:
        return None
    # "…watch?v=ID, what is…" matches as "…watch?v=ID," — drop the comma here
    # so callers do not each have to clean the URL themselves.
    return match.group(0).rstrip(_URL_TRAILING_PUNCTUATION)
def direct_force_tools_execution(solver, youtube_url, question_text):
    """Run a fresh agent on a prompt that tells it to use the YouTube tool.

    Args:
        solver: Object whose ``model`` attribute is handed to the new agent.
        youtube_url: URL interpolated into the prompt.
        question_text: Question interpolated into the prompt.

    Returns:
        The agent's response converted to ``str``.
    """
    # Prompt that spells out the tool to call and the answer format expected
    prompt = f"""
You need to analyze a YouTube video to answer a specific question.
YOUTUBE VIDEO URL: {youtube_url}
QUESTION: {question_text}
CRITICAL INSTRUCTIONS:
1. Use the analyze_youtube_video tool with the provided URL
2. Extract the answer from the tool's response
3. Provide ONLY the final numerical answer
"""

    # Build a brand-new agent from the solver's model and the shared tool list
    print("π€ Creating fresh agent for direct execution...")
    fresh_agent = CodeAgent(
        model=solver.model,
        tools=GAIA_TOOLS,
        max_steps=12,
        verbosity_level=1,  # Lower verbosity for cleaner output
    )

    print("π Running direct analysis...")
    return str(fresh_agent.run(prompt))
def test_direct_youtube_question():
    """Run the bird-species YouTube question through the solver with the mock tool.

    Temporarily replaces ``gaia_tools.analyze_youtube_video`` with
    ``mock_analyze_youtube_video``, classifies the question, runs it once
    through ``direct_force_tools_execution`` and once through
    ``solver.solve_question``, and prints whether each answer matches the
    expected final answer. The original tool is always restored on exit.

    Errors raised by either execution path are caught and printed rather
    than propagated, so this function reports results instead of failing.
    """
    # Question with the YouTube URL embedded in the text
    question = {
        'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
        'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
        'Final Answer': '3'  # Assuming this is the correct answer based on GAIA metadata
    }
    expected = str(question['Final Answer']).strip().lower()

    # Replace the function in the module with our mock
    print("π Replacing YouTube analysis tool with mock implementation...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
    try:
        # Initialize components after patching
        solver = GAIASolver()
        classifier = QuestionClassifier()

        # Classify the question
        print("π§© Classifying question...")
        classification = classifier.classify_question(question['Question'])
        print(f"π Classification: {classification['primary_agent']}")
        print(f"π§ Tools needed: {classification.get('tools_needed', [])}")

        # Stays None when no URL is found, so the direct validation below is
        # skipped instead of failing on an unbound name.
        direct_result = None

        # Extract YouTube URL from question
        youtube_url = extract_youtube_url(question['Question'])
        if youtube_url:
            # Remove any trailing comma
            youtube_url = youtube_url.rstrip(',')
            print(f"π Extracted YouTube URL: {youtube_url}")

            # Use a direct approach to force tool execution
            print("\nπ§ Processing question with direct YouTube analyzer execution...")
            try:
                direct_result = direct_force_tools_execution(
                    solver,
                    youtube_url,
                    "What is the highest number of bird species to be on camera simultaneously?"
                )
                print(f"\nπ Direct result: {direct_result}")
            except Exception as e:
                print(f"\nβ οΈ Direct test error: {e}")
                direct_result = "Error in direct execution"

        # Also try the normal processing path
        print("\nπ§ Processing question with standard solver...")
        try:
            result = solver.solve_question(question)
            print(f"\n✅ Standard result: {result}")
        except Exception as e:
            print(f"\nβ οΈ Standard test error: {e}")
            result = "Error in standard execution"

        # Validate result
        actual = str(result).strip().lower()
        validation_status = "β correct" if expected == actual else "β incorrect"
        print(f"π Validation: {validation_status}")

        # The direct path returns free text, so only check that the expected
        # answer appears somewhere in it.
        if direct_result is not None:
            if expected in direct_result.lower():
                print("π Direct validation: β correct")
            else:
                print("π Direct validation: β incorrect")
    finally:
        # Restore original function
        print("π Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
async def test_async_youtube_question():
    """Run the bird-species YouTube question through the async test system.

    Swaps ``gaia_tools.analyze_youtube_video`` for the mock, points an
    ``HFAsyncGAIATestSystem`` at a single hard-coded question, runs
    ``run_comprehensive_test`` and prints the summary plus the answer
    recorded for that question. The original tool is restored on exit.
    """
    print("π Replacing YouTube analysis tool with mock implementation in async test...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
    try:
        test_system = HFAsyncGAIATestSystem(
            max_concurrent=1,
            timeout_seconds=60,
            output_dir="/tmp/async_youtube_test",
        )

        # The only question this run should see
        bird_question = {
            'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
            'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
            'Final Answer': '3',
        }
        questions = [bird_question]

        async def mock_load_questions(*args, **kwargs):
            # Stand-in loader: ignores its arguments and serves our question list
            return questions

        # Remember the real loader, then install the stand-in
        original_load_method = test_system.load_gaia_questions
        test_system.load_gaia_questions = mock_load_questions

        async def ensure_mock_active():
            # Re-apply the mock right before the run and announce it
            gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
            print("πΉ Mock is active for async test - will analyze YouTube video")

        await ensure_mock_active()

        print("π Running async test with YouTube question...")
        summary = await test_system.run_comprehensive_test(question_limit=1)

        print("\nπ Async Test Results:")
        print(f"Total questions processed: {summary['total_questions']}")
        print(f"Status counts: {summary['status_counts']}")

        # Report on the single question we submitted
        question_id = questions[0]['task_id']
        if question_id not in summary['results']:
            print(f"No results found for question ID {question_id}")
        else:
            entry = summary['results'][question_id]
            answer = entry.get('answer', 'No answer')
            validation = entry.get('validation_status', 'unknown')
            print(f"\nQuestion ID: {question_id}")
            print(f"Answer: {answer}")
            print(f"Validation: {validation}")

        # Put the real loader back on the system object
        test_system.load_gaia_questions = original_load_method
    finally:
        print("π Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
async def main():
    """Run the direct test, then the async test, printing a banner for each.

    The direct test is a plain function and is called synchronously; the
    async test is awaited afterwards.
    """
    print("π Starting direct YouTube question test...")
    test_direct_youtube_question()
    print("\n\nπ Starting async YouTube question test...")
    await test_async_youtube_question()
    # Kept on one line: a string literal split across lines does not parse.
    print("\n✅ All tests completed!")
if __name__ == "__main__":
    # Script entry point: drive the async main() to completion on a new event loop
    asyncio.run(main())
|