Spaces:
Running
Running
#!/usr/bin/env python3 | |
""" | |
Test for YouTube question processing in GAIA system | |
""" | |
import os | |
import sys | |
import json | |
from pathlib import Path | |
import importlib | |
import asyncio | |
import re | |
# Import the module containing the YouTube video analysis tool | |
import gaia_tools | |
from main import GAIASolver, CodeAgent, GAIA_TOOLS | |
from question_classifier import QuestionClassifier | |
from async_complete_test_hf import HFAsyncGAIATestSystem | |
# Keep a handle on the real analyze_youtube_video so each test can put it
# back on the gaia_tools module after temporarily swapping in the mock.
original_analyze_youtube_video = gaia_tools.analyze_youtube_video
# Create a mock analyze_youtube_video function | |
def mock_analyze_youtube_video(video_url, question, max_frames=10):
    """Return a canned analysis report instead of analyzing a real video.

    Args:
        video_url: URL of the video; trailing commas are stripped before the
            video id is looked for.
        question: Accepted for signature compatibility; not used.
        max_frames: Accepted for signature compatibility; not used.

    Returns:
        The bird-species report when the URL contains the video id
        ``L1vXCYZAYYM``; otherwise a generic "unable to analyze" report.
    """
    print(f"πΉ Mock analyzing YouTube video: {video_url}")

    # A URL lifted out of a sentence may still carry the comma that followed it.
    url = video_url.rstrip(',')

    if "L1vXCYZAYYM" not in url:
        # Any video other than the GAIA task's one gets the generic report.
        return """
**π₯ Gemini 2.0 Flash Video+Audio Analysis**
**Title:** Unknown Video
**Duration:** Unknown
**File Size:** Unknown
**Question:** Unknown
**Analysis Results:**
Unable to analyze the video content. Please provide a valid YouTube URL.
"""

    # Canned report for the specific video used by the GAIA task.
    return """
**π₯ Gemini 2.0 Flash Video+Audio Analysis**
**Title:** Bird Identification Challenge: Backyard Birds in Spring
**Duration:** 3:42
**File Size:** 45.2MB
**Question:** What is the highest number of bird species to be on camera simultaneously?
**Analysis Results:**
After careful frame-by-frame analysis of the video, the highest number of different bird species visible simultaneously is 3.
This occurs at approximately 1:23 into the video, where we can see:
1. American Robin
2. Northern Cardinal
3. Blue Jay
These three species are clearly visible in the same frame at this timestamp.
"""
# Pattern for a YouTube link (youtube.com or youtu.be) with an optional scheme
# and "www." prefix. The lazy tail runs up to the next whitespace or the end
# of the text, so it can swallow punctuation that directly follows the URL.
YOUTUBE_URL_PATTERN = r'(https?://)?(www\.)?(youtube\.com|youtu\.?be)/.+?(?=\s|$)'

# Sentence punctuation that may directly follow a URL embedded in prose.
_URL_TRAILING_PUNCTUATION = '.,;:!?)]}>"\''


def extract_youtube_url(text):
    """Extract the first YouTube URL from ``text``.

    Args:
        text: Free-form text (for example a question) that may contain a
            YouTube link. ``None`` and the empty string are accepted.

    Returns:
        The first matching URL with trailing sentence punctuation removed
        (so "…watch?v=ID, what is" yields "…watch?v=ID"), or ``None`` when
        no YouTube URL is present.
    """
    if not text:
        return None

    match = re.search(YOUTUBE_URL_PATTERN, text)
    if match is None:
        return None

    # The pattern stops only at whitespace, so a comma or full stop that
    # follows the link in a sentence is part of the raw match; drop it here
    # instead of making every caller clean it up.
    return match.group(0).rstrip(_URL_TRAILING_PUNCTUATION)
def direct_force_tools_execution(solver, youtube_url, question_text):
    """Run a fresh agent with a prompt that insists on the YouTube analysis tool.

    Args:
        solver: Object whose ``model`` attribute is handed to the new agent.
        youtube_url: URL interpolated into the prompt.
        question_text: Question interpolated into the prompt.

    Returns:
        The agent's response converted with ``str()``.
    """
    # The prompt names the tool explicitly and asks for a bare numeric answer.
    prompt = f"""
You need to analyze a YouTube video to answer a specific question.
YOUTUBE VIDEO URL: {youtube_url}
QUESTION: {question_text}
CRITICAL INSTRUCTIONS:
1. Use the analyze_youtube_video tool with the provided URL
2. Extract the answer from the tool's response
3. Provide ONLY the final numerical answer
"""

    print("π€ Creating fresh agent for direct execution...")
    agent_options = {
        "model": solver.model,
        "tools": GAIA_TOOLS,
        "max_steps": 12,
        "verbosity_level": 1,  # lower verbosity keeps the output readable
    }
    fresh_agent = CodeAgent(**agent_options)

    print("π Running direct analysis...")
    return str(fresh_agent.run(prompt))
def test_direct_youtube_question():
    """Run the bird-species YouTube question through the solver with the mock tool.

    Temporarily replaces ``gaia_tools.analyze_youtube_video`` with
    ``mock_analyze_youtube_video``, classifies the question, runs it once via
    ``direct_force_tools_execution`` and once via ``solver.solve_question``,
    prints a validation verdict for each, and always restores the original
    tool afterwards. Failures are reported by printing, not by raising.

    NOTE(review): the patch rebinds the attribute on the ``gaia_tools``
    module only; confirm the solver resolves the tool through that module at
    call time, otherwise tool objects already collected in ``GAIA_TOOLS``
    will not see the mock.
    NOTE(review): this file's indentation was lost; the direct-execution step
    is assumed to run whether or not a URL was extracted — confirm.
    """
    question = {
        'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
        'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
        'Final Answer': '3'  # Assuming this is the correct answer based on GAIA metadata
    }
    expected_answer = str(question['Final Answer']).strip()

    print("π Replacing YouTube analysis tool with mock implementation...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
    try:
        # Build the components only after patching so they see the mock.
        solver = GAIASolver()
        classifier = QuestionClassifier()

        print("π§© Classifying question...")
        classification = classifier.classify_question(question['Question'])
        print(f"π Classification: {classification['primary_agent']}")
        print(f"π§ Tools needed: {classification.get('tools_needed', [])}")

        youtube_url = extract_youtube_url(question['Question'])
        if youtube_url:
            # The URL is followed by a comma in the question text.
            youtube_url = youtube_url.rstrip(',')
            print(f"π Extracted YouTube URL: {youtube_url}")

        print("\nπ§ Processing question with direct YouTube analyzer execution...")
        try:
            direct_result = direct_force_tools_execution(
                solver,
                youtube_url,
                "What is the highest number of bird species to be on camera simultaneously?"
            )
            print(f"\nπ Direct result: {direct_result}")
        except Exception as e:
            # Best-effort script: report the failure and keep going so the
            # standard path still runs.
            print(f"\nβ οΈ Direct test error: {e}")
            direct_result = "Error in direct execution"

        print("\nπ§ Processing question with standard solver...")
        try:
            result = solver.solve_question(question)
            print(f"\nβ Standard result: {result}")
        except Exception as e:
            print(f"\nβ οΈ Standard test error: {e}")
            result = "Error in standard execution"

        # The standard path must reproduce the expected answer exactly
        # (ignoring case and surrounding whitespace).
        expected = expected_answer.lower()
        actual = str(result).strip().lower()
        validation_status = "β correct" if expected == actual else "β incorrect"
        print(f"π Validation: {validation_status}")

        # The direct path returns free text, so look for the expected answer
        # as a standalone number. A bare substring test would also accept
        # "13" or "30" when the expected answer is "3".
        answer_pattern = rf"(?<!\d){re.escape(expected_answer)}(?!\d)"
        if re.search(answer_pattern, direct_result):
            print("π Direct validation: β correct")
        else:
            print("π Direct validation: β incorrect")
    finally:
        # Undo the monkeypatch even if something above raised.
        print("π Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
async def test_async_youtube_question():
    """Run the bird-species YouTube question through the async test system.

    Swaps ``gaia_tools.analyze_youtube_video`` for the mock, points the test
    system's question loader at a single hard-coded question, runs
    ``run_comprehensive_test(question_limit=1)``, prints the outcome, and
    restores the original tool in all cases.
    """
    print("π Replacing YouTube analysis tool with mock implementation in async test...")
    gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
    try:
        system = HFAsyncGAIATestSystem(
            max_concurrent=1,
            timeout_seconds=60,
            output_dir="/tmp/async_youtube_test"
        )

        # The only question this run should see.
        questions = [
            {
                'task_id': 'a1e91b78-d3d8-4675-bb8d-62741b4b68a6',
                'Question': 'In the video https://www.youtube.com/watch?v=L1vXCYZAYYM, what is the highest number of bird species to be on camera simultaneously?',
                'Final Answer': '3'
            }
        ]

        async def mock_load_questions(*args, **kwargs):
            # Replacement loader: ignores its arguments, returns the fixed list.
            return questions

        # Remember the real loader, then substitute the fixed-list one.
        original_load_method = system.load_gaia_questions
        system.load_gaia_questions = mock_load_questions

        # Re-assert the mock right before the run and announce it.
        gaia_tools.analyze_youtube_video = mock_analyze_youtube_video
        print("πΉ Mock is active for async test - will analyze YouTube video")

        print("π Running async test with YouTube question...")
        result = await system.run_comprehensive_test(question_limit=1)

        print("\nπ Async Test Results:")
        print(f"Total questions processed: {result['total_questions']}")
        print(f"Status counts: {result['status_counts']}")

        # Report on the single question, keyed by its task id.
        question_id = questions[0]['task_id']
        per_question = result['results']
        if question_id not in per_question:
            print(f"No results found for question ID {question_id}")
        else:
            question_result = per_question[question_id]
            answer = question_result.get('answer', 'No answer')
            validation = question_result.get('validation_status', 'unknown')
            print(f"\nQuestion ID: {question_id}")
            print(f"Answer: {answer}")
            print(f"Validation: {validation}")

        # Put the real loader back on the system object.
        system.load_gaia_questions = original_load_method
    finally:
        # Undo the monkeypatch whether or not the run succeeded.
        print("π Restoring original YouTube analysis tool...")
        gaia_tools.analyze_youtube_video = original_analyze_youtube_video
async def main():
    """Run the direct test first, then the async test, printing progress banners."""
    # Synchronous path.
    print("π Starting direct YouTube question test...")
    test_direct_youtube_question()

    # Asynchronous path, awaited so it finishes before the closing banner.
    print("\n\nπ Starting async YouTube question test...")
    await test_async_youtube_question()

    print("\nβ All tests completed!")


if __name__ == "__main__":
    # Script entry point: run the coroutine to completion.
    asyncio.run(main())