#!/usr/bin/env python3
"""
Analyze GAIA test results and generate specific improvement recommendations
"""
import json
import argparse
from pathlib import Path
from collections import defaultdict, Counter
from typing import Dict, List, Optional
class GAIAResultsAnalyzer:
    """Analyze test results and generate actionable improvement recommendations.

    Loads a GAIA test-results JSON file and prints a series of reports:
    overall performance, error patterns, per-agent improvement suggestions,
    prompt tweaks, and a prioritized action plan.  All report methods are
    tolerant of missing keys in the results payload (they substitute empty
    or zero defaults instead of raising KeyError).
    """

    # Canonical remediation suggestions per error type.  Defined once at class
    # level so the mapping is not rebuilt on every suggest_fix_for_error_type call.
    _ERROR_TYPE_FIXES: Dict[str, List[str]] = {
        'API_OVERLOAD': [
            "Implement exponential backoff with retry logic",
            "Add multiple API endpoint fallbacks",
            "Implement request queuing and rate limiting"
        ],
        'TIMEOUT': [
            "Increase timeout limits in API calls",
            "Implement progress tracking for long operations",
            "Break down complex operations into smaller steps"
        ],
        'AUTHENTICATION': [
            "Verify all API keys are correctly configured",
            "Add API key validation at startup",
            "Implement automatic token refresh mechanisms"
        ],
        'WIKIPEDIA_TOOL': [
            "Enhance Wikipedia search with multiple search strategies",
            "Add fallback to direct HTTP requests",
            "Improve article name parsing and disambiguation"
        ],
        'CHESS_TOOL': [
            "Enhance FEN notation validation and correction",
            "Add multiple chess engine backends",
            "Implement position verification with multiple tools"
        ],
        'EXCEL_TOOL': [
            "Add support for more Excel formats (.xlsb, .csv)",
            "Implement better column detection algorithms",
            "Add data validation and error recovery"
        ],
        'VIDEO_TOOL': [
            "Implement video size and duration limits",
            "Add fallback to frame-only analysis",
            "Improve audio extraction and transcription"
        ],
        'GEMINI_API': [
            "Add Gemini API error handling and retries",
            "Implement fallback to other vision models",
            "Add request size validation and optimization"
        ],
        'FILE_PROCESSING': [
            "Enhance file download with retry logic",
            "Add file format validation before processing",
            "Implement temporary file cleanup mechanisms"
        ],
        'HALLUCINATION': [
            "Strengthen anti-hallucination prompts",
            "Force tool output usage over model reasoning",
            "Add response validation against tool outputs"
        ],
        'PARSING_ERROR': [
            "Improve output parsing with multiple regex patterns",
            "Add structured output validation",
            "Implement fallback parsing strategies"
        ]
    }

    def __init__(self, results_file: str):
        # Path to the JSON results file produced by the test runner.
        self.results_file = results_file
        # Parsed results payload; {} when the file is missing or malformed.
        self.results_data = self.load_results()

    def load_results(self) -> Dict:
        """Load test results from JSON file.

        Returns:
            The parsed JSON payload, or {} when the file is missing or
            contains invalid JSON (a message is printed in either case).
        """
        try:
            with open(self.results_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            print(f"β Results file not found: {self.results_file}")
            return {}
        except json.JSONDecodeError:
            print(f"β Invalid JSON in results file: {self.results_file}")
            return {}

    def analyze_overall_performance(self) -> None:
        """Print overall testing performance plus a per-agent breakdown."""
        if not self.results_data:
            return
        print("π OVERALL PERFORMANCE ANALYSIS")
        print("=" * 50)
        overall_stats = self.results_data.get('overall_stats', {})
        agent_performance = self.results_data.get('agent_performance', {})
        print(f"Total Questions: {overall_stats.get('total_questions', 0)}")
        print(f"Success Rate: {overall_stats.get('success_rate', 0):.1f}%")
        print(f"Successful: {overall_stats.get('successful', 0)}")
        print(f"Errors: {overall_stats.get('errors', 0)}")
        print(f"\nπ― AGENT PERFORMANCE BREAKDOWN:")
        # Best-performing agents first; a missing success_rate sorts as 0
        # instead of raising KeyError on malformed stats.
        ranked = sorted(agent_performance.items(),
                        key=lambda item: item[1].get('success_rate', 0),
                        reverse=True)
        for agent_type, stats in ranked:
            success_rate = stats.get('success_rate', 0)
            status_emoji = "π’" if success_rate >= 90 else "π‘" if success_rate >= 70 else "π΄"
            print(f" {status_emoji} {agent_type}: {success_rate:.1f}% ({stats.get('successful', 0)}/{stats.get('total_questions', 0)})")
            if stats.get('average_solve_time', 0) > 0:
                print(f" Average Time: {stats['average_solve_time']:.1f}s")

    def analyze_error_patterns(self) -> None:
        """Print error patterns per agent and the top error types overall."""
        print(f"\nπ ERROR PATTERN ANALYSIS")
        print("=" * 50)
        error_patterns = self.results_data.get('error_patterns', {})
        if not error_patterns:
            print("π No error patterns found!")
            return
        # Aggregate error types across all agents
        all_error_types = Counter()
        for agent_type, errors in error_patterns.items():
            print(f"\nπ¨ {agent_type.upper()} ERRORS:")
            agent_error_types = Counter()
            for error in errors:
                error_type = error.get('error_type', 'UNKNOWN')
                agent_error_types[error_type] += 1
                all_error_types[error_type] += 1
            for error_type, count in agent_error_types.most_common():
                print(f" - {error_type}: {count} occurrences")
        print(f"\nπ MOST COMMON ERROR TYPES (All Agents):")
        for error_type, count in all_error_types.most_common(5):
            print(f" {count}Γ {error_type}")

    def generate_specific_improvements(self) -> None:
        """Print actionable improvement recommendations per agent type.

        Recommendations are tiered by success rate: >=95 optimization only,
        >=80 targeted fixes, >=60 fixes plus prompt review, otherwise a
        major-overhaul checklist.
        """
        print(f"\nπ‘ SPECIFIC IMPROVEMENT RECOMMENDATIONS")
        print("=" * 50)
        agent_performance = self.results_data.get('agent_performance', {})
        error_patterns = self.results_data.get('error_patterns', {})
        detailed_results = self.results_data.get('detailed_results', [])
        # Analyze each agent type
        for agent_type, stats in agent_performance.items():
            success_rate = stats.get('success_rate', 0)
            print(f"\nπ― {agent_type.upper()} AGENT IMPROVEMENTS:")
            if success_rate >= 95:
                print(f" β Excellent performance! Focus on optimization:")
                print(f" - Fine-tune prompts for edge cases")
                print(f" - Optimize solve time (current: {stats.get('average_solve_time', 0):.1f}s)")
            elif success_rate >= 80:
                print(f" π‘ Good performance with improvement opportunities:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
            elif success_rate >= 60:
                print(f" π Moderate performance - needs attention:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
                print(f" - Consider prompt engineering review")
                print(f" - Add more robust error handling")
            else:
                print(f" π΄ Poor performance - requires major overhaul:")
                self.suggest_improvements_for_agent(agent_type, error_patterns.get(agent_type, []), detailed_results)
                print(f" - Review agent architecture and tool selection")
                print(f" - Consider multi-agent coordination")
                print(f" - Implement comprehensive testing for this agent type")

    def suggest_improvements_for_agent(self, agent_type: str, errors: List[Dict], all_results: List[Dict]) -> None:
        """Print improvement suggestions for one agent's three most common error types.

        Args:
            agent_type: Agent identifier (used by callers for context only).
            errors: Error records for this agent; each may carry 'error_type'.
            all_results: Full detailed results (currently unused; kept for
                interface compatibility with existing callers).
        """
        if not errors:
            print(f" - No specific errors to address")
            return
        # Analyze error types for this agent
        error_type_counts = Counter()
        specific_errors = defaultdict(list)
        for error in errors:
            error_type = error.get('error_type', 'UNKNOWN')
            error_type_counts[error_type] += 1
            specific_errors[error_type].append(error)
        # Generate specific fixes for top error types
        for error_type, count in error_type_counts.most_common(3):
            print(f" - Fix {error_type} errors ({count} occurrences):")
            self.suggest_fix_for_error_type(error_type, specific_errors[error_type])

    def suggest_fix_for_error_type(self, error_type: str, specific_errors: List[Dict]) -> None:
        """Print up to two canned fixes for an error type, plus one example error."""
        suggestions = self._ERROR_TYPE_FIXES.get(error_type, ["Investigate root cause and implement appropriate fix"])
        for suggestion in suggestions[:2]:  # Show top 2 suggestions
            print(f" β {suggestion}")
        # Show example error if available
        if specific_errors:
            example = specific_errors[0]
            question_id = example.get('question_id', 'unknown')[:8]
            print(f" Example: {question_id}... - {example.get('question_preview', '')[:50]}...")

    def generate_prompt_improvements(self) -> None:
        """Print prompt-improvement suggestions grouped by failing agent type."""
        print(f"\nπ PROMPT IMPROVEMENT SUGGESTIONS")
        print("=" * 50)
        detailed_results = self.results_data.get('detailed_results', [])
        # .get() so a record missing 'status' is skipped rather than raising KeyError.
        failed_results = [r for r in detailed_results if r.get('status') == 'error']
        if not failed_results:
            print("π No failed results to analyze for prompt improvements!")
            return
        # Group failures by agent type
        failures_by_agent = defaultdict(list)
        for result in failed_results:
            failures_by_agent[result.get('agent_type', 'unknown')].append(result)
        for agent_type, failures in failures_by_agent.items():
            print(f"\nπ― {agent_type.upper()} PROMPT IMPROVEMENTS:")
            # Analyze common failure patterns
            question_patterns = []
            for failure in failures:
                question = failure.get('question', '')
                if len(question) > 50:
                    question_patterns.append(question[:100] + "...")
            if agent_type == 'research':
                print(f" - Add more specific Wikipedia search guidance")
                print(f" - Strengthen temporal query parsing (e.g., 'as of July 2023')")
                print(f" - Enhance data extraction and validation prompts")
            elif agent_type == 'multimedia':
                print(f" - Improve video/audio analysis instructions")
                print(f" - Add specific guidance for character dialogue extraction")
                print(f" - Enhance image analysis with structured output requirements")
            elif agent_type == 'logic_math':
                print(f" - Add step-by-step mathematical reasoning guidance")
                print(f" - Strengthen calculation verification prompts")
                print(f" - Improve pattern recognition instructions")
            elif agent_type == 'file_processing':
                print(f" - Enhance Excel analysis with column filtering guidance")
                print(f" - Add specific data aggregation instructions")
                print(f" - Improve Python code execution safety prompts")
            # Show example failed questions
            if question_patterns:
                print(f" Failed question examples:")
                for pattern in question_patterns[:2]:
                    print(f" - {pattern}")

    def create_action_plan(self) -> None:
        """Print a prioritized action plan, worst-performing agents first."""
        print(f"\nπ PRIORITIZED ACTION PLAN")
        print("=" * 50)
        agent_performance = self.results_data.get('agent_performance', {})
        # Sort agents by success rate (lowest first - highest priority);
        # missing success_rate sorts as 0 instead of raising KeyError.
        sorted_agents = sorted(agent_performance.items(), key=lambda x: x[1].get('success_rate', 0))
        print(f"Priority order (based on success rate):")
        for i, (agent_type, stats) in enumerate(sorted_agents, 1):
            success_rate = stats.get('success_rate', 0)
            total_questions = stats.get('total_questions', 0)
            print(f"\n{i}. {agent_type.upper()} AGENT (Success: {success_rate:.1f}%)")
            print(f" Questions: {total_questions}")
            if success_rate < 70:
                print(f" π΄ HIGH PRIORITY - Major improvements needed")
                print(f" Actions: Review architecture, enhance tools, rewrite prompts")
            elif success_rate < 85:
                print(f" π‘ MEDIUM PRIORITY - Targeted improvements")
                print(f" Actions: Fix specific error patterns, optimize prompts")
            else:
                print(f" π’ LOW PRIORITY - Fine-tuning only")
                print(f" Actions: Edge case handling, performance optimization")
        print(f"\nπ RECOMMENDED WORKFLOW:")
        print(f"1. Start with highest priority agent type")
        print(f"2. Implement suggested improvements")
        print(f"3. Re-test only that agent type: --agent-types {sorted_agents[0][0] if sorted_agents else 'unknown'}")
        print(f"4. Repeat until success rate > 85%")
        print(f"5. Move to next priority agent type")
def main():
    """Main CLI interface for results analysis"""
    arg_parser = argparse.ArgumentParser(
        description="Analyze GAIA test results and generate improvement recommendations")
    arg_parser.add_argument('results_file', help='Path to the test results JSON file')
    arg_parser.add_argument('--detailed', action='store_true',
                            help='Show detailed analysis including individual errors')
    cli_args = arg_parser.parse_args()

    # Guard clause: bail out before constructing the analyzer when the
    # results file does not exist.
    if not Path(cli_args.results_file).exists():
        print(f"β Results file not found: {cli_args.results_file}")
        return

    analyzer = GAIAResultsAnalyzer(cli_args.results_file)
    print("π GAIA TEST RESULTS ANALYSIS")
    print("=" * 70)
    # Run each report section in its fixed order.
    for report in (analyzer.analyze_overall_performance,
                   analyzer.analyze_error_patterns,
                   analyzer.generate_specific_improvements,
                   analyzer.generate_prompt_improvements,
                   analyzer.create_action_plan):
        report()
    print(f"\nβ ANALYSIS COMPLETE!")
    print(f"π Use the action plan above to prioritize improvements")
if __name__ == "__main__":
    # Script entry point; no side effects when imported as a module.
    main()