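# Gradio front end for the GAIA agent evaluation: a one-button Hugging Face Space
# that runs the agent over sample GAIA tasks, reports the score, and previews the
# generated submission file.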
import gradio as gr
import json
import os
from datetime import datetime
from agent import GAIAAgent
from evaluate import evaluate_agent, create_sample_dataset
import traceback
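
# Note: the local `agent` and `evaluate` modules are assumed (based on how they
# are used below) to provide GAIAAgent.test_grok() as a connectivity check and
# evaluate_agent(dataset_path, max_tasks) returning an accuracy score in percent;
# dataset_path=None presumably falls back to the bundled sample tasks.
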
def run_evaluation():
    """Run the GAIA evaluation and return results."""
    try:
        print("Starting GAIA Agent Evaluation...")
        print("=" * 50)

        # Initialize agent
        agent = GAIAAgent()

        # Test API connection first
        print("Testing xAI API connection...")
        test_response = agent.test_grok()
        print(f"API Test Response: {test_response}")

        # Run evaluation on sample dataset (since we don't have the full GAIA dataset)
        print("\nRunning evaluation on sample tasks...")
        score = evaluate_agent(dataset_path=None, max_tasks=10)

        # Read submission file if it exists
        submission_content = ""
        if os.path.exists("submission.jsonl"):
            with open("submission.jsonl", "r") as f:
                submission_content = f.read()

        # Format results
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        results = f"""
# GAIA Agent Evaluation Results

**Timestamp:** {timestamp}
**Final Score:** {score:.2f}%
**Certificate Status:** {'✅ ACHIEVED (≥30%)' if score >= 30 else '❌ NOT ACHIEVED (<30%)'}

## API Connection Status
{test_response}

## Submission File Preview
```json
{submission_content[:500]}{'...' if len(submission_content) > 500 else ''}
```

## Next Steps
{'🎉 Congratulations! You can now claim your Certificate of Excellence!' if score >= 30 else '💪 Keep improving your agent to reach the 30% threshold.'}
"""
        return results, score

    except Exception as e:
        error_msg = f"""
# Evaluation Error

**Error:** {str(e)}

**Traceback:**
```
{traceback.format_exc()}
```

Please check the logs and fix any issues before retrying.
"""
        return error_msg, 0.0
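
# Note: submission.jsonl is assumed to be written by evaluate_agent in the usual
# GAIA leaderboard format (one JSON object per line, e.g. with "task_id" and
# "model_answer" fields); this app only previews the file, it does not write it.
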
def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="GAIA Agent Evaluation", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🤖 GAIA Agent Evaluation

        This is your GAIA benchmark agent for the Hugging Face Agents Course Certificate of Excellence.

        **Goal:** Achieve ≥30% score on GAIA benchmark tasks

        Click the button below to run the evaluation and submit your answers.

        ⚠️ **Note:** This may take several minutes to complete. Please be patient.
        """)

        with gr.Row():
            run_btn = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Run Status / Submission Result")
                results_output = gr.Markdown("Click the button above to start evaluation...")
            with gr.Column():
                gr.Markdown("## Score")
                score_output = gr.Number(label="Final Score (%)", value=0.0, interactive=False)

        # Event handler
        run_btn.click(
            fn=run_evaluation,
            inputs=[],
            outputs=[results_output, score_output],
            show_progress=True
        )

        gr.Markdown("""
        ---
        ## About This Agent
        - **API:** xAI Grok for reasoning
        - **Tools:** Web search, file handling, math calculations
        - **Fallbacks:** Local knowledge for common questions
        - **Target:** 30% accuracy for certificate eligibility

        ## Troubleshooting
        If you encounter issues:
        1. Check the container logs in the "Logs" tab
        2. Verify API credentials and internet connectivity
        3. Ensure all dependencies are installed

        **Good luck! 🍀**
        """)

    return demo
if __name__ == "__main__":
    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        show_api=False
    )
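
# Local usage sketch (assumptions: this file is saved as app.py, the Space's
# dependencies are installed, and the xAI API key is available in the environment
# under whatever variable name agent.py expects):
#
#   python app.py        # then open http://localhost:7860
#
# On Hugging Face Spaces, launch() is picked up automatically; 7860 is the port
# Spaces expects a Gradio app to serve on.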