File size: 4,179 Bytes
4fae186
945d0d0
 
 
 
 
 
4fae186
945d0d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fae186
945d0d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import gradio as gr
import json
import os
from datetime import datetime
from agent import GAIAAgent
from evaluate import evaluate_agent, create_sample_dataset
import traceback

def run_evaluation():
    """Run the GAIA evaluation and return a Markdown report plus the score.

    Steps, in order: construct a ``GAIAAgent``, call its ``test_grok()``
    check, run ``evaluate_agent`` on at most 10 tasks, then read the start of
    ``submission.jsonl`` (if it exists) to show a preview in the report.

    Returns:
        A ``(report, score)`` tuple. ``report`` is Markdown text. On success
        ``score`` is whatever ``evaluate_agent`` returned. If any
        ``Exception`` is raised along the way, ``report`` contains the error
        message and traceback, and ``score`` is ``0.0``.
    """
    submission_path = "submission.jsonl"
    preview_limit = 500  # max characters of the submission shown in the report

    try:
        print("Starting GAIA Agent Evaluation...")
        print("=" * 50)

        # Initialize agent
        agent = GAIAAgent()

        # Test API connection first
        print("Testing xAI API connection...")
        test_response = agent.test_grok()
        print(f"API Test Response: {test_response}")

        # Run evaluation on sample dataset (since we don't have the full GAIA dataset)
        print("\nRunning evaluation on sample tasks...")
        score = evaluate_agent(dataset_path=None, max_tasks=10)

        # Read only as much of the submission file as the preview needs.
        # One extra character tells us whether the preview was truncated.
        # Explicit UTF-8 keeps decoding independent of the host locale, and
        # errors="replace" stops a stray byte from failing the whole report.
        # NOTE(review): the file is read as found on disk; presumably
        # evaluate_agent writes it - confirm, otherwise this may preview the
        # output of an earlier run.
        try:
            with open(submission_path, "r", encoding="utf-8", errors="replace") as f:
                head = f.read(preview_limit + 1)
        except FileNotFoundError:
            head = ""
        preview = head[:preview_limit]
        suffix = "..." if len(head) > preview_limit else ""

        # Format results
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        passed = score >= 30
        status = "✅ ACHIEVED (≥30%)" if passed else "❌ NOT ACHIEVED (<30%)"
        if passed:
            next_steps = "🎉 Congratulations! You can now claim your Certificate of Excellence!"
        else:
            next_steps = "💪 Keep improving your agent to reach the 30% threshold."

        results = f"""
# GAIA Agent Evaluation Results

**Timestamp:** {timestamp}
**Final Score:** {score:.2f}%
**Certificate Status:** {status}

## API Connection Status
{test_response}

## Submission File Preview
```json
{preview}{suffix}
```

## Next Steps
{next_steps}
        """

        return results, score

    except Exception as e:
        # Top-level boundary for the UI callback: report the failure in the
        # page instead of raising, and also print it so that the logs the
        # message points to actually contain the error.
        tb_text = traceback.format_exc()
        print(f"Evaluation failed: {e}")
        print(tb_text)

        error_msg = f"""
# Evaluation Error

**Error:** {str(e)}

**Traceback:**
```
{tb_text}
```

Please check the logs and fix any issues before retrying.
        """
        return error_msg, 0.0

def create_interface():
    """Build the Gradio UI for running the evaluation.

    The page has an intro text, one button, a Markdown area for the run
    report, a read-only number field for the score, and a static
    "about / troubleshooting" footer. Clicking the button calls
    ``run_evaluation`` and writes its two return values to the report area
    and the score field, in that order.

    Returns:
        The constructed ``gr.Blocks`` object. It is not launched here.
    """

    with gr.Blocks(title="GAIA Agent Evaluation", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🤖 GAIA Agent Evaluation
        
        This is your GAIA benchmark agent for the Hugging Face Agents Course Certificate of Excellence.
        
        **Goal:** Achieve ≥30% score on GAIA benchmark tasks
        
        Click the button below to run the evaluation and submit your answers.
        
        ⚠️ **Note:** This may take several minutes to complete. Please be patient.
        """)

        with gr.Row():
            run_btn = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Run Status / Submission Result")
                results_output = gr.Markdown("Click the button above to start evaluation...")

            with gr.Column():
                gr.Markdown("## Score")
                score_output = gr.Number(label="Final Score (%)", value=0.0, interactive=False)

        # Event handler: run_evaluation takes no inputs and returns
        # (report, score), matching the order of the outputs list.
        run_btn.click(
            fn=run_evaluation,
            inputs=[],
            outputs=[results_output, score_output],
            show_progress=True
        )

        gr.Markdown("""
        ---
        
        ## About This Agent
        
        - **API:** xAI Grok for reasoning
        - **Tools:** Web search, file handling, math calculations
        - **Fallbacks:** Local knowledge for common questions
        - **Target:** 30% accuracy for certificate eligibility
        
        ## Troubleshooting
        
        If you encounter issues:
        1. Check the container logs in the "Logs" tab
        2. Verify API credentials and internet connectivity
        3. Ensure all dependencies are installed
        
        **Good luck! 🍀**
        """)

    return demo

if __name__ == "__main__":
    # Script entry point: build the UI, then start the Gradio server with
    # the options below.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "show_api": False,
    }
    demo = create_interface()
    demo.launch(**launch_options)