schoolkithub's picture
Upload 8 files
f56e889 verified
raw
history blame
4.18 kB
import gradio as gr
import json
import os
from datetime import datetime
from agent import GAIAAgent
from evaluate import evaluate_agent, create_sample_dataset
import traceback
def run_evaluation():
"""Run the GAIA evaluation and return results."""
try:
print("Starting GAIA Agent Evaluation...")
print("=" * 50)
# Initialize agent
agent = GAIAAgent()
# Test API connection first
print("Testing xAI API connection...")
test_response = agent.test_grok()
print(f"API Test Response: {test_response}")
# Run evaluation on sample dataset (since we don't have the full GAIA dataset)
print("\nRunning evaluation on sample tasks...")
score = evaluate_agent(dataset_path=None, max_tasks=10)
# Read submission file if it exists
submission_content = ""
if os.path.exists("submission.jsonl"):
with open("submission.jsonl", "r") as f:
submission_content = f.read()
# Format results
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
results = f"""
# GAIA Agent Evaluation Results
**Timestamp:** {timestamp}
**Final Score:** {score:.2f}%
**Certificate Status:** {'βœ… ACHIEVED (β‰₯30%)' if score >= 30 else '❌ NOT ACHIEVED (<30%)'}
## API Connection Status
{test_response}
## Submission File Preview
```json
{submission_content[:500]}{'...' if len(submission_content) > 500 else ''}
```
## Next Steps
{'πŸŽ‰ Congratulations! You can now claim your Certificate of Excellence!' if score >= 30 else 'πŸ’ͺ Keep improving your agent to reach the 30% threshold.'}
"""
return results, score
except Exception as e:
error_msg = f"""
# Evaluation Error
**Error:** {str(e)}
**Traceback:**
```
{traceback.format_exc()}
```
Please check the logs and fix any issues before retrying.
"""
return error_msg, 0.0
def create_interface():
"""Create the Gradio interface."""
with gr.Blocks(title="GAIA Agent Evaluation", theme=gr.themes.Soft()) as demo:
gr.Markdown("""
# πŸ€– GAIA Agent Evaluation
This is your GAIA benchmark agent for the Hugging Face Agents Course Certificate of Excellence.
**Goal:** Achieve β‰₯30% score on GAIA benchmark tasks
Click the button below to run the evaluation and submit your answers.
⚠️ **Note:** This may take several minutes to complete. Please be patient.
""")
with gr.Row():
run_btn = gr.Button("πŸš€ Run Evaluation & Submit All Answers", variant="primary", size="lg")
with gr.Row():
with gr.Column():
gr.Markdown("## Run Status / Submission Result")
results_output = gr.Markdown("Click the button above to start evaluation...")
with gr.Column():
gr.Markdown("## Score")
score_output = gr.Number(label="Final Score (%)", value=0.0, interactive=False)
# Event handler
run_btn.click(
fn=run_evaluation,
inputs=[],
outputs=[results_output, score_output],
show_progress=True
)
gr.Markdown("""
---
## About This Agent
- **API:** xAI Grok for reasoning
- **Tools:** Web search, file handling, math calculations
- **Fallbacks:** Local knowledge for common questions
- **Target:** 30% accuracy for certificate eligibility
## Troubleshooting
If you encounter issues:
1. Check the container logs in the "Logs" tab
2. Verify API credentials and internet connectivity
3. Ensure all dependencies are installed
**Good luck! πŸ€**
""")
return demo
if __name__ == "__main__":
# Create and launch the interface
demo = create_interface()
demo.launch(
server_name="0.0.0.0",
server_port=7860,
show_error=True,
show_api=False
)