|
import gradio as gr |
|
import json |
|
import os |
|
from datetime import datetime |
|
from agent import GAIAAgent |
|
from evaluate import evaluate_agent, create_sample_dataset |
|
import traceback |
|
|
|
# Score (in percent) at or above which the certificate counts as achieved.
_CERTIFICATE_THRESHOLD = 30
# File whose head is shown in the report.
# NOTE(review): presumably written by evaluate_agent() — confirm in evaluate.py.
_SUBMISSION_PATH = "submission.jsonl"
# Maximum number of characters of the submission file shown in the report.
_PREVIEW_CHARS = 500


def _read_submission_preview(path=_SUBMISSION_PATH, limit=_PREVIEW_CHARS):
    """Return the first *limit* characters of *path*, plus '...' if truncated.

    A missing file yields an empty string.
    """
    try:
        # Explicit UTF-8 so the preview does not depend on the platform's
        # default encoding; undecodable bytes are replaced rather than
        # aborting the whole evaluation over a cosmetic preview.
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            # One extra character is enough to know whether we truncated.
            head = f.read(limit + 1)
    except FileNotFoundError:
        return ""
    if len(head) > limit:
        return head[:limit] + "..."
    return head


def _format_results_report(timestamp, score, test_response, preview):
    """Render the Markdown report shown after a successful evaluation."""
    # NOTE(review): the emoji below were reconstructed from mis-encoded
    # text in the original strings; confirm the glyph choices.
    if score >= _CERTIFICATE_THRESHOLD:
        status = f"✅ ACHIEVED (≥{_CERTIFICATE_THRESHOLD}%)"
        next_steps = (
            "🎉 Congratulations! You can now claim your "
            "Certificate of Excellence!"
        )
    else:
        status = f"❌ NOT ACHIEVED (<{_CERTIFICATE_THRESHOLD}%)"
        next_steps = (
            "💪 Keep improving your agent to reach the "
            f"{_CERTIFICATE_THRESHOLD}% threshold."
        )

    lines = [
        "",
        "# GAIA Agent Evaluation Results",
        "",
        f"**Timestamp:** {timestamp}",
        f"**Final Score:** {score:.2f}%",
        f"**Certificate Status:** {status}",
        "",
        "## API Connection Status",
        f"{test_response}",
        "",
        "## Submission File Preview",
        "```json",
        preview,
        "```",
        "",
        "## Next Steps",
        next_steps,
        "",
    ]
    return "\n".join(lines)


def _format_error_report(error, trace):
    """Render the Markdown report shown when the evaluation raised."""
    lines = [
        "",
        "# Evaluation Error",
        "",
        f"**Error:** {error}",
        "",
        "**Traceback:**",
        "```",
        trace,
        "```",
        "",
        "Please check the logs and fix any issues before retrying.",
        "",
    ]
    return "\n".join(lines)


def run_evaluation():
    """Run the GAIA evaluation and return results.

    Creates a ``GAIAAgent``, calls its ``test_grok()`` connectivity check,
    runs ``evaluate_agent`` on up to 10 tasks, and renders a Markdown
    report that includes a preview of ``submission.jsonl`` when that file
    exists.

    Returns:
        A ``(report, score)`` tuple. ``report`` is a Markdown string and
        ``score`` is the value returned by ``evaluate_agent``. If anything
        raises, ``report`` describes the error (including the traceback)
        and ``score`` is ``0.0``; the exception is not propagated.
    """
    # This is a UI callback, so the whole run is one error boundary: any
    # failure is turned into a readable report instead of a raw exception.
    try:
        print("Starting GAIA Agent Evaluation...")
        print("=" * 50)

        agent = GAIAAgent()

        print("Testing xAI API connection...")
        test_response = agent.test_grok()
        print(f"API Test Response: {test_response}")

        print("\nRunning evaluation on sample tasks...")
        score = evaluate_agent(dataset_path=None, max_tasks=10)

        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        report = _format_results_report(
            timestamp,
            score,
            test_response,
            _read_submission_preview(),
        )
        return report, score

    except Exception as e:
        trace = traceback.format_exc()
        # The report tells the user to check the logs, so make sure the
        # failure actually appears there.
        print("Evaluation failed:")
        print(trace)
        return _format_error_report(e, trace), 0.0
|
|
|
# Static Markdown shown in the UI. Kept flush-left at module level so the
# rendered text never depends on how the surrounding code is indented.
# NOTE(review): the emoji in these strings were reconstructed from
# mis-encoded text in the original; confirm the glyph choices.
_INTRO_MARKDOWN = """
# 🤖 GAIA Agent Evaluation

This is your GAIA benchmark agent for the Hugging Face Agents Course Certificate of Excellence.

**Goal:** Achieve ≥30% score on GAIA benchmark tasks

Click the button below to run the evaluation and submit your answers.

⚠️ **Note:** This may take several minutes to complete. Please be patient.
"""

_ABOUT_MARKDOWN = """
---

## About This Agent

- **API:** xAI Grok for reasoning
- **Tools:** Web search, file handling, math calculations
- **Fallbacks:** Local knowledge for common questions
- **Target:** 30% accuracy for certificate eligibility

## Troubleshooting

If you encounter issues:
1. Check the container logs in the "Logs" tab
2. Verify API credentials and internet connectivity
3. Ensure all dependencies are installed

**Good luck! 🍀**
"""


def create_interface():
    """Create the Gradio interface.

    Builds a ``gr.Blocks`` page containing an introduction, a primary
    button, a results row (Markdown status on the left, numeric score on
    the right) and a static about/troubleshooting section. Clicking the
    button calls ``run_evaluation`` with no inputs and routes its
    ``(report, score)`` return value to the two output components.

    Returns:
        The constructed ``gr.Blocks`` object; it is not launched here.
    """
    with gr.Blocks(title="GAIA Agent Evaluation", theme=gr.themes.Soft()) as demo:
        gr.Markdown(_INTRO_MARKDOWN)

        with gr.Row():
            run_btn = gr.Button(
                "🚀 Run Evaluation & Submit All Answers",
                variant="primary",
                size="lg",
            )

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Run Status / Submission Result")
                results_output = gr.Markdown(
                    "Click the button above to start evaluation..."
                )

            with gr.Column():
                gr.Markdown("## Score")
                score_output = gr.Number(
                    label="Final Score (%)",
                    value=0.0,
                    interactive=False,
                )

        # Output order must match the (report, score) tuple that
        # run_evaluation returns.
        run_btn.click(
            fn=run_evaluation,
            inputs=[],
            outputs=[results_output, score_output],
            show_progress=True,
        )

        gr.Markdown(_ABOUT_MARKDOWN)

    return demo
|
|
|
if __name__ == "__main__":
    # Script entry point: build the UI and serve it on all interfaces at
    # port 7860, with errors shown in the UI and the API page hidden.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
        "show_api": False,
    }
    demo = create_interface()
    demo.launch(**launch_options)