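# Gradio front end for the GAIA agent evaluation: a one-button Hugging Face Space
# that runs the agent over sample GAIA tasks, reports the score, and previews the
# generated submission file.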
import gradio as gr
import json
import os
from datetime import datetime
from agent import GAIAAgent
from evaluate import evaluate_agent, create_sample_dataset
import traceback
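
# Note: the local `agent` and `evaluate` modules are assumed (based on how they
# are used below) to provide GAIAAgent.test_grok() as a connectivity check and
# evaluate_agent(dataset_path, max_tasks) returning an accuracy score in percent;
# dataset_path=None presumably falls back to the bundled sample tasks.
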
def run_evaluation():
    """Run the GAIA evaluation and return results."""
    try:
        print("Starting GAIA Agent Evaluation...")
        print("=" * 50)

        # Initialize agent
        agent = GAIAAgent()

        # Test API connection first
        print("Testing xAI API connection...")
        test_response = agent.test_grok()
        print(f"API Test Response: {test_response}")

        # Run evaluation on sample dataset (since we don't have the full GAIA dataset)
        print("\nRunning evaluation on sample tasks...")
        score = evaluate_agent(dataset_path=None, max_tasks=10)

        # Read submission file if it exists
        submission_content = ""
        if os.path.exists("submission.jsonl"):
            with open("submission.jsonl", "r") as f:
                submission_content = f.read()

        # Format results
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        results = f"""
# GAIA Agent Evaluation Results

**Timestamp:** {timestamp}
**Final Score:** {score:.2f}%
**Certificate Status:** {'✅ ACHIEVED (≥30%)' if score >= 30 else '❌ NOT ACHIEVED (<30%)'}

## API Connection Status
{test_response}

## Submission File Preview
```json
{submission_content[:500]}{'...' if len(submission_content) > 500 else ''}
```

## Next Steps
{'🎉 Congratulations! You can now claim your Certificate of Excellence!' if score >= 30 else '💪 Keep improving your agent to reach the 30% threshold.'}
"""
        return results, score

    except Exception as e:
        error_msg = f"""
# Evaluation Error

**Error:** {str(e)}

**Traceback:**
```
{traceback.format_exc()}
```

Please check the logs and fix any issues before retrying.
"""
        return error_msg, 0.0
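
# Note: submission.jsonl is assumed to be written by evaluate_agent in the usual
# GAIA leaderboard format (one JSON object per line, e.g. with "task_id" and
# "model_answer" fields); this app only previews the file, it does not write it.
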
def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="GAIA Agent Evaluation", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🤖 GAIA Agent Evaluation

        This is your GAIA benchmark agent for the Hugging Face Agents Course Certificate of Excellence.

        **Goal:** Achieve ≥30% score on GAIA benchmark tasks

        Click the button below to run the evaluation and submit your answers.

        ⚠️ **Note:** This may take several minutes to complete. Please be patient.
        """)

        with gr.Row():
            run_btn = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary", size="lg")

        with gr.Row():
            with gr.Column():
                gr.Markdown("## Run Status / Submission Result")
                results_output = gr.Markdown("Click the button above to start evaluation...")
            with gr.Column():
                gr.Markdown("## Score")
                score_output = gr.Number(label="Final Score (%)", value=0.0, interactive=False)

        # Event handler
        run_btn.click(
            fn=run_evaluation,
            inputs=[],
            outputs=[results_output, score_output],
            show_progress=True
        )

        gr.Markdown("""
        ---
        ## About This Agent
        - **API:** xAI Grok for reasoning
        - **Tools:** Web search, file handling, math calculations
        - **Fallbacks:** Local knowledge for common questions
        - **Target:** 30% accuracy for certificate eligibility

        ## Troubleshooting
        If you encounter issues:
        1. Check the container logs in the "Logs" tab
        2. Verify API credentials and internet connectivity
        3. Ensure all dependencies are installed

        **Good luck! 🍀**
        """)

    return demo
if __name__ == "__main__":
    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
        show_api=False
    )
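
# Local usage sketch (assumptions: this file is saved as app.py, the Space's
# dependencies are installed, and the xAI API key is available in the environment
# under whatever variable name agent.py expects):
#
#   python app.py        # then open http://localhost:7860
#
# On Hugging Face Spaces, launch() is picked up automatically; 7860 is the port
# Spaces expects a Gradio app to serve on.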