Spaces:
Running
Running
Commit
·
6441bc6
0
Parent(s):
Leaderboard deployment 2025-07-16 18:05:41
Browse files- README.md +69 -0
- app.py +266 -0
- image/image.png +0 -0
- logo.png +1 -0
- process_data/README.md +94 -0
- process_data/__init__.py +11 -0
- process_data/__pycache__/__init__.cpython-312.pyc +0 -0
- process_data/__pycache__/config.cpython-312.pyc +0 -0
- process_data/__pycache__/download_data.cpython-312.pyc +0 -0
- process_data/config.py +25 -0
- process_data/config_db.py +32 -0
- process_data/db_to_hf.py +167 -0
- process_data/download_data.py +149 -0
- process_data/requirements.txt +5 -0
- process_data/run_pipeline.sh +34 -0
- requirements.txt +5 -0
- src/__init__.py +3 -0
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/__pycache__/about.cpython-312.pyc +0 -0
- src/__pycache__/display_utils.cpython-312.pyc +0 -0
- src/__pycache__/leaderboard_utils.cpython-312.pyc +0 -0
- src/about.py +66 -0
- src/display_utils.py +566 -0
- src/leaderboard_utils.py +193 -0
README.md
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: FutureBench Leaderboard
|
3 |
+
emoji: 🔮
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: purple
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.44.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
# FutureBench Leaderboard App
|
13 |
+
|
14 |
+
A minimal Gradio application for viewing FutureBench prediction data. This app downloads datasets from HuggingFace on startup and provides a web interface to explore the data.
|
15 |
+
|
16 |
+
## Features
|
17 |
+
|
18 |
+
- 📊 **Data Summary**: View dataset statistics and information
|
19 |
+
- 🔍 **Sample Data**: Browse sample prediction records
|
20 |
+
- 📋 **About**: Learn about the FutureBench system
|
21 |
+
- 🔄 **Auto-refresh**: Download latest data on startup
|
22 |
+
- 📅 **Date Range Slider**: Filter the leaderboard by a custom date span
|
23 |
+
|
24 |
+
## Setup
|
25 |
+
|
26 |
+
1. Install dependencies:
|
27 |
+
```bash
|
28 |
+
pip install -r requirements.txt
|
29 |
+
```
|
30 |
+
|
31 |
+
2. (Optional) Set your HuggingFace token for private repositories:
|
32 |
+
```bash
|
33 |
+
export HF_TOKEN=your_token_here
|
34 |
+
```
|
35 |
+
|
36 |
+
## Running the App
|
37 |
+
|
38 |
+
Launch the Gradio application:
|
39 |
+
|
40 |
+
```bash
|
41 |
+
python app.py
|
42 |
+
```
|
43 |
+
|
44 |
+
The app will:
|
45 |
+
1. Download datasets from HuggingFace repositories on startup
|
46 |
+
2. Process the data and create summaries
|
47 |
+
3. Launch a web interface at `http://localhost:7860`
|
48 |
+
|
49 |
+
## Data Sources
|
50 |
+
|
51 |
+
The app downloads data from these HuggingFace repositories:
|
52 |
+
- `futurebench/requests` - Evaluation queue
|
53 |
+
- `futurebench/results` - Evaluation results
|
54 |
+
- `futurebench/data` - Main prediction dataset
|
55 |
+
|
56 |
+
## Structure
|
57 |
+
|
58 |
+
- `app.py` - Main Gradio application
|
59 |
+
- `process_data/` - Data processing utilities
|
60 |
+
- `requirements.txt` - Python dependencies
|
61 |
+
- `README.md` - This file
|
62 |
+
|
63 |
+
## Next Steps
|
64 |
+
|
65 |
+
This is a minimal version focusing on data download and display. Future enhancements will include:
|
66 |
+
- Full leaderboard with model rankings
|
67 |
+
- Interactive filtering and sorting
|
68 |
+
- Detailed performance metrics
|
69 |
+
- Model comparison tools
|
app.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import pandas as pd
|
5 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
6 |
+
from gradio_rangeslider import RangeSlider
|
7 |
+
from huggingface_hub import snapshot_download
|
8 |
+
|
9 |
+
# Import our data processing utilities
|
10 |
+
from process_data import API, DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
11 |
+
|
12 |
+
# Import our leaderboard components
|
13 |
+
from src.about import ABOUT_TEXT, INTRODUCTION_TEXT, TITLE
|
14 |
+
from src.display_utils import CUSTOM_CSS, get_display_columns
|
15 |
+
from src.leaderboard_utils import create_leaderboard_df, get_available_weeks, get_leaderboard_summary
|
16 |
+
|
17 |
+
# Global variables for data
|
18 |
+
PREDICTIONS_DF = None
|
19 |
+
LEADERBOARD_DF = None
|
20 |
+
PREDICTION_DATES = []
|
21 |
+
AVAILABLE_WEEKS = []
|
22 |
+
DATA_SUMMARY = {}
|
23 |
+
|
24 |
+
|
25 |
+
def restart_space():
|
26 |
+
"""Restart the space if needed"""
|
27 |
+
API.restart_space(repo_id=REPO_ID)
|
28 |
+
|
29 |
+
|
30 |
+
def download_and_process_data():
|
31 |
+
"""Download and process data on startup"""
|
32 |
+
global PREDICTIONS_DF, LEADERBOARD_DF, PREDICTION_DATES, AVAILABLE_WEEKS, DATA_SUMMARY
|
33 |
+
|
34 |
+
print("=== Starting Data Download ===")
|
35 |
+
|
36 |
+
# Download eval requests (queue)
|
37 |
+
try:
|
38 |
+
print(f"Downloading eval requests to {EVAL_REQUESTS_PATH}")
|
39 |
+
snapshot_download(
|
40 |
+
repo_id=QUEUE_REPO,
|
41 |
+
local_dir=EVAL_REQUESTS_PATH,
|
42 |
+
repo_type="dataset",
|
43 |
+
tqdm_class=None,
|
44 |
+
etag_timeout=30,
|
45 |
+
token=TOKEN,
|
46 |
+
)
|
47 |
+
print("✓ Eval requests downloaded successfully")
|
48 |
+
except Exception as e:
|
49 |
+
print(f"Error downloading eval requests: {e}")
|
50 |
+
|
51 |
+
# Download eval results
|
52 |
+
try:
|
53 |
+
print(f"Downloading eval results to {EVAL_RESULTS_PATH}")
|
54 |
+
snapshot_download(
|
55 |
+
repo_id=RESULTS_REPO,
|
56 |
+
local_dir=EVAL_RESULTS_PATH,
|
57 |
+
repo_type="dataset",
|
58 |
+
tqdm_class=None,
|
59 |
+
etag_timeout=30,
|
60 |
+
token=TOKEN,
|
61 |
+
)
|
62 |
+
print("✓ Eval results downloaded successfully")
|
63 |
+
except Exception as e:
|
64 |
+
print(f"Error downloading eval results: {e}")
|
65 |
+
|
66 |
+
# Download prediction data (main dataset)
|
67 |
+
try:
|
68 |
+
print(f"Downloading prediction data to {PREDICTIONS_CSV_PATH}")
|
69 |
+
snapshot_download(
|
70 |
+
repo_id=DATA_REPO,
|
71 |
+
local_dir=PREDICTIONS_CSV_PATH,
|
72 |
+
repo_type="dataset",
|
73 |
+
tqdm_class=None,
|
74 |
+
etag_timeout=30,
|
75 |
+
token=TOKEN,
|
76 |
+
)
|
77 |
+
print("✓ Prediction data downloaded successfully")
|
78 |
+
except Exception as e:
|
79 |
+
print(f"Error downloading prediction data: {e}")
|
80 |
+
|
81 |
+
# Process the data
|
82 |
+
print("=== Processing Data ===")
|
83 |
+
|
84 |
+
# Load the main dataset
|
85 |
+
csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
|
86 |
+
if os.path.exists(csv_path):
|
87 |
+
print(f"Loading data from {csv_path}")
|
88 |
+
PREDICTIONS_DF = pd.read_csv(csv_path)
|
89 |
+
|
90 |
+
# Convert date columns
|
91 |
+
PREDICTIONS_DF["open_to_bet_until"] = pd.to_datetime(PREDICTIONS_DF["open_to_bet_until"])
|
92 |
+
PREDICTIONS_DF["prediction_created_at"] = pd.to_datetime(PREDICTIONS_DF["prediction_created_at"])
|
93 |
+
|
94 |
+
# Get prediction dates
|
95 |
+
PREDICTION_DATES = sorted(PREDICTIONS_DF["open_to_bet_until"].dt.date.unique())
|
96 |
+
|
97 |
+
# Get available weeks for filtering
|
98 |
+
AVAILABLE_WEEKS = get_available_weeks(PREDICTIONS_DF)
|
99 |
+
|
100 |
+
# Create leaderboard
|
101 |
+
print("Creating leaderboard...")
|
102 |
+
LEADERBOARD_DF = create_leaderboard_df(PREDICTIONS_DF)
|
103 |
+
|
104 |
+
# Create data summary
|
105 |
+
leaderboard_summary = get_leaderboard_summary(LEADERBOARD_DF)
|
106 |
+
DATA_SUMMARY = {
|
107 |
+
"total_records": len(PREDICTIONS_DF),
|
108 |
+
"unique_events": PREDICTIONS_DF["event_id"].nunique(),
|
109 |
+
"unique_algorithms": PREDICTIONS_DF["algorithm_name"].nunique(),
|
110 |
+
"unique_event_types": PREDICTIONS_DF["event_type"].nunique(),
|
111 |
+
"date_range": f"{PREDICTION_DATES[0]} to {PREDICTION_DATES[-1]}" if PREDICTION_DATES else "N/A",
|
112 |
+
"algorithms": PREDICTIONS_DF["algorithm_name"].unique().tolist(),
|
113 |
+
"event_types": PREDICTIONS_DF["event_type"].unique().tolist(),
|
114 |
+
"leaderboard_summary": leaderboard_summary,
|
115 |
+
}
|
116 |
+
|
117 |
+
print("✓ Data processed successfully")
|
118 |
+
print(f" - Total records: {DATA_SUMMARY['total_records']}")
|
119 |
+
print(f" - Unique events: {DATA_SUMMARY['unique_events']}")
|
120 |
+
print(f" - Unique algorithms: {DATA_SUMMARY['unique_algorithms']}")
|
121 |
+
print(f" - Leaderboard models: {leaderboard_summary['total_models']}")
|
122 |
+
print(f" - Date range: {DATA_SUMMARY['date_range']}")
|
123 |
+
|
124 |
+
else:
|
125 |
+
print(f"❌ Error: data.csv not found at {csv_path}")
|
126 |
+
PREDICTIONS_DF = pd.DataFrame()
|
127 |
+
LEADERBOARD_DF = pd.DataFrame()
|
128 |
+
DATA_SUMMARY = {"error": "No data found"}
|
129 |
+
|
130 |
+
|
131 |
+
def get_leaderboard(date_range=None):
|
132 |
+
"""Return leaderboard filtered by date range"""
|
133 |
+
if PREDICTIONS_DF is None or PREDICTIONS_DF.empty:
|
134 |
+
return pd.DataFrame({"message": ["No data available"]})
|
135 |
+
|
136 |
+
# Determine range of dates to filter by
|
137 |
+
if not PREDICTION_DATES:
|
138 |
+
return pd.DataFrame({"message": ["No dates available"]})
|
139 |
+
|
140 |
+
if date_range is None:
|
141 |
+
start_idx, end_idx = 0, len(PREDICTION_DATES) - 1
|
142 |
+
else:
|
143 |
+
start_idx, end_idx = date_range
|
144 |
+
start_idx = max(0, min(start_idx, len(PREDICTION_DATES) - 1))
|
145 |
+
end_idx = max(start_idx, min(end_idx, len(PREDICTION_DATES) - 1))
|
146 |
+
start_idx, end_idx = int(start_idx), int(end_idx)
|
147 |
+
|
148 |
+
week_range = (PREDICTION_DATES[start_idx], PREDICTION_DATES[end_idx])
|
149 |
+
|
150 |
+
# Create filtered leaderboard
|
151 |
+
filtered_leaderboard = create_leaderboard_df(PREDICTIONS_DF, week_range)
|
152 |
+
|
153 |
+
if filtered_leaderboard.empty:
|
154 |
+
return pd.DataFrame({"message": ["No data available for selected week"]})
|
155 |
+
|
156 |
+
# Return only display columns
|
157 |
+
display_cols = get_display_columns()
|
158 |
+
available_cols = [col for col in display_cols if col in filtered_leaderboard.columns]
|
159 |
+
|
160 |
+
return filtered_leaderboard[available_cols]
|
161 |
+
|
162 |
+
|
163 |
+
def get_data_summary():
|
164 |
+
"""Return formatted data summary"""
|
165 |
+
if not DATA_SUMMARY:
|
166 |
+
return "No data loaded"
|
167 |
+
|
168 |
+
if "error" in DATA_SUMMARY:
|
169 |
+
return f"Error: {DATA_SUMMARY['error']}"
|
170 |
+
|
171 |
+
summary = DATA_SUMMARY.get("leaderboard_summary", {})
|
172 |
+
|
173 |
+
summary_text = f"""
|
174 |
+
# 🏆 Leaderboard Summary
|
175 |
+
|
176 |
+
- **Models Ranked**: {summary.get("total_models", 0)}
|
177 |
+
- **Total Predictions**: {summary.get("total_predictions", 0):,}
|
178 |
+
- **Average Accuracy**: {summary.get("avg_accuracy", 0):.1f}%
|
179 |
+
|
180 |
+
# 📊 Dataset Overview
|
181 |
+
|
182 |
+
- **Total Records**: {DATA_SUMMARY["total_records"]:,}
|
183 |
+
- **Unique Events**: {DATA_SUMMARY["unique_events"]:,}
|
184 |
+
- **Event Types**: {DATA_SUMMARY["unique_event_types"]}
|
185 |
+
- **Date Range**: {DATA_SUMMARY["date_range"]}
|
186 |
+
|
187 |
+
## 🤖 Models
|
188 |
+
{", ".join(DATA_SUMMARY["algorithms"])}
|
189 |
+
|
190 |
+
## 📋 Event Types
|
191 |
+
{", ".join(DATA_SUMMARY["event_types"])}
|
192 |
+
"""
|
193 |
+
|
194 |
+
return summary_text
|
195 |
+
|
196 |
+
|
197 |
+
def get_sample_data():
|
198 |
+
"""Return sample of the data"""
|
199 |
+
if PREDICTIONS_DF is None or PREDICTIONS_DF.empty:
|
200 |
+
return pd.DataFrame({"message": ["No data available"]})
|
201 |
+
|
202 |
+
# Return first 10 rows with key columns
|
203 |
+
sample_cols = ["event_id", "question", "event_type", "algorithm_name", "actual_prediction", "result", "open_to_bet_until"]
|
204 |
+
available_cols = [col for col in sample_cols if col in PREDICTIONS_DF.columns]
|
205 |
+
|
206 |
+
return PREDICTIONS_DF[available_cols].head(10)
|
207 |
+
|
208 |
+
|
209 |
+
def refresh_all_data(date_range=None):
|
210 |
+
"""Refresh all data and return updated components"""
|
211 |
+
download_and_process_data()
|
212 |
+
return (
|
213 |
+
get_leaderboard(date_range),
|
214 |
+
get_data_summary(),
|
215 |
+
get_sample_data(),
|
216 |
+
)
|
217 |
+
|
218 |
+
|
219 |
+
# Download and process data on startup
|
220 |
+
download_and_process_data()
|
221 |
+
|
222 |
+
# Create Gradio interface
|
223 |
+
with gr.Blocks(css=CUSTOM_CSS, title="FutureBench Leaderboard") as demo:
|
224 |
+
gr.HTML(TITLE)
|
225 |
+
with gr.Row():
|
226 |
+
gr.Image("image/image.png", height=200, width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, container=False, elem_classes="center-logo")
|
227 |
+
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
228 |
+
|
229 |
+
with gr.Tabs():
|
230 |
+
with gr.TabItem("🏆 Leaderboard"):
|
231 |
+
leaderboard_display = gr.Dataframe(value=get_leaderboard(), interactive=False, wrap=True, elem_id="leaderboard-table")
|
232 |
+
|
233 |
+
with gr.Row():
|
234 |
+
date_slider = RangeSlider(
|
235 |
+
minimum=0,
|
236 |
+
maximum=len(PREDICTION_DATES) - 1,
|
237 |
+
value=(0, len(PREDICTION_DATES) - 1),
|
238 |
+
step=1,
|
239 |
+
label="📅 Date Range",
|
240 |
+
show_label=True,
|
241 |
+
labels=[str(d) for d in PREDICTION_DATES],
|
242 |
+
)
|
243 |
+
|
244 |
+
# Update leaderboard when date range is changed
|
245 |
+
date_slider.change(get_leaderboard, inputs=date_slider, outputs=leaderboard_display)
|
246 |
+
|
247 |
+
with gr.TabItem("📊 Summary"):
|
248 |
+
summary_display = gr.Markdown(get_data_summary(), elem_classes="markdown-text")
|
249 |
+
refresh_summary_btn = gr.Button("🔄 Refresh Summary")
|
250 |
+
|
251 |
+
refresh_summary_btn.click(lambda: get_data_summary(), outputs=summary_display)
|
252 |
+
|
253 |
+
with gr.TabItem("🔍 Sample Data"):
|
254 |
+
sample_display = gr.Dataframe(value=get_sample_data(), interactive=False, wrap=True)
|
255 |
+
refresh_sample_btn = gr.Button("🔄 Refresh Sample")
|
256 |
+
|
257 |
+
refresh_sample_btn.click(lambda: get_sample_data(), outputs=sample_display)
|
258 |
+
|
259 |
+
with gr.TabItem("📋 About"):
|
260 |
+
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
|
261 |
+
|
262 |
+
if __name__ == "__main__":
|
263 |
+
scheduler = BackgroundScheduler()
|
264 |
+
scheduler.add_job(restart_space, "interval", seconds=1800)
|
265 |
+
scheduler.start()
|
266 |
+
demo.queue(default_concurrency_limit=40).launch()
|
image/image.png
ADDED
![]() |
logo.png
ADDED
![]() |
process_data/README.md
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# FutureBench Dataset Processing
|
2 |
+
|
3 |
+
This directory contains tools for processing FutureBench datasets, both downloading from HuggingFace and transforming your own database into the standard format.
|
4 |
+
|
5 |
+
## Option 1: Download from HuggingFace (Original)
|
6 |
+
|
7 |
+
Use this to download the existing FutureBench dataset:
|
8 |
+
|
9 |
+
```bash
|
10 |
+
python download_data.py
|
11 |
+
```
|
12 |
+
|
13 |
+
## Option 2: Transform Your Own Database
|
14 |
+
|
15 |
+
Use this to transform your production database into HuggingFace format:
|
16 |
+
|
17 |
+
### Setup
|
18 |
+
|
19 |
+
1. **Install dependencies:**
|
20 |
+
```bash
|
21 |
+
pip install pandas sqlalchemy huggingface_hub
|
22 |
+
```
|
23 |
+
|
24 |
+
2. **Set up HuggingFace token:**
|
25 |
+
```bash
|
26 |
+
export HF_TOKEN="your_huggingface_token_here"
|
27 |
+
```
|
28 |
+
|
29 |
+
3. **Configure your settings:**
|
30 |
+
Edit `config_db.py` to match your needs:
|
31 |
+
- Update `HF_CONFIG` with your HuggingFace repository names
|
32 |
+
- Adjust `PROCESSING_CONFIG` for data filtering preferences
|
33 |
+
- Note: Database connection uses the same setup as the main FutureBench app
|
34 |
+
|
35 |
+
### Usage
|
36 |
+
|
37 |
+
```bash
|
38 |
+
# Transform your database and upload to HuggingFace
|
39 |
+
python db_to_hf.py
|
40 |
+
|
41 |
+
# Or run locally without uploading
|
42 |
+
HF_TOKEN="" python db_to_hf.py
|
43 |
+
```
|
44 |
+
|
45 |
+
### Database Schema
|
46 |
+
|
47 |
+
The script uses the same database schema as the main FutureBench application:
|
48 |
+
- `EventBase` model for events
|
49 |
+
- `Prediction` model for predictions
|
50 |
+
- Uses SQLAlchemy ORM (same as `convert_to_csv.py`)
|
51 |
+
|
52 |
+
No additional database configuration needed - it uses the existing FutureBench database connection.
|
53 |
+
|
54 |
+
### Output Format
|
55 |
+
|
56 |
+
The script produces data in the same format as the original FutureBench dataset:
|
57 |
+
- `event_id`, `question`, `event_type`, `algorithm_name`, `actual_prediction`, `result`, `open_to_bet_until`, `prediction_created_at`
|
58 |
+
|
59 |
+
### Automation
|
60 |
+
|
61 |
+
You can run this as a scheduled job:
|
62 |
+
|
63 |
+
```bash
|
64 |
+
# Add to crontab to run daily at 2 AM
|
65 |
+
0 2 * * * cd /path/to/your/project && python leaderboard/process_data/db_to_hf.py
|
66 |
+
```
|
67 |
+
|
68 |
+
## Files
|
69 |
+
|
70 |
+
- `download_data.py` - Downloads data from HuggingFace repositories
|
71 |
+
- `db_to_hf.py` - Transforms your database to HuggingFace format
|
72 |
+
- `config_db.py` - Configuration for database connection and HF settings
|
73 |
+
- `config.py` - HuggingFace repository configuration
|
74 |
+
- `requirements.txt` - Python dependencies
|
75 |
+
|
76 |
+
## Data Structure
|
77 |
+
|
78 |
+
The main dataset contains:
|
79 |
+
- `event_id`: Unique identifier for each event
|
80 |
+
- `question`: The prediction question
|
81 |
+
- `event_type`: Type of event (polymarket, soccer, etc.)
|
82 |
+
- `answer_options`: Possible answers in JSON format
|
83 |
+
- `result`: Actual outcome (if resolved)
|
84 |
+
- `algorithm_name`: AI model that made the prediction
|
85 |
+
- `actual_prediction`: The prediction made
|
86 |
+
- `open_to_bet_until`: Prediction window deadline
|
87 |
+
- `prediction_created_at`: When prediction was made
|
88 |
+
|
89 |
+
## Output
|
90 |
+
|
91 |
+
The script generates:
|
92 |
+
- Downloaded datasets in local cache folders
|
93 |
+
- `evaluation_queue.csv` with unique events for processing
|
94 |
+
- Console output with data statistics and summary
|
process_data/__init__.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
FutureBench Data Processing
|
3 |
+
|
4 |
+
This package contains utilities for downloading and processing FutureBench datasets from HuggingFace.
|
5 |
+
"""
|
6 |
+
|
7 |
+
from .config import API, DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, OWNER, PREDICTIONS_CSV_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
8 |
+
from .download_data import download_datasets, generate_queue, process_data
|
9 |
+
|
10 |
+
__version__ = "0.1.0"
|
11 |
+
__all__ = ["TOKEN", "OWNER", "QUEUE_REPO", "RESULTS_REPO", "DATA_REPO", "REPO_ID", "EVAL_REQUESTS_PATH", "EVAL_RESULTS_PATH", "PREDICTIONS_CSV_PATH", "API", "download_datasets", "process_data", "generate_queue"]
|
process_data/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (805 Bytes). View file
|
|
process_data/__pycache__/config.cpython-312.pyc
ADDED
Binary file (1.02 kB). View file
|
|
process_data/__pycache__/download_data.cpython-312.pyc
ADDED
Binary file (6.45 kB). View file
|
|
process_data/config.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from huggingface_hub import HfApi
|
4 |
+
|
5 |
+
# Configuration for HuggingFace repositories
|
6 |
+
# ------------------------------------------
|
7 |
+
TOKEN = os.environ.get("HF_TOKEN") # A read token for accessing datasets
|
8 |
+
|
9 |
+
OWNER = "futurebench" # Change to your organization
|
10 |
+
# ------------------------------------------
|
11 |
+
|
12 |
+
# HuggingFace repository IDs
|
13 |
+
QUEUE_REPO = f"{OWNER}/requests"
|
14 |
+
RESULTS_REPO = f"{OWNER}/results"
|
15 |
+
DATA_REPO = f"{OWNER}/data"
|
16 |
+
|
17 |
+
# Local cache paths
|
18 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
19 |
+
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
20 |
+
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
21 |
+
PREDICTIONS_CSV_PATH = os.path.join(CACHE_PATH, "eval-data")
|
22 |
+
REPO_ID = f"{OWNER}/Future-Bench"
|
23 |
+
|
24 |
+
# HuggingFace API client
|
25 |
+
API = HfApi(token=TOKEN)
|
process_data/config_db.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Configuration for database to HuggingFace pipeline.
|
3 |
+
Update these settings to match your setup.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Database Configuration
|
9 |
+
# Note: Database connection is handled by future_bench.database.get_session()
|
10 |
+
# The script uses the same database connection as the main FutureBench app
|
11 |
+
|
12 |
+
|
13 |
+
# HuggingFace Configuration
|
14 |
+
HF_CONFIG = {
|
15 |
+
"token": os.getenv("HF_TOKEN"), # Set this in your environment
|
16 |
+
"data_repo": "futurebench/data",
|
17 |
+
"results_repo": "futurebench/results",
|
18 |
+
"requests_repo": "futurebench/requests", # Optional: for model submissions
|
19 |
+
}
|
20 |
+
|
21 |
+
# Data Processing Settings
|
22 |
+
PROCESSING_CONFIG = {
|
23 |
+
"days_history": 180, # How many days of data to include
|
24 |
+
"min_predictions": 5, # Minimum predictions per model to include
|
25 |
+
"event_types": ["news", "polymarket", "sports"], # Which event types to include
|
26 |
+
"exclude_models": ["test", "debug"], # Models to exclude from public dataset
|
27 |
+
}
|
28 |
+
|
29 |
+
# Note: Schema mapping not needed since we use SQLAlchemy ORM models
|
30 |
+
# The script uses the same models as convert_to_csv.py:
|
31 |
+
# - EventBase (events table)
|
32 |
+
# - Prediction (predictions table)
|
process_data/db_to_hf.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Script to transform your production database into HuggingFace dataset format.
|
4 |
+
Follows the same pattern as FutureBench's convert_to_csv.py but simplified.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
import tempfile
|
10 |
+
from datetime import datetime
|
11 |
+
|
12 |
+
import pandas as pd
|
13 |
+
from huggingface_hub import HfApi
|
14 |
+
|
15 |
+
# Add the parent directory to sys.path to allow imports (same as convert_to_csv.py)
|
16 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
17 |
+
|
18 |
+
# Import FutureBench models and database (same as convert_to_csv.py)
|
19 |
+
# Import configuration
|
20 |
+
from config_db import HF_CONFIG, PROCESSING_CONFIG
|
21 |
+
|
22 |
+
from future_bench.database import get_session
|
23 |
+
from future_bench.models import EventBase, Prediction
|
24 |
+
|
25 |
+
|
26 |
+
def datetime_to_string(dt):
|
27 |
+
"""Convert datetime to string or return empty string if None (same as convert_to_csv.py)"""
|
28 |
+
return dt.isoformat() if dt else ""
|
29 |
+
|
30 |
+
|
31 |
+
def extract_events_and_predictions(session):
|
32 |
+
"""
|
33 |
+
Extract events and predictions from your database.
|
34 |
+
Uses the same SQLAlchemy ORM approach as convert_to_csv.py.
|
35 |
+
"""
|
36 |
+
# Get all events (same as convert_to_csv.py)
|
37 |
+
events = session.query(EventBase).all()
|
38 |
+
if not events:
|
39 |
+
print("No events found in the database.")
|
40 |
+
return pd.DataFrame()
|
41 |
+
|
42 |
+
# Get all predictions (same as convert_to_csv.py)
|
43 |
+
predictions = session.query(Prediction).all()
|
44 |
+
if not predictions:
|
45 |
+
print("No predictions found in the database.")
|
46 |
+
return pd.DataFrame()
|
47 |
+
|
48 |
+
# Create combined view (same logic as convert_to_csv.py)
|
49 |
+
combined_data = []
|
50 |
+
for event in events:
|
51 |
+
if event.result is None: # Skip unresolved events
|
52 |
+
continue
|
53 |
+
|
54 |
+
event_predictions = [p for p in predictions if p.event_id == event.id]
|
55 |
+
for pred in event_predictions:
|
56 |
+
combined_data.append(
|
57 |
+
{
|
58 |
+
"event_id": event.id,
|
59 |
+
"question": event.question,
|
60 |
+
"event_type": event.event_type,
|
61 |
+
"open_to_bet_until": datetime_to_string(event.open_to_bet_until),
|
62 |
+
"result": event.result,
|
63 |
+
"algorithm_name": pred.algorithm_name,
|
64 |
+
"actual_prediction": pred.actual_prediction,
|
65 |
+
"prediction_created_at": datetime_to_string(pred.created_at),
|
66 |
+
}
|
67 |
+
)
|
68 |
+
|
69 |
+
df = pd.DataFrame(combined_data)
|
70 |
+
return df
|
71 |
+
|
72 |
+
|
73 |
+
def transform_to_standard_format(df):
|
74 |
+
"""
|
75 |
+
Transform your raw data into the standard format expected by your leaderboard.
|
76 |
+
This should match the CSV format your leaderboard already expects.
|
77 |
+
"""
|
78 |
+
# Convert date columns with flexible parsing for microseconds
|
79 |
+
df["open_to_bet_until"] = pd.to_datetime(df["open_to_bet_until"], format="mixed")
|
80 |
+
df["prediction_created_at"] = pd.to_datetime(df["prediction_created_at"], format="mixed")
|
81 |
+
|
82 |
+
# Add any additional columns your leaderboard expects
|
83 |
+
df["source"] = "your-app" # Add source identifier
|
84 |
+
|
85 |
+
# Filter to data starting from June 12th
|
86 |
+
cutoff_date = datetime(2025, 6, 12)
|
87 |
+
df = df[df["prediction_created_at"] >= cutoff_date]
|
88 |
+
print(f" Filtered to predictions created from {cutoff_date.strftime('%B %d, %Y')} onwards: {len(df)} records remaining")
|
89 |
+
|
90 |
+
# Filter by event types
|
91 |
+
df = df[df["event_type"].isin(PROCESSING_CONFIG["event_types"])]
|
92 |
+
|
93 |
+
# Exclude test models
|
94 |
+
df = df[~df["algorithm_name"].isin(PROCESSING_CONFIG["exclude_models"])]
|
95 |
+
|
96 |
+
# Calculate accuracy per model (for summary)
|
97 |
+
accuracy_df = df.groupby(["algorithm_name", "event_type"]).agg({"actual_prediction": "count", "result": lambda x: (df.loc[x.index, "actual_prediction"] == x).sum()}).rename(columns={"actual_prediction": "total_predictions", "result": "correct_predictions"}).reset_index()
|
98 |
+
|
99 |
+
accuracy_df["accuracy"] = accuracy_df["correct_predictions"] / accuracy_df["total_predictions"]
|
100 |
+
|
101 |
+
return df, accuracy_df
|
102 |
+
|
103 |
+
|
104 |
+
def upload_to_huggingface(df, accuracy_df, repo_data, repo_results):
|
105 |
+
"""
|
106 |
+
Upload the transformed data to HuggingFace repositories.
|
107 |
+
"""
|
108 |
+
api = HfApi(token=HF_CONFIG["token"])
|
109 |
+
|
110 |
+
# Create temporary directory for files
|
111 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
112 |
+
# Save main dataset
|
113 |
+
data_path = os.path.join(tmp_dir, "data.csv")
|
114 |
+
df.to_csv(data_path, index=False)
|
115 |
+
|
116 |
+
# Save accuracy summary
|
117 |
+
results_path = os.path.join(tmp_dir, "results.csv")
|
118 |
+
accuracy_df.to_csv(results_path, index=False)
|
119 |
+
|
120 |
+
# Upload to data repo
|
121 |
+
api.upload_file(path_or_fileobj=data_path, path_in_repo="data.csv", repo_id=repo_data, repo_type="dataset")
|
122 |
+
|
123 |
+
# Upload to results repo
|
124 |
+
api.upload_file(path_or_fileobj=results_path, path_in_repo="results.csv", repo_id=repo_results, repo_type="dataset")
|
125 |
+
|
126 |
+
print(f"✅ Uploaded data to {repo_data}")
|
127 |
+
print(f"✅ Uploaded results to {repo_results}")
|
128 |
+
|
129 |
+
|
130 |
+
def main():
|
131 |
+
"""Main pipeline function"""
|
132 |
+
print("🚀 Starting database to HuggingFace pipeline...")
|
133 |
+
|
134 |
+
# Step 1: Extract from database (same as convert_to_csv.py)
|
135 |
+
print("📊 Extracting data from database...")
|
136 |
+
session = next(get_session())
|
137 |
+
try:
|
138 |
+
df = extract_events_and_predictions(session)
|
139 |
+
print(f" Found {len(df)} event-prediction pairs")
|
140 |
+
finally:
|
141 |
+
session.close()
|
142 |
+
|
143 |
+
if len(df) == 0:
|
144 |
+
print("❌ No data found in database")
|
145 |
+
return
|
146 |
+
|
147 |
+
# Step 2: Transform to standard format
|
148 |
+
print("🔄 Transforming data...")
|
149 |
+
df, accuracy_df = transform_to_standard_format(df)
|
150 |
+
print(f" Processed {len(df)} records")
|
151 |
+
print(f" Generated accuracy stats for {len(accuracy_df)} model-task pairs")
|
152 |
+
|
153 |
+
# Step 3: Upload to HuggingFace
|
154 |
+
if HF_CONFIG["token"]:
|
155 |
+
print("☁️ Uploading to HuggingFace...")
|
156 |
+
upload_to_huggingface(df, accuracy_df, HF_CONFIG["data_repo"], HF_CONFIG["results_repo"])
|
157 |
+
else:
|
158 |
+
print("⚠️ No HF_TOKEN found, saving locally instead...")
|
159 |
+
df.to_csv("data_export.csv", index=False)
|
160 |
+
accuracy_df.to_csv("results_export.csv", index=False)
|
161 |
+
print(" Saved data_export.csv and results_export.csv")
|
162 |
+
|
163 |
+
print("✅ Pipeline completed successfully!")
|
164 |
+
|
165 |
+
|
166 |
+
if __name__ == "__main__":
|
167 |
+
main()
|
process_data/download_data.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
import os
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
from huggingface_hub import snapshot_download
|
6 |
+
|
7 |
+
from .config import DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN
|
8 |
+
|
9 |
+
|
10 |
+
def download_datasets():
    """Fetch the three HuggingFace dataset repos the leaderboard depends on.

    Downloads are best-effort: a failure on one repo is logged and does not
    stop the remaining downloads, so the app can still start on partial data.
    """

    def _download_repo(repo_id, local_dir, label):
        # Snapshot an entire dataset repo into local_dir. Errors are printed
        # rather than raised so one broken repo doesn't abort the others
        # (same behavior as the original copy-pasted try/except stanzas).
        try:
            print(f"Downloading {label} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                tqdm_class=None,  # suppress progress bars in server logs
                etag_timeout=30,
                token=TOKEN,
            )
            print(f"✓ {label.capitalize()} downloaded successfully")
        except Exception as e:
            print(f"Error downloading {label}: {e}")

    print("Downloading datasets from HuggingFace...")

    # Eval requests (queue), eval results, and the main prediction dataset.
    _download_repo(QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests")
    _download_repo(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")
    _download_repo(DATA_REPO, PREDICTIONS_CSV_PATH, "prediction data")
|
58 |
+
|
59 |
+
|
60 |
+
def process_data():
    """Load the downloaded predictions CSV, normalise it, and summarise it.

    Returns:
        (DataFrame, dict) on success, or (None, None) when the expected
        data.csv is missing from the download directory.
    """
    print("Processing downloaded data...")

    csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
    if not os.path.exists(csv_path):
        print(f"Error: data.csv not found at {csv_path}")
        return None, None

    print(f"Loading data from {csv_path}")
    df = pd.read_csv(csv_path)

    # Parse both timestamp columns up front so downstream code can use the
    # .dt accessor directly.
    for column in ("open_to_bet_until", "prediction_created_at"):
        df[column] = pd.to_datetime(df[column])

    print(f"Loaded {len(df)} records")
    print(f"Data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Distinct prediction-window end dates, in chronological order.
    prediction_dates = sorted(df["open_to_bet_until"].dt.date.unique())
    print(f"Prediction dates: {prediction_dates}")

    # Distinct competing models and event categories.
    algorithms = df["algorithm_name"].unique()
    print(f"Algorithms: {algorithms}")

    event_types = df["event_type"].unique()
    print(f"Event types: {event_types}")

    summary = {
        "total_records": len(df),
        "unique_events": df["event_id"].nunique(),
        "unique_algorithms": len(algorithms),
        "unique_event_types": len(event_types),
        "prediction_dates": prediction_dates,
        "algorithms": algorithms.tolist(),
        "event_types": event_types.tolist(),
    }

    print("\n=== Data Summary ===")
    for key, value in summary.items():
        print(f"{key}: {value}")

    return df, summary
|
101 |
+
|
102 |
+
|
103 |
+
def generate_queue(df):
    """Collapse per-prediction rows into one row per event and persist that
    table as the evaluation queue CSV.

    Returns the per-event DataFrame (both pending and resolved events).
    """
    print("Generating evaluation queue...")

    # Keep the first occurrence of each descriptive field per event.
    first_of = {
        "question": "first",
        "event_type": "first",
        "answer_options": "first",
        "result": "first",
        "open_to_bet_until": "first",
    }
    unique_events = df.groupby("event_id").agg(first_of).reset_index()

    # Split on whether an outcome has been recorded yet (counts are only
    # reported; the full table is written either way).
    resolved_mask = unique_events["result"].notna()
    pending_events = unique_events[~resolved_mask]
    resolved_events = unique_events[resolved_mask]

    print(f"Total unique events: {len(unique_events)}")
    print(f"Pending events: {len(pending_events)}")
    print(f"Resolved events: {len(resolved_events)}")

    # Persist the queue next to the downloaded data.
    queue_path = os.path.join(PREDICTIONS_CSV_PATH, "evaluation_queue.csv")
    unique_events.to_csv(queue_path, index=False)
    print(f"✓ Queue saved to {queue_path}")

    return unique_events
|
124 |
+
|
125 |
+
|
126 |
+
def main():
    """Main function to download and process data.

    Orchestrates the local pipeline end to end: pull the dataset repos from
    HuggingFace, load and summarise the predictions CSV, then build the
    per-event evaluation queue.
    """
    print("=== FutureBench Data Download and Processing ===")

    # Download datasets (best-effort; per-repo failures are logged inside)
    download_datasets()

    # Process data; returns (None, None) when data.csv is missing
    df, summary = process_data()

    if df is None:
        print("❌ Failed to process data. Exiting.")
        return

    # Generate queue (also written to evaluation_queue.csv as a side effect)
    queue = generate_queue(df)

    print("\n=== Processing Complete ===")
    print("Data processed and queue generated successfully!")
    print(f"Queue contains {len(queue)} events")
|
146 |
+
|
147 |
+
|
148 |
+
if __name__ == "__main__":
|
149 |
+
main()
|
process_data/requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas>=1.5.0
|
2 |
+
huggingface_hub>=0.15.0
|
3 |
+
sqlalchemy
|
4 |
+
psycopg2-binary # For PostgreSQL
|
5 |
+
PyMySQL # For MySQL
|
process_data/run_pipeline.sh
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# Database to HuggingFace Pipeline
# Similar to FutureBench's to_csv.sh and to_benchmark.sh but combined

echo "🚀 Starting Database to HuggingFace Pipeline..."

# Check if HF_TOKEN is set
if [ -z "$HF_TOKEN" ]; then
    echo "⚠️ HF_TOKEN not set. Will save files locally instead of uploading."
    echo " To upload to HuggingFace, set: export HF_TOKEN='your_token_here'"
    echo ""
fi

# Change to project root relative to this script's own location, so the
# pipeline works no matter where it is invoked from (the old `cd ../..`
# only worked when the caller's CWD was already this script's directory).
cd "$(dirname "$0")/../.." || { echo "❌ Could not locate project root."; exit 1; }

# Run the pipeline
python3 leaderboard/process_data/db_to_hf.py

# Check if it was successful
if [ $? -eq 0 ]; then
    echo ""
    echo "✅ Pipeline completed successfully!"
    echo ""
    echo "Next steps:"
    echo "1. Check your HuggingFace repositories for updated data"
    echo "2. Your leaderboard will automatically use the new data"
    echo "3. Consider setting up a cron job to run this regularly"
else
    echo ""
    echo "❌ Pipeline failed. Check the error messages above."
    exit 1
fi
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.0.0
|
2 |
+
pandas>=1.5.0
|
3 |
+
huggingface_hub>=0.15.0
|
4 |
+
apscheduler
|
5 |
+
git+https://github.com/IsThatYou/gradio_rangeslider
|
src/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Simplified leaderboard components for FutureBench
|
3 |
+
"""
|
src/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (230 Bytes). View file
|
|
src/__pycache__/about.cpython-312.pyc
ADDED
Binary file (2.99 kB). View file
|
|
src/__pycache__/display_utils.cpython-312.pyc
ADDED
Binary file (15 kB). View file
|
|
src/__pycache__/leaderboard_utils.cpython-312.pyc
ADDED
Binary file (7.17 kB). View file
|
|
src/about.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from enum import Enum
|
3 |
+
|
4 |
+
|
5 |
+
@dataclass
class Task:
    """One benchmark task tracked by the leaderboard."""

    # key identifying the task in the underlying data
    benchmark: str
    # metric name used for scoring (e.g. "acc")
    metric: str
    # human-readable column header shown in the leaderboard table
    col_name: str
|
10 |
+
|
11 |
+
|
12 |
+
# Define our evaluation tasks
|
13 |
+
# ---------------------------------------------------
|
14 |
+
class Tasks(Enum):
    """Enumeration of the evaluation tasks shown as leaderboard columns."""

    # each member wraps Task(task_key in the data, metric name, display name)
    news = Task("news", "acc", "News")
    polymarket = Task("polymarket", "acc", "PolyMarket")
|
18 |
+
|
19 |
+
|
20 |
+
# Your leaderboard name
|
21 |
+
TITLE = """<h1 align="center" id="space-title" style="font-size: 4.375rem; font-weight: bold; margin-bottom: 1rem;">🔮 FutureBench Leaderboard</h1>"""
|
22 |
+
|
23 |
+
# What does your leaderboard evaluate?
|
24 |
+
INTRODUCTION_TEXT = """<div class="section-card">
|
25 |
+
<h3 class="section-header"><span class="section-icon">🎯</span> About FutureBench</h3>
|
26 |
+
FutureBench is a benchmarking system for evaluating AI models on predicting future events.
|
27 |
+
This leaderboard shows how well different AI models perform at forecasting real-world outcomes
|
28 |
+
across various domains including news events, sports, and prediction markets.
|
29 |
+
<br><br>
|
30 |
+
📝 <a href="https://www.together.ai/blog/futurebench" target="_blank" style="color: #007acc; text-decoration: none;">Read our blog post</a> for more details about FutureBench.
|
31 |
+
</div>"""
|
32 |
+
|
33 |
+
# Additional information about the benchmark
|
34 |
+
ABOUT_TEXT = """
|
35 |
+
<div class="section-card fade-in-up">
|
36 |
+
<h2 class="section-header"><span class="section-icon">⚙️</span> How it works</h2>
|
37 |
+
|
38 |
+
FutureBench evaluates AI models on their ability to predict future events by:
|
39 |
+
|
40 |
+
- **Ingesting real-world events** from multiple sources (news, sports, prediction markets)
|
41 |
+
- **Collecting AI predictions** before events resolve
|
42 |
+
- **Measuring accuracy** once outcomes are known
|
43 |
+
- **Ranking models** based on their predictive performance
|
44 |
+
</div>
|
45 |
+
|
46 |
+
<div class="section-card fade-in-up stagger-1">
|
47 |
+
<h2 class="section-header"><span class="section-icon">📊</span> Event Types</h2>
|
48 |
+
|
49 |
+
- **News Events**: Predictions about political developments, economic changes, and current events
|
50 |
+
- **PolyMarket**: Predictions on various real-world events traded on prediction markets
|
51 |
+
</div>
|
52 |
+
|
53 |
+
<div class="section-card fade-in-up stagger-2">
|
54 |
+
<h2 class="section-header"><span class="section-icon">📈</span> Metrics</h2>
|
55 |
+
|
56 |
+
Models are evaluated using **accuracy** - the percentage of correct predictions made.
|
57 |
+
The **Average** score shows overall performance across all event types.
|
58 |
+
</div>
|
59 |
+
|
60 |
+
<div class="section-card fade-in-up stagger-3">
|
61 |
+
<h2 class="section-header"><span class="section-icon">🔒</span> Data Integrity</h2>
|
62 |
+
|
63 |
+
All predictions are made before events resolve, ensuring fair evaluation.
|
64 |
+
The leaderboard updates as new events are resolved and model performances are calculated.
|
65 |
+
</div>
|
66 |
+
"""
|
src/display_utils.py
ADDED
@@ -0,0 +1,566 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
|
3 |
+
from .about import Tasks
|
4 |
+
|
5 |
+
|
6 |
+
@dataclass(frozen=True)
class ColumnContent:
    """Metadata describing a single leaderboard column."""

    # column header text
    name: str
    # value type label, e.g. "str" or "number"
    type: str
    # whether the column is shown without user opt-in
    displayed_by_default: bool
    # hidden columns are kept in the data but not rendered
    hidden: bool = False
|
12 |
+
|
13 |
+
|
14 |
+
# Define leaderboard columns
|
15 |
+
# Define leaderboard columns
@dataclass(frozen=True)
class LeaderboardColumn:
    """Namespace of the fixed ColumnContent definitions for the leaderboard.

    NOTE(review): the attributes below carry no type annotations, so the
    @dataclass decorator generates no fields from them — they act as plain
    class-level constants. Confirm whether the decorator is intentional.
    """

    model = ColumnContent("Model", "str", True)
    events = ColumnContent("Events", "number", True)
    average = ColumnContent("Average", "number", True)
    # Task-specific columns will be added dynamically

    # Additional model info (hidden by default)
    correct_predictions = ColumnContent("Correct Predictions", "number", False)
|
24 |
+
|
25 |
+
|
26 |
+
# Get column names for display
|
27 |
+
def get_display_columns():
    """Return the ordered column headers shown by default: the fixed base
    columns followed by one column per task in `Tasks`."""
    columns = ["Rank", "Model", "Events", "Average"]
    for task in Tasks:
        columns.append(task.value.col_name)
    return columns
|
32 |
+
|
33 |
+
|
34 |
+
def get_all_columns():
    """Return every leaderboard column name — the visible display columns
    plus the hidden ones."""
    return get_display_columns() + ["Correct Predictions"]
|
39 |
+
|
40 |
+
|
41 |
+
# Formatting helpers
|
42 |
+
def make_clickable_model(model_name):
    """Turn a hub-style ``org/model`` id into an HTML link to its
    HuggingFace page; names without a slash are returned unchanged."""
    if "/" not in model_name:
        return model_name
    url = f"https://huggingface.co/{model_name}"
    return f'<a target="_blank" href="{url}" style="color: var(--link-text-color); text-decoration: underline;">{model_name}</a>'
|
48 |
+
|
49 |
+
|
50 |
+
def format_percentage(value):
    """Render an accuracy value as ``"XX.X%"``; ``"N/A"`` for missing or
    unparseable input."""
    if value is None or value == "N/A":
        return "N/A"
    try:
        numeric = float(value)
    except (ValueError, TypeError):
        return "N/A"
    return f"{numeric:.1f}%"
|
58 |
+
|
59 |
+
|
60 |
+
def has_valid_scores(df, required_columns):
    """Return a boolean Series, True for each row where every required
    column holds a non-null value."""
    subset = df[required_columns]
    return subset.notna().all(axis=1)
|
63 |
+
|
64 |
+
|
65 |
+
# CSS styling
|
66 |
+
CUSTOM_CSS = """
|
67 |
+
/* Global styling */
|
68 |
+
body {
|
69 |
+
background: linear-gradient(135deg, #1e1e2f 0%, #2d2d44 100%) !important;
|
70 |
+
}
|
71 |
+
|
72 |
+
/* Add consistent margins and centering */
|
73 |
+
.gradio-container,
|
74 |
+
.container,
|
75 |
+
.main {
|
76 |
+
margin: 0 auto !important;
|
77 |
+
max-width: 1400px !important;
|
78 |
+
padding: 0 60px !important;
|
79 |
+
}
|
80 |
+
|
81 |
+
.block {
|
82 |
+
margin: 0 auto !important;
|
83 |
+
max-width: 100% !important;
|
84 |
+
}
|
85 |
+
|
86 |
+
.markdown-text {
|
87 |
+
font-size: 18px !important;
|
88 |
+
line-height: 1.6 !important;
|
89 |
+
}
|
90 |
+
|
91 |
+
/* Larger font for introduction text */
|
92 |
+
.section-card {
|
93 |
+
font-size: 22px !important;
|
94 |
+
line-height: 1.7 !important;
|
95 |
+
}
|
96 |
+
|
97 |
+
.section-card p {
|
98 |
+
font-size: 22px !important;
|
99 |
+
line-height: 1.7 !important;
|
100 |
+
}
|
101 |
+
|
102 |
+
.section-card .markdown-text {
|
103 |
+
font-size: 22px !important;
|
104 |
+
line-height: 1.7 !important;
|
105 |
+
}
|
106 |
+
|
107 |
+
/* Header styling */
|
108 |
+
#space-title {
|
109 |
+
text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
|
110 |
+
margin-bottom: 0.5rem !important;
|
111 |
+
}
|
112 |
+
|
113 |
+
.center-logo {
|
114 |
+
display: flex !important;
|
115 |
+
justify-content: center !important;
|
116 |
+
align-items: center !important;
|
117 |
+
margin: 0.25rem 0 0.5rem 0 !important;
|
118 |
+
}
|
119 |
+
|
120 |
+
.center-logo img {
|
121 |
+
width: 200px !important;
|
122 |
+
height: 200px !important;
|
123 |
+
border-radius: 50% !important;
|
124 |
+
overflow: hidden !important;
|
125 |
+
object-fit: cover !important;
|
126 |
+
box-shadow: 0 8px 32px rgba(0,0,0,0.3) !important;
|
127 |
+
border: 3px solid rgba(255,255,255,0.1) !important;
|
128 |
+
}
|
129 |
+
|
130 |
+
/* Tab styling */
|
131 |
+
.tab-nav {
|
132 |
+
margin: 1rem 0 !important;
|
133 |
+
display: flex !important;
|
134 |
+
justify-content: center !important;
|
135 |
+
}
|
136 |
+
|
137 |
+
.tab-buttons {
|
138 |
+
display: flex !important;
|
139 |
+
justify-content: center !important;
|
140 |
+
flex-wrap: wrap !important;
|
141 |
+
gap: 8px !important;
|
142 |
+
}
|
143 |
+
|
144 |
+
.tab-buttons button {
|
145 |
+
font-size: 22px !important;
|
146 |
+
padding: 16px 32px !important;
|
147 |
+
margin: 0 6px !important;
|
148 |
+
border-radius: 8px !important;
|
149 |
+
border: 2px solid transparent !important;
|
150 |
+
background: rgba(255,255,255,0.1) !important;
|
151 |
+
color: white !important;
|
152 |
+
transition: all 0.3s ease !important;
|
153 |
+
}
|
154 |
+
|
155 |
+
.tab-buttons button:hover {
|
156 |
+
background: rgba(255,255,255,0.2) !important;
|
157 |
+
transform: translateY(-2px) !important;
|
158 |
+
}
|
159 |
+
|
160 |
+
.tab-buttons button.selected {
|
161 |
+
background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
|
162 |
+
border-color: #6366f1 !important;
|
163 |
+
box-shadow: 0 4px 12px rgba(99, 102, 241, 0.3) !important;
|
164 |
+
}
|
165 |
+
|
166 |
+
/* Leaderboard table styling */
|
167 |
+
#leaderboard-table {
|
168 |
+
margin: 20px 0 !important;
|
169 |
+
border-radius: 12px !important;
|
170 |
+
overflow: hidden !important;
|
171 |
+
box-shadow: 0 8px 32px rgba(0,0,0,0.2) !important;
|
172 |
+
}
|
173 |
+
|
174 |
+
#leaderboard-table table {
|
175 |
+
border-collapse: separate !important;
|
176 |
+
border-spacing: 0 !important;
|
177 |
+
width: 100% !important;
|
178 |
+
}
|
179 |
+
|
180 |
+
#leaderboard-table th {
|
181 |
+
background: linear-gradient(135deg, #4f46e5, #6366f1) !important;
|
182 |
+
color: white !important;
|
183 |
+
padding: 22px !important;
|
184 |
+
font-weight: 600 !important;
|
185 |
+
text-align: left !important;
|
186 |
+
border: none !important;
|
187 |
+
font-size: 16px !important;
|
188 |
+
}
|
189 |
+
|
190 |
+
#leaderboard-table td {
|
191 |
+
padding: 20px 22px !important;
|
192 |
+
border: none !important;
|
193 |
+
font-size: 16px !important;
|
194 |
+
}
|
195 |
+
|
196 |
+
#leaderboard-table tr:nth-child(even) {
|
197 |
+
background: rgba(255,255,255,0.05) !important;
|
198 |
+
}
|
199 |
+
|
200 |
+
#leaderboard-table tr:hover {
|
201 |
+
background: rgba(99, 102, 241, 0.1) !important;
|
202 |
+
transform: scale(1.01) !important;
|
203 |
+
transition: all 0.2s ease !important;
|
204 |
+
}
|
205 |
+
|
206 |
+
/* Rank column styling */
|
207 |
+
#leaderboard-table td:nth-child(1),
|
208 |
+
#leaderboard-table th:nth-child(1) {
|
209 |
+
text-align: center !important;
|
210 |
+
width: 80px !important;
|
211 |
+
min-width: 80px !important;
|
212 |
+
max-width: 80px !important;
|
213 |
+
font-size: 18px !important;
|
214 |
+
font-weight: 600 !important;
|
215 |
+
}
|
216 |
+
|
217 |
+
/* Model column styling */
|
218 |
+
#leaderboard-table td:nth-child(2),
|
219 |
+
#leaderboard-table th:nth-child(2) {
|
220 |
+
min-width: 180px !important;
|
221 |
+
max-width: 300px !important;
|
222 |
+
overflow: hidden !important;
|
223 |
+
white-space: nowrap !important;
|
224 |
+
text-overflow: ellipsis !important;
|
225 |
+
font-size: 16px !important;
|
226 |
+
}
|
227 |
+
|
228 |
+
/* Events column styling (numeric) */
|
229 |
+
#leaderboard-table td:nth-child(3),
|
230 |
+
#leaderboard-table th:nth-child(3) {
|
231 |
+
text-align: center !important;
|
232 |
+
width: 90px !important;
|
233 |
+
min-width: 90px !important;
|
234 |
+
max-width: 90px !important;
|
235 |
+
font-size: 16px !important;
|
236 |
+
font-weight: 600 !important;
|
237 |
+
}
|
238 |
+
|
239 |
+
/* Average column styling (percentage) */
|
240 |
+
#leaderboard-table td:nth-child(4),
|
241 |
+
#leaderboard-table th:nth-child(4) {
|
242 |
+
text-align: center !important;
|
243 |
+
width: 110px !important;
|
244 |
+
min-width: 110px !important;
|
245 |
+
max-width: 110px !important;
|
246 |
+
font-size: 17px !important;
|
247 |
+
font-weight: 700 !important;
|
248 |
+
color: #10b981 !important;
|
249 |
+
}
|
250 |
+
|
251 |
+
/* Task-specific columns (News, PolyMarket) - compact percentage columns */
|
252 |
+
#leaderboard-table td:nth-child(n+5),
|
253 |
+
#leaderboard-table th:nth-child(n+5) {
|
254 |
+
text-align: center !important;
|
255 |
+
width: 100px !important;
|
256 |
+
min-width: 100px !important;
|
257 |
+
max-width: 100px !important;
|
258 |
+
font-size: 16px !important;
|
259 |
+
font-weight: 600 !important;
|
260 |
+
}
|
261 |
+
|
262 |
+
/* Dropdown styling */
|
263 |
+
.dropdown {
|
264 |
+
margin: 20px 0 !important;
|
265 |
+
width: 100% !important;
|
266 |
+
}
|
267 |
+
|
268 |
+
.dropdown select {
|
269 |
+
background: rgba(255,255,255,0.1) !important;
|
270 |
+
border: 2px solid rgba(255,255,255,0.2) !important;
|
271 |
+
border-radius: 8px !important;
|
272 |
+
padding: 12px 18px !important;
|
273 |
+
color: white !important;
|
274 |
+
font-size: 16px !important;
|
275 |
+
width: 100% !important;
|
276 |
+
max-width: 300px !important;
|
277 |
+
}
|
278 |
+
|
279 |
+
/* Button styling */
|
280 |
+
#refresh-button, .refresh-btn {
|
281 |
+
background: linear-gradient(135deg, #10b981, #059669) !important;
|
282 |
+
color: white !important;
|
283 |
+
border: none !important;
|
284 |
+
padding: 14px 28px !important;
|
285 |
+
border-radius: 8px !important;
|
286 |
+
cursor: pointer !important;
|
287 |
+
font-size: 18px !important;
|
288 |
+
font-weight: 500 !important;
|
289 |
+
transition: all 0.3s ease !important;
|
290 |
+
box-shadow: 0 4px 12px rgba(16, 185, 129, 0.3) !important;
|
291 |
+
}
|
292 |
+
|
293 |
+
#refresh-button:hover, .refresh-btn:hover {
|
294 |
+
background: linear-gradient(135deg, #059669, #047857) !important;
|
295 |
+
transform: translateY(-2px) !important;
|
296 |
+
box-shadow: 0 6px 16px rgba(16, 185, 129, 0.4) !important;
|
297 |
+
}
|
298 |
+
|
299 |
+
/* Cards and sections */
|
300 |
+
.section-card {
|
301 |
+
background: rgba(255,255,255,0.05) !important;
|
302 |
+
border-radius: 12px !important;
|
303 |
+
padding: 25px !important;
|
304 |
+
margin: 15px 0 !important;
|
305 |
+
border: 1px solid rgba(255,255,255,0.1) !important;
|
306 |
+
box-shadow: 0 4px 16px rgba(0,0,0,0.1) !important;
|
307 |
+
max-width: 100% !important;
|
308 |
+
}
|
309 |
+
|
310 |
+
/* Metrics and stats */
|
311 |
+
.metric-highlight {
|
312 |
+
color: #10b981 !important;
|
313 |
+
font-weight: 600 !important;
|
314 |
+
}
|
315 |
+
|
316 |
+
.model-rank-1 {
|
317 |
+
background: linear-gradient(135deg, #fbbf24, #f59e0b) !important;
|
318 |
+
color: #1f2937 !important;
|
319 |
+
font-weight: 600 !important;
|
320 |
+
}
|
321 |
+
|
322 |
+
.model-rank-2 {
|
323 |
+
background: linear-gradient(135deg, #e5e7eb, #d1d5db) !important;
|
324 |
+
color: #1f2937 !important;
|
325 |
+
font-weight: 600 !important;
|
326 |
+
}
|
327 |
+
|
328 |
+
.model-rank-3 {
|
329 |
+
background: linear-gradient(135deg, #cd7c2f, #a16207) !important;
|
330 |
+
color: white !important;
|
331 |
+
font-weight: 600 !important;
|
332 |
+
}
|
333 |
+
|
334 |
+
/* Performance badges */
|
335 |
+
.rank-badge {
|
336 |
+
display: inline-block !important;
|
337 |
+
padding: 4px 8px !important;
|
338 |
+
border-radius: 20px !important;
|
339 |
+
font-size: 10px !important;
|
340 |
+
font-weight: 600 !important;
|
341 |
+
margin-right: 8px !important;
|
342 |
+
}
|
343 |
+
|
344 |
+
.rank-1 .rank-badge {
|
345 |
+
background: linear-gradient(135deg, #fbbf24, #f59e0b) !important;
|
346 |
+
color: #1f2937 !important;
|
347 |
+
}
|
348 |
+
|
349 |
+
.rank-2 .rank-badge {
|
350 |
+
background: linear-gradient(135deg, #e5e7eb, #d1d5db) !important;
|
351 |
+
color: #1f2937 !important;
|
352 |
+
}
|
353 |
+
|
354 |
+
.rank-3 .rank-badge {
|
355 |
+
background: linear-gradient(135deg, #cd7c2f, #a16207) !important;
|
356 |
+
color: white !important;
|
357 |
+
}
|
358 |
+
|
359 |
+
/* Progress bars for accuracy */
|
360 |
+
.accuracy-bar {
|
361 |
+
width: 100% !important;
|
362 |
+
height: 6px !important;
|
363 |
+
background: rgba(255,255,255,0.1) !important;
|
364 |
+
border-radius: 3px !important;
|
365 |
+
margin-top: 4px !important;
|
366 |
+
overflow: hidden !important;
|
367 |
+
}
|
368 |
+
|
369 |
+
.accuracy-progress {
|
370 |
+
height: 100% !important;
|
371 |
+
background: linear-gradient(90deg, #10b981, #059669) !important;
|
372 |
+
border-radius: 3px !important;
|
373 |
+
transition: width 0.8s ease !important;
|
374 |
+
}
|
375 |
+
|
376 |
+
/* Enhanced summary section */
|
377 |
+
.summary-stats {
|
378 |
+
display: grid !important;
|
379 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)) !important;
|
380 |
+
gap: 20px !important;
|
381 |
+
margin: 20px 0 !important;
|
382 |
+
}
|
383 |
+
|
384 |
+
.stat-card {
|
385 |
+
background: rgba(255,255,255,0.08) !important;
|
386 |
+
border-radius: 12px !important;
|
387 |
+
padding: 20px !important;
|
388 |
+
border: 1px solid rgba(255,255,255,0.1) !important;
|
389 |
+
text-align: center !important;
|
390 |
+
transition: transform 0.3s ease !important;
|
391 |
+
}
|
392 |
+
|
393 |
+
.stat-card:hover {
|
394 |
+
transform: translateY(-4px) !important;
|
395 |
+
}
|
396 |
+
|
397 |
+
.stat-value {
|
398 |
+
font-size: 1.875rem !important;
|
399 |
+
font-weight: 700 !important;
|
400 |
+
color: #10b981 !important;
|
401 |
+
margin-bottom: 8px !important;
|
402 |
+
}
|
403 |
+
|
404 |
+
.stat-label {
|
405 |
+
font-size: 0.775rem !important;
|
406 |
+
color: rgba(255,255,255,0.7) !important;
|
407 |
+
text-transform: uppercase !important;
|
408 |
+
letter-spacing: 0.5px !important;
|
409 |
+
}
|
410 |
+
|
411 |
+
/* Better section headers */
|
412 |
+
.section-header {
|
413 |
+
display: flex !important;
|
414 |
+
align-items: center !important;
|
415 |
+
gap: 12px !important;
|
416 |
+
margin: 0 0 15px 0 !important;
|
417 |
+
font-size: 1.675rem !important;
|
418 |
+
font-weight: 600 !important;
|
419 |
+
}
|
420 |
+
|
421 |
+
.section-icon {
|
422 |
+
font-size: 1.375rem !important;
|
423 |
+
}
|
424 |
+
|
425 |
+
/* Improved table styling */
|
426 |
+
#leaderboard-table tr:first-child td:first-child {
|
427 |
+
position: relative !important;
|
428 |
+
}
|
429 |
+
|
430 |
+
#leaderboard-table tr:nth-child(1) {
|
431 |
+
background: rgba(251, 191, 36, 0.1) !important;
|
432 |
+
}
|
433 |
+
|
434 |
+
#leaderboard-table tr:nth-child(2) {
|
435 |
+
background: rgba(229, 231, 235, 0.1) !important;
|
436 |
+
}
|
437 |
+
|
438 |
+
#leaderboard-table tr:nth-child(3) {
|
439 |
+
background: rgba(205, 124, 47, 0.1) !important;
|
440 |
+
}
|
441 |
+
|
442 |
+
/* Loading animations */
|
443 |
+
@keyframes fadeInUp {
|
444 |
+
from {
|
445 |
+
opacity: 0;
|
446 |
+
transform: translateY(20px);
|
447 |
+
}
|
448 |
+
to {
|
449 |
+
opacity: 1;
|
450 |
+
transform: translateY(0);
|
451 |
+
}
|
452 |
+
}
|
453 |
+
|
454 |
+
.fade-in-up {
|
455 |
+
animation: fadeInUp 0.6s ease-out !important;
|
456 |
+
}
|
457 |
+
|
458 |
+
/* Staggered animations */
|
459 |
+
.stagger-1 { animation-delay: 0.1s !important; }
|
460 |
+
.stagger-2 { animation-delay: 0.2s !important; }
|
461 |
+
.stagger-3 { animation-delay: 0.3s !important; }
|
462 |
+
.stagger-4 { animation-delay: 0.4s !important; }
|
463 |
+
|
464 |
+
/* Enhanced buttons */
|
465 |
+
.icon-button {
|
466 |
+
display: inline-flex !important;
|
467 |
+
align-items: center !important;
|
468 |
+
gap: 8px !important;
|
469 |
+
}
|
470 |
+
|
471 |
+
.icon-button::before {
|
472 |
+
font-size: 1.0em !important;
|
473 |
+
}
|
474 |
+
|
475 |
+
/* Improved markdown styling */
|
476 |
+
.markdown-text h1 {
|
477 |
+
color: #10b981 !important;
|
478 |
+
border-bottom: 2px solid rgba(16, 185, 129, 0.3) !important;
|
479 |
+
padding-bottom: 8px !important;
|
480 |
+
}
|
481 |
+
|
482 |
+
.markdown-text h2 {
|
483 |
+
color: #6366f1 !important;
|
484 |
+
margin-top: 2rem !important;
|
485 |
+
}
|
486 |
+
|
487 |
+
.markdown-text h3 {
|
488 |
+
color: #8b5cf6 !important;
|
489 |
+
}
|
490 |
+
|
491 |
+
.markdown-text ul {
|
492 |
+
padding-left: 20px !important;
|
493 |
+
}
|
494 |
+
|
495 |
+
.markdown-text li {
|
496 |
+
margin: 8px 0 !important;
|
497 |
+
list-style-type: none !important;
|
498 |
+
position: relative !important;
|
499 |
+
}
|
500 |
+
|
501 |
+
.markdown-text li::before {
|
502 |
+
content: "▸" !important;
|
503 |
+
color: #10b981 !important;
|
504 |
+
position: absolute !important;
|
505 |
+
left: -16px !important;
|
506 |
+
font-weight: bold !important;
|
507 |
+
}
|
508 |
+
|
509 |
+
/* Responsive design */
|
510 |
+
@media (max-width: 768px) {
|
511 |
+
/* Adjust container margins for mobile */
|
512 |
+
.gradio-container,
|
513 |
+
.container,
|
514 |
+
.main {
|
515 |
+
padding: 0 30px !important;
|
516 |
+
}
|
517 |
+
|
518 |
+
#space-title {
|
519 |
+
font-size: 2.375rem !important;
|
520 |
+
}
|
521 |
+
|
522 |
+
.center-logo img {
|
523 |
+
width: 150px !important;
|
524 |
+
height: 150px !important;
|
525 |
+
}
|
526 |
+
|
527 |
+
.tab-buttons button {
|
528 |
+
font-size: 18px !important;
|
529 |
+
padding: 14px 24px !important;
|
530 |
+
}
|
531 |
+
|
532 |
+
.summary-stats {
|
533 |
+
grid-template-columns: 1fr !important;
|
534 |
+
}
|
535 |
+
|
536 |
+
.stat-value {
|
537 |
+
font-size: 1.375rem !important;
|
538 |
+
}
|
539 |
+
|
540 |
+
/* Maintain readable font sizes on mobile */
|
541 |
+
#leaderboard-table th {
|
542 |
+
font-size: 14px !important;
|
543 |
+
padding: 16px 12px !important;
|
544 |
+
}
|
545 |
+
|
546 |
+
#leaderboard-table td {
|
547 |
+
font-size: 14px !important;
|
548 |
+
padding: 16px 12px !important;
|
549 |
+
}
|
550 |
+
|
551 |
+
/* Adjust column widths for mobile */
|
552 |
+
#leaderboard-table td:nth-child(1),
|
553 |
+
#leaderboard-table th:nth-child(1) {
|
554 |
+
width: 60px !important;
|
555 |
+
min-width: 60px !important;
|
556 |
+
max-width: 60px !important;
|
557 |
+
}
|
558 |
+
|
559 |
+
#leaderboard-table td:nth-child(n+5),
|
560 |
+
#leaderboard-table th:nth-child(n+5) {
|
561 |
+
width: 90px !important;
|
562 |
+
min-width: 90px !important;
|
563 |
+
max-width: 90px !important;
|
564 |
+
}
|
565 |
+
}
|
566 |
+
"""
|
src/leaderboard_utils.py
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import timedelta
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from .about import Tasks
|
6 |
+
from .display_utils import format_percentage, make_clickable_model
|
7 |
+
|
8 |
+
|
9 |
+
def clean_model_name(model_name: str) -> str:
    """Map internal algorithm identifiers to human-friendly display names.

    Known machine prefixes are swapped for readable labels; any other
    name is returned unchanged.
    """
    # (internal prefix, display label) pairs, checked in order.
    prefix_labels = (
        ("smolagents-tavily-web-visit-", "Agent Baseline "),
        ("language-model-", "Language Model "),
    )
    for prefix, label in prefix_labels:
        if model_name.startswith(prefix):
            return label + model_name.removeprefix(prefix)
    return model_name
|
16 |
+
|
17 |
+
|
18 |
+
def get_available_weeks(predictions_df):
    """Return week-filter options derived from the prediction data.

    Each option is a ``(label, (monday, sunday))`` tuple covering one
    Monday-to-Sunday week that contains at least one betting deadline,
    sorted chronologically, preceded by an ``("All Time", None)`` entry.
    Returns an empty list when there is no data.
    """
    if predictions_df is None or predictions_df.empty:
        return []

    # Drop null timestamps (NaT) up front: NaT has no usable weekday(),
    # so the Monday computation below would raise on unresolved rows.
    dates = predictions_df["open_to_bet_until"].dropna().dt.date.unique()
    weeks = {}

    for date in dates:
        # Snap the date back to the Monday of its week.
        monday = date - timedelta(days=date.weekday())
        week_end = monday + timedelta(days=6)
        week_key = f"{monday} to {week_end}"
        weeks[week_key] = (monday, week_end)

    # Sort the week options chronologically by their start date.
    sorted_weeks = sorted(weeks.items(), key=lambda x: x[1][0])

    return [("All Time", None)] + sorted_weeks
|
39 |
+
|
40 |
+
|
41 |
+
def filter_data_by_week(predictions_df, week_range):
    """Restrict *predictions_df* to rows whose betting deadline falls in *week_range*.

    ``week_range`` is a ``(start_date, end_date)`` pair (inclusive on both
    ends). A ``None`` range, ``None`` frame, or empty frame is passed
    through unchanged.
    """
    if predictions_df is None or predictions_df.empty or week_range is None:
        return predictions_df

    start_date, end_date = week_range

    # Keep rows whose open_to_bet_until date lies inside the week window.
    deadline = predictions_df["open_to_bet_until"].dt.date
    in_week = (deadline >= start_date) & (deadline <= end_date)

    return predictions_df[in_week]
|
52 |
+
|
53 |
+
|
54 |
+
def create_leaderboard_df(predictions_df, week_filter=None):
    """Build the formatted leaderboard table from raw prediction records.

    Aggregates per-algorithm accuracy on resolved events (rows where
    ``result`` is non-null), one column per task, plus an ``Average``
    across the tasks the model actually attempted. The returned frame is
    sorted by average accuracy, given a medal/number ``Rank`` column, and
    has its percentage columns rendered as display strings.
    Returns an empty frame when there is no (filtered) data.
    """
    if predictions_df is None or predictions_df.empty:
        return pd.DataFrame()

    # Optionally narrow the data down to a single week.
    if week_filter is not None:
        predictions_df = filter_data_by_week(predictions_df, week_filter)

    if predictions_df.empty:
        return pd.DataFrame()

    rows = []

    # One leaderboard row per algorithm.
    for algorithm in predictions_df["algorithm_name"].unique():
        algo_data = predictions_df[predictions_df["algorithm_name"] == algorithm]

        # Only resolved events (non-null result) can be scored.
        resolved = algo_data[algo_data["result"].notna()]
        if resolved.empty:
            continue

        display_name = make_clickable_model(clean_model_name(algorithm))
        row = {"Model": display_name, "Events": len(resolved), "Correct Predictions": 0}
        per_task_accuracy = []

        for task in Tasks:
            task_rows = resolved[resolved["event_type"] == task.value.benchmark]

            if task_rows.empty:
                # Model made no predictions for this task.
                row[task.value.col_name] = None
                continue

            # Score by case-insensitive, whitespace-trimmed string match.
            # Could be enhanced for more complex prediction formats.
            matches = sum(
                str(pred).lower().strip() == str(actual).lower().strip()
                for pred, actual in zip(task_rows["actual_prediction"], task_rows["result"])
            )

            accuracy = (matches / len(task_rows)) * 100
            row[task.value.col_name] = accuracy
            per_task_accuracy.append(accuracy)
            row["Correct Predictions"] += matches

        # Average only over tasks where the model made predictions.
        row["Average"] = sum(per_task_accuracy) / len(per_task_accuracy) if per_task_accuracy else 0
        rows.append(row)

    df = pd.DataFrame(rows)

    # Best average accuracy first.
    if "Average" in df.columns:
        df = df.sort_values("Average", ascending=False)
    df = df.reset_index(drop=True)

    # Medals for the podium, plain numbers for everyone else.
    medals = {0: "🥇", 1: "🥈", 2: "🥉"}
    ranks = [medals.get(i, f"#{i + 1}") for i in range(len(df))]
    df.insert(0, "Rank", ranks)

    # Render accuracy columns as percentage strings for display.
    for task in Tasks:
        if task.value.col_name in df.columns:
            df[task.value.col_name] = df[task.value.col_name].apply(format_percentage)

    if "Average" in df.columns:
        df["Average"] = df["Average"].apply(format_percentage)

    return df
|
157 |
+
|
158 |
+
|
159 |
+
def get_leaderboard_summary(df):
    """Compute headline stats for a formatted leaderboard frame.

    Returns a dict with ``total_models``, ``total_predictions`` (sum of
    the ``Events`` column), and ``avg_accuracy`` (mean of the parseable
    ``Average`` percentage strings, ignoring "N/A" and malformed values).
    """
    if df is None or df.empty:
        return {"total_models": 0, "total_predictions": 0, "avg_accuracy": 0}

    summary = {
        "total_models": len(df),
        "total_predictions": df["Events"].sum() if "Events" in df.columns else 0,
        "avg_accuracy": 0,
    }

    if "Average" in df.columns:
        # Parse the display strings (e.g. "85.0%") back into floats.
        scores = []
        for value in df["Average"]:
            if value == "N/A":
                continue
            try:
                scores.append(float(value.replace("%", "")))
            except Exception:
                continue

        if scores:
            summary["avg_accuracy"] = sum(scores) / len(scores)

    return summary
|
183 |
+
|
184 |
+
|
185 |
+
def filter_leaderboard(df, min_predictions=0):
    """Drop models with fewer than *min_predictions* scored events.

    Frames without an ``Events`` column (and ``None``/empty frames) are
    returned unchanged.
    """
    if df is None or df.empty:
        return df
    if "Events" not in df.columns:
        return df
    return df[df["Events"] >= min_predictions]
|