vinid committed on
Commit 6441bc6 · 0 Parent(s)

Leaderboard deployment 2025-07-16 18:05:41

README.md ADDED
@@ -0,0 +1,69 @@
+ ---
+ title: FutureBench Leaderboard
+ emoji: 🔮
+ colorFrom: blue
+ colorTo: purple
+ sdk: gradio
+ sdk_version: 4.44.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ # FutureBench Leaderboard App
+
+ A minimal Gradio application for viewing FutureBench prediction data. The app downloads datasets from HuggingFace on startup and provides a web interface to explore the data.
+
+ ## Features
+
+ - 📊 **Data Summary**: View dataset statistics and information
+ - 🔍 **Sample Data**: Browse sample prediction records
+ - 📋 **About**: Learn about the FutureBench system
+ - 🔄 **Auto-refresh**: Download the latest data on startup
+ - 📅 **Date Range Slider**: Filter the leaderboard by a custom date span
+
+ ## Setup
+
+ 1. Install dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 2. (Optional) Set your HuggingFace token for private repositories:
+ ```bash
+ export HF_TOKEN=your_token_here
+ ```
+
+ ## Running the App
+
+ Launch the Gradio application:
+
+ ```bash
+ python app.py
+ ```
+
+ The app will:
+ 1. Download datasets from HuggingFace repositories on startup
+ 2. Process the data and create summaries
+ 3. Launch a web interface at `http://localhost:7860`
+
+ ## Data Sources
+
+ The app downloads data from these HuggingFace repositories (a download sketch follows this list):
+ - `futurebench/requests` - Evaluation queue
+ - `futurebench/results` - Evaluation results
+ - `futurebench/data` - Main prediction dataset
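+
+ For reference, a minimal sketch of how one of these repositories is fetched; it mirrors the `snapshot_download` call in `app.py`, and the local directory here is illustrative:
+
+ ```python
+ import os
+
+ from huggingface_hub import snapshot_download
+
+ # Fetch the main prediction dataset into a local cache folder.
+ snapshot_download(
+     repo_id="futurebench/data",        # one of the dataset repos listed above
+     local_dir="eval-data",             # illustrative path; app.py derives it from HF_HOME
+     repo_type="dataset",
+     token=os.environ.get("HF_TOKEN"),  # optional read token for private repos
+ )
+ ```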
+
+ ## Structure
+
+ - `app.py` - Main Gradio application
+ - `process_data/` - Data processing utilities
+ - `src/` - Leaderboard, display, and about-page utilities
+ - `requirements.txt` - Python dependencies
+ - `README.md` - This file
+
+ ## Next Steps
+
+ This is a minimal version focusing on data download and display. Future enhancements will include:
+ - Full leaderboard with model rankings
+ - Interactive filtering and sorting
+ - Detailed performance metrics
+ - Model comparison tools
app.py ADDED
@@ -0,0 +1,266 @@
+ import os
+
+ import gradio as gr
+ import pandas as pd
+ from apscheduler.schedulers.background import BackgroundScheduler
+ from gradio_rangeslider import RangeSlider
+ from huggingface_hub import snapshot_download
+
+ # Import our data processing utilities
+ from process_data import API, DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+
+ # Import our leaderboard components
+ from src.about import ABOUT_TEXT, INTRODUCTION_TEXT, TITLE
+ from src.display_utils import CUSTOM_CSS, get_display_columns
+ from src.leaderboard_utils import create_leaderboard_df, get_available_weeks, get_leaderboard_summary
+
+ # Global variables for data
+ PREDICTIONS_DF = None
+ LEADERBOARD_DF = None
+ PREDICTION_DATES = []
+ AVAILABLE_WEEKS = []
+ DATA_SUMMARY = {}
+
+
+ def restart_space():
+     """Restart the space if needed"""
+     API.restart_space(repo_id=REPO_ID)
+
+
+ def download_and_process_data():
+     """Download and process data on startup"""
+     global PREDICTIONS_DF, LEADERBOARD_DF, PREDICTION_DATES, AVAILABLE_WEEKS, DATA_SUMMARY
+
+     print("=== Starting Data Download ===")
+
+     # Download eval requests (queue)
+     try:
+         print(f"Downloading eval requests to {EVAL_REQUESTS_PATH}")
+         snapshot_download(
+             repo_id=QUEUE_REPO,
+             local_dir=EVAL_REQUESTS_PATH,
+             repo_type="dataset",
+             tqdm_class=None,
+             etag_timeout=30,
+             token=TOKEN,
+         )
+         print("✓ Eval requests downloaded successfully")
+     except Exception as e:
+         print(f"Error downloading eval requests: {e}")
+
+     # Download eval results
+     try:
+         print(f"Downloading eval results to {EVAL_RESULTS_PATH}")
+         snapshot_download(
+             repo_id=RESULTS_REPO,
+             local_dir=EVAL_RESULTS_PATH,
+             repo_type="dataset",
+             tqdm_class=None,
+             etag_timeout=30,
+             token=TOKEN,
+         )
+         print("✓ Eval results downloaded successfully")
+     except Exception as e:
+         print(f"Error downloading eval results: {e}")
+
+     # Download prediction data (main dataset)
+     try:
+         print(f"Downloading prediction data to {PREDICTIONS_CSV_PATH}")
+         snapshot_download(
+             repo_id=DATA_REPO,
+             local_dir=PREDICTIONS_CSV_PATH,
+             repo_type="dataset",
+             tqdm_class=None,
+             etag_timeout=30,
+             token=TOKEN,
+         )
+         print("✓ Prediction data downloaded successfully")
+     except Exception as e:
+         print(f"Error downloading prediction data: {e}")
+
+     # Process the data
+     print("=== Processing Data ===")
+
+     # Load the main dataset
+     csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
+     if os.path.exists(csv_path):
+         print(f"Loading data from {csv_path}")
+         PREDICTIONS_DF = pd.read_csv(csv_path)
+
+         # Convert date columns
+         PREDICTIONS_DF["open_to_bet_until"] = pd.to_datetime(PREDICTIONS_DF["open_to_bet_until"])
+         PREDICTIONS_DF["prediction_created_at"] = pd.to_datetime(PREDICTIONS_DF["prediction_created_at"])
+
+         # Get prediction dates
+         PREDICTION_DATES = sorted(PREDICTIONS_DF["open_to_bet_until"].dt.date.unique())
+
+         # Get available weeks for filtering
+         AVAILABLE_WEEKS = get_available_weeks(PREDICTIONS_DF)
+
+         # Create leaderboard
+         print("Creating leaderboard...")
+         LEADERBOARD_DF = create_leaderboard_df(PREDICTIONS_DF)
+
+         # Create data summary
+         leaderboard_summary = get_leaderboard_summary(LEADERBOARD_DF)
+         DATA_SUMMARY = {
+             "total_records": len(PREDICTIONS_DF),
+             "unique_events": PREDICTIONS_DF["event_id"].nunique(),
+             "unique_algorithms": PREDICTIONS_DF["algorithm_name"].nunique(),
+             "unique_event_types": PREDICTIONS_DF["event_type"].nunique(),
+             "date_range": f"{PREDICTION_DATES[0]} to {PREDICTION_DATES[-1]}" if PREDICTION_DATES else "N/A",
+             "algorithms": PREDICTIONS_DF["algorithm_name"].unique().tolist(),
+             "event_types": PREDICTIONS_DF["event_type"].unique().tolist(),
+             "leaderboard_summary": leaderboard_summary,
+         }
+
+         print("✓ Data processed successfully")
+         print(f" - Total records: {DATA_SUMMARY['total_records']}")
+         print(f" - Unique events: {DATA_SUMMARY['unique_events']}")
+         print(f" - Unique algorithms: {DATA_SUMMARY['unique_algorithms']}")
+         print(f" - Leaderboard models: {leaderboard_summary['total_models']}")
+         print(f" - Date range: {DATA_SUMMARY['date_range']}")
+
+     else:
+         print(f"❌ Error: data.csv not found at {csv_path}")
+         PREDICTIONS_DF = pd.DataFrame()
+         LEADERBOARD_DF = pd.DataFrame()
+         DATA_SUMMARY = {"error": "No data found"}
+
+
+ def get_leaderboard(date_range=None):
+     """Return leaderboard filtered by date range"""
+     if PREDICTIONS_DF is None or PREDICTIONS_DF.empty:
+         return pd.DataFrame({"message": ["No data available"]})
+
+     # Determine range of dates to filter by
+     if not PREDICTION_DATES:
+         return pd.DataFrame({"message": ["No dates available"]})
+
+     if date_range is None:
+         start_idx, end_idx = 0, len(PREDICTION_DATES) - 1
+     else:
+         start_idx, end_idx = date_range
+         start_idx = max(0, min(start_idx, len(PREDICTION_DATES) - 1))
+         end_idx = max(start_idx, min(end_idx, len(PREDICTION_DATES) - 1))
+     start_idx, end_idx = int(start_idx), int(end_idx)
+
+     week_range = (PREDICTION_DATES[start_idx], PREDICTION_DATES[end_idx])
+
+     # Create filtered leaderboard
+     filtered_leaderboard = create_leaderboard_df(PREDICTIONS_DF, week_range)
+
+     if filtered_leaderboard.empty:
+         return pd.DataFrame({"message": ["No data available for selected week"]})
+
+     # Return only display columns
+     display_cols = get_display_columns()
+     available_cols = [col for col in display_cols if col in filtered_leaderboard.columns]
+
+     return filtered_leaderboard[available_cols]
+
+
+ def get_data_summary():
+     """Return formatted data summary"""
+     if not DATA_SUMMARY:
+         return "No data loaded"
+
+     if "error" in DATA_SUMMARY:
+         return f"Error: {DATA_SUMMARY['error']}"
+
+     summary = DATA_SUMMARY.get("leaderboard_summary", {})
+
+     summary_text = f"""
+ # 🏆 Leaderboard Summary
+
+ - **Models Ranked**: {summary.get("total_models", 0)}
+ - **Total Predictions**: {summary.get("total_predictions", 0):,}
+ - **Average Accuracy**: {summary.get("avg_accuracy", 0):.1f}%
+
+ # 📊 Dataset Overview
+
+ - **Total Records**: {DATA_SUMMARY["total_records"]:,}
+ - **Unique Events**: {DATA_SUMMARY["unique_events"]:,}
+ - **Event Types**: {DATA_SUMMARY["unique_event_types"]}
+ - **Date Range**: {DATA_SUMMARY["date_range"]}
+
+ ## 🤖 Models
+ {", ".join(DATA_SUMMARY["algorithms"])}
+
+ ## 📋 Event Types
+ {", ".join(DATA_SUMMARY["event_types"])}
+ """
+
+     return summary_text
+
+
+ def get_sample_data():
+     """Return sample of the data"""
+     if PREDICTIONS_DF is None or PREDICTIONS_DF.empty:
+         return pd.DataFrame({"message": ["No data available"]})
+
+     # Return first 10 rows with key columns
+     sample_cols = ["event_id", "question", "event_type", "algorithm_name", "actual_prediction", "result", "open_to_bet_until"]
+     available_cols = [col for col in sample_cols if col in PREDICTIONS_DF.columns]
+
+     return PREDICTIONS_DF[available_cols].head(10)
+
+
+ def refresh_all_data(date_range=None):
+     """Refresh all data and return updated components"""
+     download_and_process_data()
+     return (
+         get_leaderboard(date_range),
+         get_data_summary(),
+         get_sample_data(),
+     )
+
+
+ # Download and process data on startup
+ download_and_process_data()
+
+ # Create Gradio interface
+ with gr.Blocks(css=CUSTOM_CSS, title="FutureBench Leaderboard") as demo:
+     gr.HTML(TITLE)
+     with gr.Row():
+         gr.Image("image/image.png", height=200, width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, container=False, elem_classes="center-logo")
+     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+     with gr.Tabs():
+         with gr.TabItem("🏆 Leaderboard"):
+             leaderboard_display = gr.Dataframe(value=get_leaderboard(), interactive=False, wrap=True, elem_id="leaderboard-table")
+
+             with gr.Row():
+                 # Clamp the upper bound so an empty date list cannot produce a negative slider maximum
+                 max_date_idx = max(len(PREDICTION_DATES) - 1, 0)
+                 date_slider = RangeSlider(
+                     minimum=0,
+                     maximum=max_date_idx,
+                     value=(0, max_date_idx),
+                     step=1,
+                     label="📅 Date Range",
+                     show_label=True,
+                     labels=[str(d) for d in PREDICTION_DATES],
+                 )
+
+             # Update leaderboard when the date range is changed
+             date_slider.change(get_leaderboard, inputs=date_slider, outputs=leaderboard_display)
+
+         with gr.TabItem("📊 Summary"):
+             summary_display = gr.Markdown(get_data_summary(), elem_classes="markdown-text")
+             refresh_summary_btn = gr.Button("🔄 Refresh Summary")
+
+             refresh_summary_btn.click(get_data_summary, outputs=summary_display)
+
+         with gr.TabItem("🔍 Sample Data"):
+             sample_display = gr.Dataframe(value=get_sample_data(), interactive=False, wrap=True)
+             refresh_sample_btn = gr.Button("🔄 Refresh Sample")
+
+             refresh_sample_btn.click(get_sample_data, outputs=sample_display)
+
+         with gr.TabItem("📋 About"):
+             gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+
+ if __name__ == "__main__":
+     scheduler = BackgroundScheduler()
+     # Restart the Space every 30 minutes so fresh data is downloaded on startup
+     scheduler.add_job(restart_space, "interval", seconds=1800)
+     scheduler.start()
+     demo.queue(default_concurrency_limit=40).launch()
image/image.png ADDED
logo.png ADDED
process_data/README.md ADDED
@@ -0,0 +1,94 @@
+ # FutureBench Dataset Processing
+
+ This directory contains tools for processing FutureBench datasets, both downloading from HuggingFace and transforming your own database into the standard format.
+
+ ## Option 1: Download from HuggingFace (Original)
+
+ Use this to download the existing FutureBench dataset. Because `download_data.py` uses package-relative imports, run it as a module from the repository root:
+
+ ```bash
+ python -m process_data.download_data
+ ```
+
+ ## Option 2: Transform Your Own Database
+
+ Use this to transform your production database into HuggingFace format:
+
+ ### Setup
+
+ 1. **Install dependencies:**
+ ```bash
+ pip install pandas sqlalchemy huggingface_hub
+ ```
+
+ 2. **Set up HuggingFace token:**
+ ```bash
+ export HF_TOKEN="your_huggingface_token_here"
+ ```
+
+ 3. **Configure your settings:**
+ Edit `config_db.py` to match your needs:
+ - Update `HF_CONFIG` with your HuggingFace repository names
+ - Adjust `PROCESSING_CONFIG` for data filtering preferences
+ - Note: The database connection uses the same setup as the main FutureBench app
+
+ ### Usage
+
+ ```bash
+ # Transform your database and upload to HuggingFace
+ python db_to_hf.py
+
+ # Or run locally without uploading
+ HF_TOKEN="" python db_to_hf.py
+ ```
+
+ ### Database Schema
+
+ The script uses the same database schema as the main FutureBench application:
+ - `EventBase` model for events
+ - `Prediction` model for predictions
+ - Uses SQLAlchemy ORM (same as `convert_to_csv.py`)
+
+ No additional database configuration is needed - it uses the existing FutureBench database connection.
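+
+ For illustration, a condensed sketch of the extraction step in `db_to_hf.py` (the model and session names come from the FutureBench app; error handling and column mapping are omitted here):
+
+ ```python
+ from future_bench.database import get_session
+ from future_bench.models import EventBase, Prediction
+
+ session = next(get_session())
+ try:
+     # Only resolved events (result is not None) end up in the public dataset
+     events = [e for e in session.query(EventBase).all() if e.result is not None]
+     predictions = session.query(Prediction).all()
+ finally:
+     session.close()
+ ```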
+
+ ### Output Format
+
+ The script produces data in the same format as the original FutureBench dataset:
+ - `event_id`, `question`, `event_type`, `algorithm_name`, `actual_prediction`, `result`, `open_to_bet_until`, `prediction_created_at`
+
+ ### Automation
+
+ You can run this as a scheduled job:
+
+ ```bash
+ # Add to crontab to run daily at 2 AM
+ 0 2 * * * cd /path/to/your/project && python leaderboard/process_data/db_to_hf.py
+ ```
+
+ ## Files
+
+ - `download_data.py` - Downloads data from HuggingFace repositories
+ - `db_to_hf.py` - Transforms your database to HuggingFace format
+ - `config_db.py` - Configuration for database connection and HF settings
+ - `config.py` - HuggingFace repository configuration
+ - `requirements.txt` - Python dependencies
+
+ ## Data Structure
+
+ The main dataset contains the following columns (a quick loading example follows this list):
+ - `event_id`: Unique identifier for each event
+ - `question`: The prediction question
+ - `event_type`: Type of event (polymarket, soccer, etc.)
+ - `answer_options`: Possible answers in JSON format
+ - `result`: Actual outcome (if resolved)
+ - `algorithm_name`: AI model that made the prediction
+ - `actual_prediction`: The prediction made
+ - `open_to_bet_until`: Prediction window deadline
+ - `prediction_created_at`: When the prediction was made
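+
+ As a quick sanity check after downloading, the dataset can be loaded the same way `download_data.py` does (the local path here is illustrative):
+
+ ```python
+ import pandas as pd
+
+ df = pd.read_csv("eval-data/data.csv")  # illustrative local cache path
+
+ # Parse the two timestamp columns before any date-based filtering
+ df["open_to_bet_until"] = pd.to_datetime(df["open_to_bet_until"])
+ df["prediction_created_at"] = pd.to_datetime(df["prediction_created_at"])
+
+ print(df.shape)
+ print(df["algorithm_name"].nunique(), "models,", df["event_id"].nunique(), "events")
+ ```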
+
+ ## Output
+
+ The script generates:
+ - Downloaded datasets in local cache folders
+ - `evaluation_queue.csv` with unique events for processing
+ - Console output with data statistics and summary
process_data/__init__.py ADDED
@@ -0,0 +1,11 @@
+ """
+ FutureBench Data Processing
+
+ This package contains utilities for downloading and processing FutureBench datasets from HuggingFace.
+ """
+
+ from .config import API, DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, OWNER, PREDICTIONS_CSV_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
+ from .download_data import download_datasets, generate_queue, process_data
+
+ __version__ = "0.1.0"
+ __all__ = ["TOKEN", "OWNER", "QUEUE_REPO", "RESULTS_REPO", "DATA_REPO", "REPO_ID", "EVAL_REQUESTS_PATH", "EVAL_RESULTS_PATH", "PREDICTIONS_CSV_PATH", "API", "download_datasets", "process_data", "generate_queue"]
process_data/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (805 Bytes)
 
process_data/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.02 kB)
 
process_data/__pycache__/download_data.cpython-312.pyc ADDED
Binary file (6.45 kB)
 
process_data/config.py ADDED
@@ -0,0 +1,25 @@
+ import os
+
+ from huggingface_hub import HfApi
+
+ # Configuration for HuggingFace repositories
+ # ------------------------------------------
+ TOKEN = os.environ.get("HF_TOKEN")  # A read token for accessing datasets
+
+ OWNER = "futurebench"  # Change to your organization
+ # ------------------------------------------
+
+ # HuggingFace repository IDs
+ QUEUE_REPO = f"{OWNER}/requests"
+ RESULTS_REPO = f"{OWNER}/results"
+ DATA_REPO = f"{OWNER}/data"
+
+ # Local cache paths
+ CACHE_PATH = os.getenv("HF_HOME", ".")
+ EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
+ EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
+ PREDICTIONS_CSV_PATH = os.path.join(CACHE_PATH, "eval-data")
+ REPO_ID = f"{OWNER}/Future-Bench"
+
+ # HuggingFace API client
+ API = HfApi(token=TOKEN)
process_data/config_db.py ADDED
@@ -0,0 +1,32 @@
+ """
+ Configuration for database to HuggingFace pipeline.
+ Update these settings to match your setup.
+ """
+
+ import os
+
+ # Database Configuration
+ # Note: Database connection is handled by future_bench.database.get_session()
+ # The script uses the same database connection as the main FutureBench app
+
+
+ # HuggingFace Configuration
+ HF_CONFIG = {
+     "token": os.getenv("HF_TOKEN"),  # Set this in your environment
+     "data_repo": "futurebench/data",
+     "results_repo": "futurebench/results",
+     "requests_repo": "futurebench/requests",  # Optional: for model submissions
+ }
+
+ # Data Processing Settings
+ PROCESSING_CONFIG = {
+     "days_history": 180,  # How many days of data to include
+     "min_predictions": 5,  # Minimum predictions per model to include
+     "event_types": ["news", "polymarket", "sports"],  # Which event types to include
+     "exclude_models": ["test", "debug"],  # Models to exclude from public dataset
+ }
+
+ # Note: Schema mapping not needed since we use SQLAlchemy ORM models
+ # The script uses the same models as convert_to_csv.py:
+ # - EventBase (events table)
+ # - Prediction (predictions table)
process_data/db_to_hf.py ADDED
@@ -0,0 +1,167 @@
+ #!/usr/bin/env python3
+ """
+ Script to transform your production database into HuggingFace dataset format.
+ Follows the same pattern as FutureBench's convert_to_csv.py but simplified.
+ """
+
+ import os
+ import sys
+ import tempfile
+ from datetime import datetime
+
+ import pandas as pd
+ from huggingface_hub import HfApi
+
+ # Add the parent directory to sys.path to allow imports (same as convert_to_csv.py)
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+
+ # Import configuration
+ from config_db import HF_CONFIG, PROCESSING_CONFIG
+
+ # Import FutureBench models and database (same as convert_to_csv.py)
+ from future_bench.database import get_session
+ from future_bench.models import EventBase, Prediction
+
+
+ def datetime_to_string(dt):
+     """Convert datetime to string or return empty string if None (same as convert_to_csv.py)"""
+     return dt.isoformat() if dt else ""
+
+
+ def extract_events_and_predictions(session):
+     """
+     Extract events and predictions from your database.
+     Uses the same SQLAlchemy ORM approach as convert_to_csv.py.
+     """
+     # Get all events (same as convert_to_csv.py)
+     events = session.query(EventBase).all()
+     if not events:
+         print("No events found in the database.")
+         return pd.DataFrame()
+
+     # Get all predictions (same as convert_to_csv.py)
+     predictions = session.query(Prediction).all()
+     if not predictions:
+         print("No predictions found in the database.")
+         return pd.DataFrame()
+
+     # Create combined view (same logic as convert_to_csv.py)
+     combined_data = []
+     for event in events:
+         if event.result is None:  # Skip unresolved events
+             continue
+
+         event_predictions = [p for p in predictions if p.event_id == event.id]
+         for pred in event_predictions:
+             combined_data.append(
+                 {
+                     "event_id": event.id,
+                     "question": event.question,
+                     "event_type": event.event_type,
+                     "open_to_bet_until": datetime_to_string(event.open_to_bet_until),
+                     "result": event.result,
+                     "algorithm_name": pred.algorithm_name,
+                     "actual_prediction": pred.actual_prediction,
+                     "prediction_created_at": datetime_to_string(pred.created_at),
+                 }
+             )
+
+     df = pd.DataFrame(combined_data)
+     return df
+
+
+ def transform_to_standard_format(df):
+     """
+     Transform your raw data into the standard format expected by your leaderboard.
+     This should match the CSV format your leaderboard already expects.
+     """
+     # Convert date columns with flexible parsing for microseconds
+     df["open_to_bet_until"] = pd.to_datetime(df["open_to_bet_until"], format="mixed")
+     df["prediction_created_at"] = pd.to_datetime(df["prediction_created_at"], format="mixed")
+
+     # Add any additional columns your leaderboard expects
+     df["source"] = "your-app"  # Add source identifier
+
+     # Filter to data starting from June 12th
+     cutoff_date = datetime(2025, 6, 12)
+     df = df[df["prediction_created_at"] >= cutoff_date]
+     print(f" Filtered to predictions created from {cutoff_date.strftime('%B %d, %Y')} onwards: {len(df)} records remaining")
+
+     # Filter by event types
+     df = df[df["event_type"].isin(PROCESSING_CONFIG["event_types"])]
+
+     # Exclude test models
+     df = df[~df["algorithm_name"].isin(PROCESSING_CONFIG["exclude_models"])]
+
+     # Calculate accuracy per model (for summary)
+     accuracy_df = (
+         df.groupby(["algorithm_name", "event_type"])
+         .agg(
+             {
+                 # Total predictions per model/event-type pair
+                 "actual_prediction": "count",
+                 # Count rows where the prediction matches the resolved result
+                 "result": lambda x: (df.loc[x.index, "actual_prediction"] == x).sum(),
+             }
+         )
+         .rename(columns={"actual_prediction": "total_predictions", "result": "correct_predictions"})
+         .reset_index()
+     )
+
+     accuracy_df["accuracy"] = accuracy_df["correct_predictions"] / accuracy_df["total_predictions"]
+
+     return df, accuracy_df
+
+
+ def upload_to_huggingface(df, accuracy_df, repo_data, repo_results):
+     """
+     Upload the transformed data to HuggingFace repositories.
+     """
+     api = HfApi(token=HF_CONFIG["token"])
+
+     # Create temporary directory for files
+     with tempfile.TemporaryDirectory() as tmp_dir:
+         # Save main dataset
+         data_path = os.path.join(tmp_dir, "data.csv")
+         df.to_csv(data_path, index=False)
+
+         # Save accuracy summary
+         results_path = os.path.join(tmp_dir, "results.csv")
+         accuracy_df.to_csv(results_path, index=False)
+
+         # Upload to data repo
+         api.upload_file(path_or_fileobj=data_path, path_in_repo="data.csv", repo_id=repo_data, repo_type="dataset")
+
+         # Upload to results repo
+         api.upload_file(path_or_fileobj=results_path, path_in_repo="results.csv", repo_id=repo_results, repo_type="dataset")
+
+     print(f"✅ Uploaded data to {repo_data}")
+     print(f"✅ Uploaded results to {repo_results}")
+
+
+ def main():
+     """Main pipeline function"""
+     print("🚀 Starting database to HuggingFace pipeline...")
+
+     # Step 1: Extract from database (same as convert_to_csv.py)
+     print("📊 Extracting data from database...")
+     session = next(get_session())
+     try:
+         df = extract_events_and_predictions(session)
+         print(f" Found {len(df)} event-prediction pairs")
+     finally:
+         session.close()
+
+     if len(df) == 0:
+         print("❌ No data found in database")
+         return
+
+     # Step 2: Transform to standard format
+     print("🔄 Transforming data...")
+     df, accuracy_df = transform_to_standard_format(df)
+     print(f" Processed {len(df)} records")
+     print(f" Generated accuracy stats for {len(accuracy_df)} model-task pairs")
+
+     # Step 3: Upload to HuggingFace
+     if HF_CONFIG["token"]:
+         print("☁️ Uploading to HuggingFace...")
+         upload_to_huggingface(df, accuracy_df, HF_CONFIG["data_repo"], HF_CONFIG["results_repo"])
+     else:
+         print("⚠️ No HF_TOKEN found, saving locally instead...")
+         df.to_csv("data_export.csv", index=False)
+         accuracy_df.to_csv("results_export.csv", index=False)
+         print(" Saved data_export.csv and results_export.csv")
+
+     print("✅ Pipeline completed successfully!")
+
+
+ if __name__ == "__main__":
+     main()
process_data/download_data.py ADDED
@@ -0,0 +1,149 @@
+ #!/usr/bin/env python3
+ import os
+
+ import pandas as pd
+ from huggingface_hub import snapshot_download
+
+ from .config import DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN
+
+
+ def download_datasets():
+     """Download datasets from HuggingFace repositories"""
+     print("Downloading datasets from HuggingFace...")
+
+     # Download eval requests (queue)
+     try:
+         print(f"Downloading eval requests to {EVAL_REQUESTS_PATH}")
+         snapshot_download(
+             repo_id=QUEUE_REPO,
+             local_dir=EVAL_REQUESTS_PATH,
+             repo_type="dataset",
+             tqdm_class=None,
+             etag_timeout=30,
+             token=TOKEN,
+         )
+         print("✓ Eval requests downloaded successfully")
+     except Exception as e:
+         print(f"Error downloading eval requests: {e}")
+
+     # Download eval results
+     try:
+         print(f"Downloading eval results to {EVAL_RESULTS_PATH}")
+         snapshot_download(
+             repo_id=RESULTS_REPO,
+             local_dir=EVAL_RESULTS_PATH,
+             repo_type="dataset",
+             tqdm_class=None,
+             etag_timeout=30,
+             token=TOKEN,
+         )
+         print("✓ Eval results downloaded successfully")
+     except Exception as e:
+         print(f"Error downloading eval results: {e}")
+
+     # Download prediction data (main dataset)
+     try:
+         print(f"Downloading prediction data to {PREDICTIONS_CSV_PATH}")
+         snapshot_download(
+             repo_id=DATA_REPO,
+             local_dir=PREDICTIONS_CSV_PATH,
+             repo_type="dataset",
+             tqdm_class=None,
+             etag_timeout=30,
+             token=TOKEN,
+         )
+         print("✓ Prediction data downloaded successfully")
+     except Exception as e:
+         print(f"Error downloading prediction data: {e}")
+
+
+ def process_data():
+     """Process the downloaded data and create queue"""
+     print("Processing downloaded data...")
+
+     # Load the main dataset
+     csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
+     if not os.path.exists(csv_path):
+         print(f"Error: data.csv not found at {csv_path}")
+         return None, None
+
+     print(f"Loading data from {csv_path}")
+     df = pd.read_csv(csv_path)
+
+     # Convert date columns
+     df["open_to_bet_until"] = pd.to_datetime(df["open_to_bet_until"])
+     df["prediction_created_at"] = pd.to_datetime(df["prediction_created_at"])
+
+     print(f"Loaded {len(df)} records")
+     print(f"Data shape: {df.shape}")
+     print(f"Columns: {list(df.columns)}")
+
+     # Get unique dates for prediction windows
+     prediction_dates = sorted(df["open_to_bet_until"].dt.date.unique())
+     print(f"Prediction dates: {prediction_dates}")
+
+     # Get unique algorithms/models
+     algorithms = df["algorithm_name"].unique()
+     print(f"Algorithms: {algorithms}")
+
+     # Get unique event types
+     event_types = df["event_type"].unique()
+     print(f"Event types: {event_types}")
+
+     # Create a summary of the data
+     summary = {
+         "total_records": len(df),
+         "unique_events": df["event_id"].nunique(),
+         "unique_algorithms": len(algorithms),
+         "unique_event_types": len(event_types),
+         "prediction_dates": prediction_dates,
+         "algorithms": algorithms.tolist(),
+         "event_types": event_types.tolist(),
+     }
+
+     print("\n=== Data Summary ===")
+     for key, value in summary.items():
+         print(f"{key}: {value}")
+
+     return df, summary
+
+
+ def generate_queue(df):
+     """Generate evaluation queue from processed data"""
+     print("Generating evaluation queue...")
+
+     # Get unique events that need evaluation
+     unique_events = (
+         df.groupby("event_id")
+         .agg({"question": "first", "event_type": "first", "answer_options": "first", "result": "first", "open_to_bet_until": "first"})
+         .reset_index()
+     )
+
+     # Filter for events that haven't been resolved yet (if needed)
+     pending_events = unique_events[unique_events["result"].isna()]
+     resolved_events = unique_events[unique_events["result"].notna()]
+
+     print(f"Total unique events: {len(unique_events)}")
+     print(f"Pending events: {len(pending_events)}")
+     print(f"Resolved events: {len(resolved_events)}")
+
+     # Save queue locally
+     queue_path = os.path.join(PREDICTIONS_CSV_PATH, "evaluation_queue.csv")
+     unique_events.to_csv(queue_path, index=False)
+     print(f"✓ Queue saved to {queue_path}")
+
+     return unique_events
+
+
+ def main():
+     """Main function to download and process data"""
+     print("=== FutureBench Data Download and Processing ===")
+
+     # Download datasets
+     download_datasets()
+
+     # Process data
+     df, summary = process_data()
+
+     if df is None:
+         print("❌ Failed to process data. Exiting.")
+         return
+
+     # Generate queue
+     queue = generate_queue(df)
+
+     print("\n=== Processing Complete ===")
+     print("Data processed and queue generated successfully!")
+     print(f"Queue contains {len(queue)} events")
+
+
+ if __name__ == "__main__":
+     main()
process_data/requirements.txt ADDED
@@ -0,0 +1,5 @@
+ pandas>=1.5.0
+ huggingface_hub>=0.15.0
+ sqlalchemy
+ psycopg2-binary  # For PostgreSQL
+ PyMySQL  # For MySQL
process_data/run_pipeline.sh ADDED
@@ -0,0 +1,34 @@
+ #!/bin/bash
+
+ # Database to HuggingFace Pipeline
+ # Similar to FutureBench's to_csv.sh and to_benchmark.sh but combined
+
+ echo "🚀 Starting Database to HuggingFace Pipeline..."
+
+ # Check if HF_TOKEN is set
+ if [ -z "$HF_TOKEN" ]; then
+     echo "⚠️ HF_TOKEN not set. Will save files locally instead of uploading."
+     echo " To upload to HuggingFace, set: export HF_TOKEN='your_token_here'"
+     echo ""
+ fi
+
+ # Change to project root (same as to_csv.sh)
+ cd ../..
+
+ # Run the pipeline
+ python3 leaderboard/process_data/db_to_hf.py
+
+ # Check if it was successful
+ if [ $? -eq 0 ]; then
+     echo ""
+     echo "✅ Pipeline completed successfully!"
+     echo ""
+     echo "Next steps:"
+     echo "1. Check your HuggingFace repositories for updated data"
+     echo "2. Your leaderboard will automatically use the new data"
+     echo "3. Consider setting up a cron job to run this regularly"
+ else
+     echo ""
+     echo "❌ Pipeline failed. Check the error messages above."
+     exit 1
+ fi
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio>=4.0.0
+ pandas>=1.5.0
+ huggingface_hub>=0.15.0
+ apscheduler
+ git+https://github.com/IsThatYou/gradio_rangeslider
src/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """
+ Simplified leaderboard components for FutureBench
+ """
src/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (230 Bytes)
 
src/__pycache__/about.cpython-312.pyc ADDED
Binary file (2.99 kB)
 
src/__pycache__/display_utils.cpython-312.pyc ADDED
Binary file (15 kB)
 
src/__pycache__/leaderboard_utils.cpython-312.pyc ADDED
Binary file (7.17 kB)
 
src/about.py ADDED
@@ -0,0 +1,66 @@
+ from dataclasses import dataclass
+ from enum import Enum
+
+
+ @dataclass
+ class Task:
+     benchmark: str
+     metric: str
+     col_name: str
+
+
+ # Define our evaluation tasks
+ # ---------------------------------------------------
+ class Tasks(Enum):
+     # task_key in the data, metric name, display name
+     news = Task("news", "acc", "News")
+     polymarket = Task("polymarket", "acc", "PolyMarket")
+
+
+ # Your leaderboard name
+ TITLE = """<h1 align="center" id="space-title" style="font-size: 4.375rem; font-weight: bold; margin-bottom: 1rem;">🔮 FutureBench Leaderboard</h1>"""
+
+ # What does your leaderboard evaluate?
+ INTRODUCTION_TEXT = """<div class="section-card">
+ <h3 class="section-header"><span class="section-icon">🎯</span> About FutureBench</h3>
+ FutureBench is a benchmarking system for evaluating AI models on predicting future events.
+ This leaderboard shows how well different AI models perform at forecasting real-world outcomes
+ across various domains including news events, sports, and prediction markets.
+ <br><br>
+ 📝 <a href="https://www.together.ai/blog/futurebench" target="_blank" style="color: #007acc; text-decoration: none;">Read our blog post</a> for more details about FutureBench.
+ </div>"""
+
+ # Additional information about the benchmark
+ ABOUT_TEXT = """
+ <div class="section-card fade-in-up">
+ <h2 class="section-header"><span class="section-icon">⚙️</span> How it works</h2>
+
+ FutureBench evaluates AI models on their ability to predict future events by:
+
+ - **Ingesting real-world events** from multiple sources (news, sports, prediction markets)
+ - **Collecting AI predictions** before events resolve
+ - **Measuring accuracy** once outcomes are known
+ - **Ranking models** based on their predictive performance
+ </div>
+
+ <div class="section-card fade-in-up stagger-1">
+ <h2 class="section-header"><span class="section-icon">📊</span> Event Types</h2>
+
+ - **News Events**: Predictions about political developments, economic changes, and current events
+ - **PolyMarket**: Predictions on various real-world events traded on prediction markets
+ </div>
+
+ <div class="section-card fade-in-up stagger-2">
+ <h2 class="section-header"><span class="section-icon">📈</span> Metrics</h2>
+
+ Models are evaluated using **accuracy** - the percentage of correct predictions made.
+ The **Average** score shows overall performance across all event types.
+ </div>
+
+ <div class="section-card fade-in-up stagger-3">
+ <h2 class="section-header"><span class="section-icon">🔒</span> Data Integrity</h2>
+
+ All predictions are made before events resolve, ensuring fair evaluation.
+ The leaderboard updates as new events are resolved and model performances are calculated.
+ </div>
+ """
src/display_utils.py ADDED
@@ -0,0 +1,566 @@
+ from dataclasses import dataclass
+
+ from .about import Tasks
+
+
+ @dataclass(frozen=True)
+ class ColumnContent:
+     name: str
+     type: str
+     displayed_by_default: bool
+     hidden: bool = False
+
+
+ # Define leaderboard columns
+ @dataclass(frozen=True)
+ class LeaderboardColumn:
+     model = ColumnContent("Model", "str", True)
+     events = ColumnContent("Events", "number", True)
+     average = ColumnContent("Average", "number", True)
+     # Task-specific columns will be added dynamically
+
+     # Additional model info (hidden by default)
+     correct_predictions = ColumnContent("Correct Predictions", "number", False)
+
+
+ # Get column names for display
+ def get_display_columns():
+     """Get list of column names for display"""
+     base_cols = ["Rank", "Model", "Events", "Average"]
+     task_cols = [task.value.col_name for task in Tasks]
+     return base_cols + task_cols
+
+
+ def get_all_columns():
+     """Get all column names including hidden ones"""
+     base_cols = get_display_columns()
+     hidden_cols = ["Correct Predictions"]
+     return base_cols + hidden_cols
+
+
+ # Formatting helpers
+ def make_clickable_model(model_name):
+     """Make model name clickable with link to HuggingFace"""
+     if "/" in model_name:
+         link = f"https://huggingface.co/{model_name}"
+         return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;">{model_name}</a>'
+     return model_name
+
+
+ def format_percentage(value):
+     """Format accuracy as percentage"""
+     if value is None or value == "N/A":
+         return "N/A"
+     try:
+         return f"{float(value):.1f}%"
+     except (ValueError, TypeError):
+         return "N/A"
+
+
+ def has_valid_scores(df, required_columns):
+     """Check if dataframe has valid scores for required columns"""
+     return df[required_columns].notna().all(axis=1)
+
+
+ # CSS styling
+ CUSTOM_CSS = """
+ /* Global styling */
+ body {
+     background: linear-gradient(135deg, #1e1e2f 0%, #2d2d44 100%) !important;
+ }
+
+ /* Add consistent margins and centering */
+ .gradio-container,
+ .container,
+ .main {
+     margin: 0 auto !important;
+     max-width: 1400px !important;
+     padding: 0 60px !important;
+ }
+
+ .block {
+     margin: 0 auto !important;
+     max-width: 100% !important;
+ }
+
+ .markdown-text {
+     font-size: 18px !important;
+     line-height: 1.6 !important;
+ }
+
+ /* Larger font for introduction text */
+ .section-card {
+     font-size: 22px !important;
+     line-height: 1.7 !important;
+ }
+
+ .section-card p {
+     font-size: 22px !important;
+     line-height: 1.7 !important;
+ }
+
+ .section-card .markdown-text {
+     font-size: 22px !important;
+     line-height: 1.7 !important;
+ }
+
+ /* Header styling */
+ #space-title {
+     text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
+     margin-bottom: 0.5rem !important;
+ }
+
+ .center-logo {
+     display: flex !important;
+     justify-content: center !important;
+     align-items: center !important;
+     margin: 0.25rem 0 0.5rem 0 !important;
+ }
+
+ .center-logo img {
+     width: 200px !important;
+     height: 200px !important;
+     border-radius: 50% !important;
+     overflow: hidden !important;
+     object-fit: cover !important;
+     box-shadow: 0 8px 32px rgba(0,0,0,0.3) !important;
+     border: 3px solid rgba(255,255,255,0.1) !important;
+ }
+
+ /* Tab styling */
+ .tab-nav {
+     margin: 1rem 0 !important;
+     display: flex !important;
+     justify-content: center !important;
+ }
+
+ .tab-buttons {
+     display: flex !important;
+     justify-content: center !important;
+     flex-wrap: wrap !important;
+     gap: 8px !important;
+ }
+
+ .tab-buttons button {
+     font-size: 22px !important;
+     padding: 16px 32px !important;
+     margin: 0 6px !important;
+     border-radius: 8px !important;
+     border: 2px solid transparent !important;
+     background: rgba(255,255,255,0.1) !important;
+     color: white !important;
+     transition: all 0.3s ease !important;
+ }
+
+ .tab-buttons button:hover {
+     background: rgba(255,255,255,0.2) !important;
+     transform: translateY(-2px) !important;
+ }
+
+ .tab-buttons button.selected {
+     background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
+     border-color: #6366f1 !important;
+     box-shadow: 0 4px 12px rgba(99, 102, 241, 0.3) !important;
+ }
+
+ /* Leaderboard table styling */
+ #leaderboard-table {
+     margin: 20px 0 !important;
+     border-radius: 12px !important;
+     overflow: hidden !important;
+     box-shadow: 0 8px 32px rgba(0,0,0,0.2) !important;
+ }
+
+ #leaderboard-table table {
+     border-collapse: separate !important;
+     border-spacing: 0 !important;
+     width: 100% !important;
+ }
+
+ #leaderboard-table th {
+     background: linear-gradient(135deg, #4f46e5, #6366f1) !important;
+     color: white !important;
+     padding: 22px !important;
+     font-weight: 600 !important;
+     text-align: left !important;
+     border: none !important;
+     font-size: 16px !important;
+ }
+
+ #leaderboard-table td {
+     padding: 20px 22px !important;
+     border: none !important;
+     font-size: 16px !important;
+ }
+
+ #leaderboard-table tr:nth-child(even) {
+     background: rgba(255,255,255,0.05) !important;
+ }
+
+ #leaderboard-table tr:hover {
+     background: rgba(99, 102, 241, 0.1) !important;
+     transform: scale(1.01) !important;
+     transition: all 0.2s ease !important;
+ }
+
+ /* Rank column styling */
+ #leaderboard-table td:nth-child(1),
+ #leaderboard-table th:nth-child(1) {
+     text-align: center !important;
+     width: 80px !important;
+     min-width: 80px !important;
+     max-width: 80px !important;
+     font-size: 18px !important;
+     font-weight: 600 !important;
+ }
+
+ /* Model column styling */
+ #leaderboard-table td:nth-child(2),
+ #leaderboard-table th:nth-child(2) {
+     min-width: 180px !important;
+     max-width: 300px !important;
+     overflow: hidden !important;
+     white-space: nowrap !important;
+     text-overflow: ellipsis !important;
+     font-size: 16px !important;
+ }
+
+ /* Events column styling (numeric) */
+ #leaderboard-table td:nth-child(3),
+ #leaderboard-table th:nth-child(3) {
+     text-align: center !important;
+     width: 90px !important;
+     min-width: 90px !important;
+     max-width: 90px !important;
+     font-size: 16px !important;
+     font-weight: 600 !important;
+ }
+
+ /* Average column styling (percentage) */
+ #leaderboard-table td:nth-child(4),
+ #leaderboard-table th:nth-child(4) {
+     text-align: center !important;
+     width: 110px !important;
+     min-width: 110px !important;
+     max-width: 110px !important;
+     font-size: 17px !important;
+     font-weight: 700 !important;
+     color: #10b981 !important;
+ }
+
+ /* Task-specific columns (News, PolyMarket) - compact percentage columns */
+ #leaderboard-table td:nth-child(n+5),
+ #leaderboard-table th:nth-child(n+5) {
+     text-align: center !important;
+     width: 100px !important;
+     min-width: 100px !important;
+     max-width: 100px !important;
+     font-size: 16px !important;
+     font-weight: 600 !important;
+ }
+
+ /* Dropdown styling */
+ .dropdown {
+     margin: 20px 0 !important;
+     width: 100% !important;
+ }
+
+ .dropdown select {
+     background: rgba(255,255,255,0.1) !important;
+     border: 2px solid rgba(255,255,255,0.2) !important;
+     border-radius: 8px !important;
+     padding: 12px 18px !important;
+     color: white !important;
+     font-size: 16px !important;
+     width: 100% !important;
+     max-width: 300px !important;
+ }
+
+ /* Button styling */
+ #refresh-button, .refresh-btn {
+     background: linear-gradient(135deg, #10b981, #059669) !important;
+     color: white !important;
+     border: none !important;
+     padding: 14px 28px !important;
+     border-radius: 8px !important;
+     cursor: pointer !important;
+     font-size: 18px !important;
+     font-weight: 500 !important;
+     transition: all 0.3s ease !important;
+     box-shadow: 0 4px 12px rgba(16, 185, 129, 0.3) !important;
+ }
+
+ #refresh-button:hover, .refresh-btn:hover {
+     background: linear-gradient(135deg, #059669, #047857) !important;
+     transform: translateY(-2px) !important;
+     box-shadow: 0 6px 16px rgba(16, 185, 129, 0.4) !important;
+ }
+
+ /* Cards and sections */
+ .section-card {
+     background: rgba(255,255,255,0.05) !important;
+     border-radius: 12px !important;
+     padding: 25px !important;
+     margin: 15px 0 !important;
+     border: 1px solid rgba(255,255,255,0.1) !important;
+     box-shadow: 0 4px 16px rgba(0,0,0,0.1) !important;
+     max-width: 100% !important;
+ }
+
+ /* Metrics and stats */
+ .metric-highlight {
+     color: #10b981 !important;
+     font-weight: 600 !important;
+ }
+
+ .model-rank-1 {
+     background: linear-gradient(135deg, #fbbf24, #f59e0b) !important;
+     color: #1f2937 !important;
+     font-weight: 600 !important;
+ }
+
+ .model-rank-2 {
+     background: linear-gradient(135deg, #e5e7eb, #d1d5db) !important;
+     color: #1f2937 !important;
+     font-weight: 600 !important;
+ }
+
+ .model-rank-3 {
+     background: linear-gradient(135deg, #cd7c2f, #a16207) !important;
+     color: white !important;
+     font-weight: 600 !important;
+ }
+
+ /* Performance badges */
+ .rank-badge {
+     display: inline-block !important;
+     padding: 4px 8px !important;
+     border-radius: 20px !important;
+     font-size: 10px !important;
+     font-weight: 600 !important;
+     margin-right: 8px !important;
+ }
+
+ .rank-1 .rank-badge {
+     background: linear-gradient(135deg, #fbbf24, #f59e0b) !important;
+     color: #1f2937 !important;
+ }
+
+ .rank-2 .rank-badge {
+     background: linear-gradient(135deg, #e5e7eb, #d1d5db) !important;
+     color: #1f2937 !important;
+ }
+
+ .rank-3 .rank-badge {
+     background: linear-gradient(135deg, #cd7c2f, #a16207) !important;
+     color: white !important;
+ }
+
+ /* Progress bars for accuracy */
+ .accuracy-bar {
+     width: 100% !important;
+     height: 6px !important;
+     background: rgba(255,255,255,0.1) !important;
+     border-radius: 3px !important;
+     margin-top: 4px !important;
+     overflow: hidden !important;
+ }
+
+ .accuracy-progress {
+     height: 100% !important;
+     background: linear-gradient(90deg, #10b981, #059669) !important;
+     border-radius: 3px !important;
+     transition: width 0.8s ease !important;
+ }
+
+ /* Enhanced summary section */
+ .summary-stats {
+     display: grid !important;
+     grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)) !important;
+     gap: 20px !important;
+     margin: 20px 0 !important;
+ }
+
+ .stat-card {
+     background: rgba(255,255,255,0.08) !important;
+     border-radius: 12px !important;
+     padding: 20px !important;
+     border: 1px solid rgba(255,255,255,0.1) !important;
+     text-align: center !important;
+     transition: transform 0.3s ease !important;
+ }
+
+ .stat-card:hover {
+     transform: translateY(-4px) !important;
+ }
+
+ .stat-value {
+     font-size: 1.875rem !important;
+     font-weight: 700 !important;
+     color: #10b981 !important;
+     margin-bottom: 8px !important;
+ }
+
+ .stat-label {
+     font-size: 0.775rem !important;
+     color: rgba(255,255,255,0.7) !important;
+     text-transform: uppercase !important;
+     letter-spacing: 0.5px !important;
+ }
+
+ /* Better section headers */
+ .section-header {
+     display: flex !important;
+     align-items: center !important;
+     gap: 12px !important;
+     margin: 0 0 15px 0 !important;
+     font-size: 1.675rem !important;
+     font-weight: 600 !important;
+ }
+
+ .section-icon {
+     font-size: 1.375rem !important;
+ }
+
+ /* Improved table styling */
+ #leaderboard-table tr:first-child td:first-child {
+     position: relative !important;
+ }
+
+ #leaderboard-table tr:nth-child(1) {
+     background: rgba(251, 191, 36, 0.1) !important;
+ }
+
+ #leaderboard-table tr:nth-child(2) {
+     background: rgba(229, 231, 235, 0.1) !important;
+ }
+
+ #leaderboard-table tr:nth-child(3) {
+     background: rgba(205, 124, 47, 0.1) !important;
+ }
+
+ /* Loading animations */
+ @keyframes fadeInUp {
+     from {
+         opacity: 0;
+         transform: translateY(20px);
+     }
+     to {
+         opacity: 1;
+         transform: translateY(0);
+     }
+ }
+
+ .fade-in-up {
+     animation: fadeInUp 0.6s ease-out !important;
+ }
+
+ /* Staggered animations */
+ .stagger-1 { animation-delay: 0.1s !important; }
+ .stagger-2 { animation-delay: 0.2s !important; }
+ .stagger-3 { animation-delay: 0.3s !important; }
+ .stagger-4 { animation-delay: 0.4s !important; }
+
+ /* Enhanced buttons */
+ .icon-button {
+     display: inline-flex !important;
+     align-items: center !important;
+     gap: 8px !important;
+ }
+
+ .icon-button::before {
+     font-size: 1.0em !important;
+ }
+
+ /* Improved markdown styling */
+ .markdown-text h1 {
+     color: #10b981 !important;
+     border-bottom: 2px solid rgba(16, 185, 129, 0.3) !important;
+     padding-bottom: 8px !important;
+ }
+
+ .markdown-text h2 {
+     color: #6366f1 !important;
+     margin-top: 2rem !important;
+ }
+
+ .markdown-text h3 {
+     color: #8b5cf6 !important;
+ }
+
+ .markdown-text ul {
+     padding-left: 20px !important;
+ }
+
+ .markdown-text li {
+     margin: 8px 0 !important;
+     list-style-type: none !important;
+     position: relative !important;
+ }
+
+ .markdown-text li::before {
+     content: "▸" !important;
+     color: #10b981 !important;
+     position: absolute !important;
+     left: -16px !important;
+     font-weight: bold !important;
+ }
+
+ /* Responsive design */
+ @media (max-width: 768px) {
+     /* Adjust container margins for mobile */
+     .gradio-container,
+     .container,
+     .main {
+         padding: 0 30px !important;
+     }
+
+     #space-title {
+         font-size: 2.375rem !important;
+     }
+
+     .center-logo img {
+         width: 150px !important;
+         height: 150px !important;
+     }
+
+     .tab-buttons button {
+         font-size: 18px !important;
+         padding: 14px 24px !important;
+     }
+
+     .summary-stats {
+         grid-template-columns: 1fr !important;
+     }
+
+     .stat-value {
+         font-size: 1.375rem !important;
+     }
+
+     /* Maintain readable font sizes on mobile */
+     #leaderboard-table th {
+         font-size: 14px !important;
+         padding: 16px 12px !important;
+     }
+
+     #leaderboard-table td {
+         font-size: 14px !important;
+         padding: 16px 12px !important;
+     }
+
+     /* Adjust column widths for mobile */
+     #leaderboard-table td:nth-child(1),
+     #leaderboard-table th:nth-child(1) {
+         width: 60px !important;
+         min-width: 60px !important;
+         max-width: 60px !important;
+     }
+
+     #leaderboard-table td:nth-child(n+5),
+     #leaderboard-table th:nth-child(n+5) {
+         width: 90px !important;
+         min-width: 90px !important;
+         max-width: 90px !important;
+     }
+ }
+ """
src/leaderboard_utils.py ADDED
@@ -0,0 +1,193 @@
+ from datetime import timedelta
+
+ import pandas as pd
+
+ from .about import Tasks
+ from .display_utils import format_percentage, make_clickable_model
+
+
+ def clean_model_name(model_name: str) -> str:
+     """Clean up model names for better display"""
+     if model_name.startswith("smolagents-tavily-web-visit-"):
+         return "Agent Baseline " + model_name.removeprefix("smolagents-tavily-web-visit-")
+     if model_name.startswith("language-model-"):
+         return "Language Model " + model_name.removeprefix("language-model-")
+     return model_name
+
+
+ def get_available_weeks(predictions_df):
+     """Get list of available weeks from the data"""
+     if predictions_df is None or predictions_df.empty:
+         return []
+
+     # Get unique dates and convert to weeks
+     dates = predictions_df["open_to_bet_until"].dt.date.unique()
+     weeks = {}
+
+     for date in dates:
+         # Get the Monday of the week for this date
+         monday = date - timedelta(days=date.weekday())
+         week_end = monday + timedelta(days=6)
+         week_key = f"{monday} to {week_end}"
+         week_range = (monday, week_end)
+         weeks[week_key] = week_range
+
+     # Sort by date
+     sorted_weeks = sorted(weeks.items(), key=lambda x: x[1][0])
+
+     return [("All Time", None)] + sorted_weeks
+
+
+ def filter_data_by_week(predictions_df, week_range):
+     """Filter predictions data by week range"""
+     if predictions_df is None or predictions_df.empty or week_range is None:
+         return predictions_df
+
+     start_date, end_date = week_range
+
+     # Filter data where open_to_bet_until falls within the week
+     filtered_df = predictions_df[(predictions_df["open_to_bet_until"].dt.date >= start_date) & (predictions_df["open_to_bet_until"].dt.date <= end_date)]
+
+     return filtered_df
+
+
+ def create_leaderboard_df(predictions_df, week_filter=None):
+     """
+     Create leaderboard DataFrame from predictions CSV data.
+     Much simpler than Future-Bench's complex JSON parsing.
+     """
+     if predictions_df is None or predictions_df.empty:
+         return pd.DataFrame()
+
+     # Apply week filter if specified
+     if week_filter is not None:
+         predictions_df = filter_data_by_week(predictions_df, week_filter)
+
+         if predictions_df.empty:
+             return pd.DataFrame()
+
+     # Calculate accuracy by algorithm and event type
+     results = []
+
+     # Group by algorithm to calculate metrics
+     for algorithm in predictions_df["algorithm_name"].unique():
+         algo_data = predictions_df[predictions_df["algorithm_name"] == algorithm]
+
+         # Filter out rows where result is null (unresolved events)
+         resolved_data = algo_data[algo_data["result"].notna()]
+
+         if len(resolved_data) == 0:
+             continue
+
+         # Calculate accuracy for each event type
+         cleaned_algorithm = clean_model_name(algorithm)
+         algo_scores = {"Model": make_clickable_model(cleaned_algorithm), "Events": len(resolved_data), "Correct Predictions": 0}
+
+         task_scores = []
+
+         for task in Tasks:
+             task_data = resolved_data[resolved_data["event_type"] == task.value.benchmark]
+
+             if len(task_data) > 0:
+                 # Calculate accuracy for this task
+                 # Handle different prediction formats
+                 correct = 0
+                 total = len(task_data)
+
+                 for _, row in task_data.iterrows():
+                     prediction = row["actual_prediction"]
+                     actual = row["result"]
+
+                     # Simple string comparison for now
+                     # Could be enhanced for more complex prediction formats
+                     if str(prediction).lower().strip() == str(actual).lower().strip():
+                         correct += 1
+
+                 accuracy = (correct / total) * 100 if total > 0 else 0
+                 algo_scores[task.value.col_name] = accuracy
+                 task_scores.append(accuracy)
+
+                 # Add to total correct predictions
+                 algo_scores["Correct Predictions"] += correct
+             else:
+                 algo_scores[task.value.col_name] = None
+
+         # Calculate average accuracy across tasks where model made predictions
+         if task_scores:
+             algo_scores["Average"] = sum(task_scores) / len(task_scores)
+         else:
+             algo_scores["Average"] = 0
+
+         results.append(algo_scores)
+
+     # Create DataFrame
+     df = pd.DataFrame(results)
+
+     # Sort by average score (descending)
+     if "Average" in df.columns:
+         df = df.sort_values("Average", ascending=False)
+
+     # Reset index to ensure proper row indexing
+     df = df.reset_index(drop=True)
+
+     # Add rank column with medals for top 3 and numbers for rest
+     ranks = []
+     for i in range(len(df)):
+         if i == 0:
+             ranks.append("🥇")
+         elif i == 1:
+             ranks.append("🥈")
+         elif i == 2:
+             ranks.append("🥉")
+         else:
+             ranks.append(f"#{i + 1}")
+
+     # Insert rank column at the beginning
+     df.insert(0, "Rank", ranks)
+
+     # Format percentage columns
+     for task in Tasks:
+         if task.value.col_name in df.columns:
+             df[task.value.col_name] = df[task.value.col_name].apply(format_percentage)
+
+     if "Average" in df.columns:
+         df["Average"] = df["Average"].apply(format_percentage)
+
+     return df
+
+
+ def get_leaderboard_summary(df):
+     """Get summary statistics for the leaderboard"""
+     if df is None or df.empty:
+         return {"total_models": 0, "total_predictions": 0, "avg_accuracy": 0}
+
+     total_models = len(df)
+     total_predictions = df["Events"].sum() if "Events" in df.columns else 0
+
+     # Calculate average accuracy across all models
+     avg_accuracy = 0
+     if "Average" in df.columns:
+         # Extract numeric values from percentage strings
+         numeric_scores = []
+         for score in df["Average"]:
+             if score != "N/A":
+                 try:
+                     numeric_scores.append(float(score.replace("%", "")))
+                 except Exception:
+                     pass
+
+         if numeric_scores:
+             avg_accuracy = sum(numeric_scores) / len(numeric_scores)
+
+     return {"total_models": total_models, "total_predictions": total_predictions, "avg_accuracy": avg_accuracy}
+
+
+ def filter_leaderboard(df, min_predictions=0):
+     """Filter leaderboard by minimum number of predictions"""
+     if df is None or df.empty:
+         return df
+
+     if "Events" in df.columns:
+         return df[df["Events"] >= min_predictions]
+
+     return df