Spaces:
Running
Running
Commit
·
6441bc6
0
Parent(s):
Leaderboard deployment 2025-07-16 18:05:41
Browse files- README.md +69 -0
- app.py +266 -0
- image/image.png +0 -0
- logo.png +1 -0
- process_data/README.md +94 -0
- process_data/__init__.py +11 -0
- process_data/__pycache__/__init__.cpython-312.pyc +0 -0
- process_data/__pycache__/config.cpython-312.pyc +0 -0
- process_data/__pycache__/download_data.cpython-312.pyc +0 -0
- process_data/config.py +25 -0
- process_data/config_db.py +32 -0
- process_data/db_to_hf.py +167 -0
- process_data/download_data.py +149 -0
- process_data/requirements.txt +5 -0
- process_data/run_pipeline.sh +34 -0
- requirements.txt +5 -0
- src/__init__.py +3 -0
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/__pycache__/about.cpython-312.pyc +0 -0
- src/__pycache__/display_utils.cpython-312.pyc +0 -0
- src/__pycache__/leaderboard_utils.cpython-312.pyc +0 -0
- src/about.py +66 -0
- src/display_utils.py +566 -0
- src/leaderboard_utils.py +193 -0
README.md
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: FutureBench Leaderboard
|
3 |
+
emoji: 🔮
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: purple
|
6 |
+
sdk: gradio
|
7 |
+
sdk_version: 4.44.0
|
8 |
+
app_file: app.py
|
9 |
+
pinned: false
|
10 |
+
---
|
11 |
+
|
12 |
+
# FutureBench Leaderboard App
|
13 |
+
|
14 |
+
A minimal Gradio application for viewing FutureBench prediction data. This app downloads datasets from HuggingFace on startup and provides a web interface to explore the data.
|
15 |
+
|
16 |
+
## Features
|
17 |
+
|
18 |
+
- 📊 **Data Summary**: View dataset statistics and information
|
19 |
+
- 🔍 **Sample Data**: Browse sample prediction records
|
20 |
+
- 📋 **About**: Learn about the FutureBench system
|
21 |
+
- 🔄 **Auto-refresh**: Download latest data on startup
|
22 |
+
- 📅 **Date Range Slider**: Filter the leaderboard by a custom date span
|
23 |
+
|
24 |
+
## Setup
|
25 |
+
|
26 |
+
1. Install dependencies:
|
27 |
+
```bash
|
28 |
+
pip install -r requirements.txt
|
29 |
+
```
|
30 |
+
|
31 |
+
2. (Optional) Set your HuggingFace token for private repositories:
|
32 |
+
```bash
|
33 |
+
export HF_TOKEN=your_token_here
|
34 |
+
```
|
35 |
+
|
36 |
+
## Running the App
|
37 |
+
|
38 |
+
Launch the Gradio application:
|
39 |
+
|
40 |
+
```bash
|
41 |
+
python app.py
|
42 |
+
```
|
43 |
+
|
44 |
+
The app will:
|
45 |
+
1. Download datasets from HuggingFace repositories on startup
|
46 |
+
2. Process the data and create summaries
|
47 |
+
3. Launch a web interface at `http://localhost:7860`
|
48 |
+
|
49 |
+
## Data Sources
|
50 |
+
|
51 |
+
The app downloads data from these HuggingFace repositories:
|
52 |
+
- `futurebench/requests` - Evaluation queue
|
53 |
+
- `futurebench/results` - Evaluation results
|
54 |
+
- `futurebench/data` - Main prediction dataset
|
55 |
+
|
56 |
+
## Structure
|
57 |
+
|
58 |
+
- `app.py` - Main Gradio application
|
59 |
+
- `process_data/` - Data processing utilities
|
60 |
+
- `requirements.txt` - Python dependencies
|
61 |
+
- `README.md` - This file
|
62 |
+
|
63 |
+
## Next Steps
|
64 |
+
|
65 |
+
This is a minimal version focusing on data download and display. Future enhancements will include:
|
66 |
+
- Full leaderboard with model rankings
|
67 |
+
- Interactive filtering and sorting
|
68 |
+
- Detailed performance metrics
|
69 |
+
- Model comparison tools
|
app.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
import pandas as pd
|
5 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
6 |
+
from gradio_rangeslider import RangeSlider
|
7 |
+
from huggingface_hub import snapshot_download
|
8 |
+
|
9 |
+
# Import our data processing utilities
|
10 |
+
from process_data import API, DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
11 |
+
|
12 |
+
# Import our leaderboard components
|
13 |
+
from src.about import ABOUT_TEXT, INTRODUCTION_TEXT, TITLE
|
14 |
+
from src.display_utils import CUSTOM_CSS, get_display_columns
|
15 |
+
from src.leaderboard_utils import create_leaderboard_df, get_available_weeks, get_leaderboard_summary
|
16 |
+
|
17 |
+
# Global variables for data
|
18 |
+
PREDICTIONS_DF = None
|
19 |
+
LEADERBOARD_DF = None
|
20 |
+
PREDICTION_DATES = []
|
21 |
+
AVAILABLE_WEEKS = []
|
22 |
+
DATA_SUMMARY = {}
|
23 |
+
|
24 |
+
|
25 |
+
def restart_space():
|
26 |
+
"""Restart the space if needed"""
|
27 |
+
API.restart_space(repo_id=REPO_ID)
|
28 |
+
|
29 |
+
|
30 |
+
def download_and_process_data():
|
31 |
+
"""Download and process data on startup"""
|
32 |
+
global PREDICTIONS_DF, LEADERBOARD_DF, PREDICTION_DATES, AVAILABLE_WEEKS, DATA_SUMMARY
|
33 |
+
|
34 |
+
print("=== Starting Data Download ===")
|
35 |
+
|
36 |
+
# Download eval requests (queue)
|
37 |
+
try:
|
38 |
+
print(f"Downloading eval requests to {EVAL_REQUESTS_PATH}")
|
39 |
+
snapshot_download(
|
40 |
+
repo_id=QUEUE_REPO,
|
41 |
+
local_dir=EVAL_REQUESTS_PATH,
|
42 |
+
repo_type="dataset",
|
43 |
+
tqdm_class=None,
|
44 |
+
etag_timeout=30,
|
45 |
+
token=TOKEN,
|
46 |
+
)
|
47 |
+
print("✓ Eval requests downloaded successfully")
|
48 |
+
except Exception as e:
|
49 |
+
print(f"Error downloading eval requests: {e}")
|
50 |
+
|
51 |
+
# Download eval results
|
52 |
+
try:
|
53 |
+
print(f"Downloading eval results to {EVAL_RESULTS_PATH}")
|
54 |
+
snapshot_download(
|
55 |
+
repo_id=RESULTS_REPO,
|
56 |
+
local_dir=EVAL_RESULTS_PATH,
|
57 |
+
repo_type="dataset",
|
58 |
+
tqdm_class=None,
|
59 |
+
etag_timeout=30,
|
60 |
+
token=TOKEN,
|
61 |
+
)
|
62 |
+
print("✓ Eval results downloaded successfully")
|
63 |
+
except Exception as e:
|
64 |
+
print(f"Error downloading eval results: {e}")
|
65 |
+
|
66 |
+
# Download prediction data (main dataset)
|
67 |
+
try:
|
68 |
+
print(f"Downloading prediction data to {PREDICTIONS_CSV_PATH}")
|
69 |
+
snapshot_download(
|
70 |
+
repo_id=DATA_REPO,
|
71 |
+
local_dir=PREDICTIONS_CSV_PATH,
|
72 |
+
repo_type="dataset",
|
73 |
+
tqdm_class=None,
|
74 |
+
etag_timeout=30,
|
75 |
+
token=TOKEN,
|
76 |
+
)
|
77 |
+
print("✓ Prediction data downloaded successfully")
|
78 |
+
except Exception as e:
|
79 |
+
print(f"Error downloading prediction data: {e}")
|
80 |
+
|
81 |
+
# Process the data
|
82 |
+
print("=== Processing Data ===")
|
83 |
+
|
84 |
+
# Load the main dataset
|
85 |
+
csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
|
86 |
+
if os.path.exists(csv_path):
|
87 |
+
print(f"Loading data from {csv_path}")
|
88 |
+
PREDICTIONS_DF = pd.read_csv(csv_path)
|
89 |
+
|
90 |
+
# Convert date columns
|
91 |
+
PREDICTIONS_DF["open_to_bet_until"] = pd.to_datetime(PREDICTIONS_DF["open_to_bet_until"])
|
92 |
+
PREDICTIONS_DF["prediction_created_at"] = pd.to_datetime(PREDICTIONS_DF["prediction_created_at"])
|
93 |
+
|
94 |
+
# Get prediction dates
|
95 |
+
PREDICTION_DATES = sorted(PREDICTIONS_DF["open_to_bet_until"].dt.date.unique())
|
96 |
+
|
97 |
+
# Get available weeks for filtering
|
98 |
+
AVAILABLE_WEEKS = get_available_weeks(PREDICTIONS_DF)
|
99 |
+
|
100 |
+
# Create leaderboard
|
101 |
+
print("Creating leaderboard...")
|
102 |
+
LEADERBOARD_DF = create_leaderboard_df(PREDICTIONS_DF)
|
103 |
+
|
104 |
+
# Create data summary
|
105 |
+
leaderboard_summary = get_leaderboard_summary(LEADERBOARD_DF)
|
106 |
+
DATA_SUMMARY = {
|
107 |
+
"total_records": len(PREDICTIONS_DF),
|
108 |
+
"unique_events": PREDICTIONS_DF["event_id"].nunique(),
|
109 |
+
"unique_algorithms": PREDICTIONS_DF["algorithm_name"].nunique(),
|
110 |
+
"unique_event_types": PREDICTIONS_DF["event_type"].nunique(),
|
111 |
+
"date_range": f"{PREDICTION_DATES[0]} to {PREDICTION_DATES[-1]}" if PREDICTION_DATES else "N/A",
|
112 |
+
"algorithms": PREDICTIONS_DF["algorithm_name"].unique().tolist(),
|
113 |
+
"event_types": PREDICTIONS_DF["event_type"].unique().tolist(),
|
114 |
+
"leaderboard_summary": leaderboard_summary,
|
115 |
+
}
|
116 |
+
|
117 |
+
print("✓ Data processed successfully")
|
118 |
+
print(f" - Total records: {DATA_SUMMARY['total_records']}")
|
119 |
+
print(f" - Unique events: {DATA_SUMMARY['unique_events']}")
|
120 |
+
print(f" - Unique algorithms: {DATA_SUMMARY['unique_algorithms']}")
|
121 |
+
print(f" - Leaderboard models: {leaderboard_summary['total_models']}")
|
122 |
+
print(f" - Date range: {DATA_SUMMARY['date_range']}")
|
123 |
+
|
124 |
+
else:
|
125 |
+
print(f"❌ Error: data.csv not found at {csv_path}")
|
126 |
+
PREDICTIONS_DF = pd.DataFrame()
|
127 |
+
LEADERBOARD_DF = pd.DataFrame()
|
128 |
+
DATA_SUMMARY = {"error": "No data found"}
|
129 |
+
|
130 |
+
|
131 |
+
def get_leaderboard(date_range=None):
|
132 |
+
"""Return leaderboard filtered by date range"""
|
133 |
+
if PREDICTIONS_DF is None or PREDICTIONS_DF.empty:
|
134 |
+
return pd.DataFrame({"message": ["No data available"]})
|
135 |
+
|
136 |
+
# Determine range of dates to filter by
|
137 |
+
if not PREDICTION_DATES:
|
138 |
+
return pd.DataFrame({"message": ["No dates available"]})
|
139 |
+
|
140 |
+
if date_range is None:
|
141 |
+
start_idx, end_idx = 0, len(PREDICTION_DATES) - 1
|
142 |
+
else:
|
143 |
+
start_idx, end_idx = date_range
|
144 |
+
start_idx = max(0, min(start_idx, len(PREDICTION_DATES) - 1))
|
145 |
+
end_idx = max(start_idx, min(end_idx, len(PREDICTION_DATES) - 1))
|
146 |
+
start_idx, end_idx = int(start_idx), int(end_idx)
|
147 |
+
|
148 |
+
week_range = (PREDICTION_DATES[start_idx], PREDICTION_DATES[end_idx])
|
149 |
+
|
150 |
+
# Create filtered leaderboard
|
151 |
+
filtered_leaderboard = create_leaderboard_df(PREDICTIONS_DF, week_range)
|
152 |
+
|
153 |
+
if filtered_leaderboard.empty:
|
154 |
+
return pd.DataFrame({"message": ["No data available for selected week"]})
|
155 |
+
|
156 |
+
# Return only display columns
|
157 |
+
display_cols = get_display_columns()
|
158 |
+
available_cols = [col for col in display_cols if col in filtered_leaderboard.columns]
|
159 |
+
|
160 |
+
return filtered_leaderboard[available_cols]
|
161 |
+
|
162 |
+
|
163 |
+
def get_data_summary():
|
164 |
+
"""Return formatted data summary"""
|
165 |
+
if not DATA_SUMMARY:
|
166 |
+
return "No data loaded"
|
167 |
+
|
168 |
+
if "error" in DATA_SUMMARY:
|
169 |
+
return f"Error: {DATA_SUMMARY['error']}"
|
170 |
+
|
171 |
+
summary = DATA_SUMMARY.get("leaderboard_summary", {})
|
172 |
+
|
173 |
+
summary_text = f"""
|
174 |
+
# 🏆 Leaderboard Summary
|
175 |
+
|
176 |
+
- **Models Ranked**: {summary.get("total_models", 0)}
|
177 |
+
- **Total Predictions**: {summary.get("total_predictions", 0):,}
|
178 |
+
- **Average Accuracy**: {summary.get("avg_accuracy", 0):.1f}%
|
179 |
+
|
180 |
+
# 📊 Dataset Overview
|
181 |
+
|
182 |
+
- **Total Records**: {DATA_SUMMARY["total_records"]:,}
|
183 |
+
- **Unique Events**: {DATA_SUMMARY["unique_events"]:,}
|
184 |
+
- **Event Types**: {DATA_SUMMARY["unique_event_types"]}
|
185 |
+
- **Date Range**: {DATA_SUMMARY["date_range"]}
|
186 |
+
|
187 |
+
## 🤖 Models
|
188 |
+
{", ".join(DATA_SUMMARY["algorithms"])}
|
189 |
+
|
190 |
+
## 📋 Event Types
|
191 |
+
{", ".join(DATA_SUMMARY["event_types"])}
|
192 |
+
"""
|
193 |
+
|
194 |
+
return summary_text
|
195 |
+
|
196 |
+
|
197 |
+
def get_sample_data():
|
198 |
+
"""Return sample of the data"""
|
199 |
+
if PREDICTIONS_DF is None or PREDICTIONS_DF.empty:
|
200 |
+
return pd.DataFrame({"message": ["No data available"]})
|
201 |
+
|
202 |
+
# Return first 10 rows with key columns
|
203 |
+
sample_cols = ["event_id", "question", "event_type", "algorithm_name", "actual_prediction", "result", "open_to_bet_until"]
|
204 |
+
available_cols = [col for col in sample_cols if col in PREDICTIONS_DF.columns]
|
205 |
+
|
206 |
+
return PREDICTIONS_DF[available_cols].head(10)
|
207 |
+
|
208 |
+
|
209 |
+
def refresh_all_data(date_range=None):
|
210 |
+
"""Refresh all data and return updated components"""
|
211 |
+
download_and_process_data()
|
212 |
+
return (
|
213 |
+
get_leaderboard(date_range),
|
214 |
+
get_data_summary(),
|
215 |
+
get_sample_data(),
|
216 |
+
)
|
217 |
+
|
218 |
+
|
219 |
+
# Download and process data on startup
|
220 |
+
download_and_process_data()
|
221 |
+
|
222 |
+
# Create Gradio interface
|
223 |
+
with gr.Blocks(css=CUSTOM_CSS, title="FutureBench Leaderboard") as demo:
|
224 |
+
gr.HTML(TITLE)
|
225 |
+
with gr.Row():
|
226 |
+
gr.Image("image/image.png", height=200, width=200, show_label=False, show_download_button=False, show_fullscreen_button=False, container=False, elem_classes="center-logo")
|
227 |
+
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
228 |
+
|
229 |
+
with gr.Tabs():
|
230 |
+
with gr.TabItem("🏆 Leaderboard"):
|
231 |
+
leaderboard_display = gr.Dataframe(value=get_leaderboard(), interactive=False, wrap=True, elem_id="leaderboard-table")
|
232 |
+
|
233 |
+
with gr.Row():
|
234 |
+
date_slider = RangeSlider(
|
235 |
+
minimum=0,
|
236 |
+
maximum=len(PREDICTION_DATES) - 1,
|
237 |
+
value=(0, len(PREDICTION_DATES) - 1),
|
238 |
+
step=1,
|
239 |
+
label="📅 Date Range",
|
240 |
+
show_label=True,
|
241 |
+
labels=[str(d) for d in PREDICTION_DATES],
|
242 |
+
)
|
243 |
+
|
244 |
+
# Update leaderboard when date range is changed
|
245 |
+
date_slider.change(get_leaderboard, inputs=date_slider, outputs=leaderboard_display)
|
246 |
+
|
247 |
+
with gr.TabItem("📊 Summary"):
|
248 |
+
summary_display = gr.Markdown(get_data_summary(), elem_classes="markdown-text")
|
249 |
+
refresh_summary_btn = gr.Button("🔄 Refresh Summary")
|
250 |
+
|
251 |
+
refresh_summary_btn.click(lambda: get_data_summary(), outputs=summary_display)
|
252 |
+
|
253 |
+
with gr.TabItem("🔍 Sample Data"):
|
254 |
+
sample_display = gr.Dataframe(value=get_sample_data(), interactive=False, wrap=True)
|
255 |
+
refresh_sample_btn = gr.Button("🔄 Refresh Sample")
|
256 |
+
|
257 |
+
refresh_sample_btn.click(lambda: get_sample_data(), outputs=sample_display)
|
258 |
+
|
259 |
+
with gr.TabItem("📋 About"):
|
260 |
+
gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
|
261 |
+
|
262 |
+
if __name__ == "__main__":
|
263 |
+
scheduler = BackgroundScheduler()
|
264 |
+
scheduler.add_job(restart_space, "interval", seconds=1800)
|
265 |
+
scheduler.start()
|
266 |
+
demo.queue(default_concurrency_limit=40).launch()
|
image/image.png
ADDED
![]() |
logo.png
ADDED
![]() |
process_data/README.md
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# FutureBench Dataset Processing
|
2 |
+
|
3 |
+
This directory contains tools for processing FutureBench datasets, both downloading from HuggingFace and transforming your own database into the standard format.
|
4 |
+
|
5 |
+
## Option 1: Download from HuggingFace (Original)
|
6 |
+
|
7 |
+
Use this to download the existing FutureBench dataset:
|
8 |
+
|
9 |
+
```bash
|
10 |
+
python download_data.py
|
11 |
+
```
|
12 |
+
|
13 |
+
## Option 2: Transform Your Own Database
|
14 |
+
|
15 |
+
Use this to transform your production database into HuggingFace format:
|
16 |
+
|
17 |
+
### Setup
|
18 |
+
|
19 |
+
1. **Install dependencies:**
|
20 |
+
```bash
|
21 |
+
pip install pandas sqlalchemy huggingface_hub
|
22 |
+
```
|
23 |
+
|
24 |
+
2. **Set up HuggingFace token:**
|
25 |
+
```bash
|
26 |
+
export HF_TOKEN="your_huggingface_token_here"
|
27 |
+
```
|
28 |
+
|
29 |
+
3. **Configure your settings:**
|
30 |
+
Edit `config_db.py` to match your needs:
|
31 |
+
- Update `HF_CONFIG` with your HuggingFace repository names
|
32 |
+
- Adjust `PROCESSING_CONFIG` for data filtering preferences
|
33 |
+
- Note: Database connection uses the same setup as the main FutureBench app
|
34 |
+
|
35 |
+
### Usage
|
36 |
+
|
37 |
+
```bash
|
38 |
+
# Transform your database and upload to HuggingFace
|
39 |
+
python db_to_hf.py
|
40 |
+
|
41 |
+
# Or run locally without uploading
|
42 |
+
HF_TOKEN="" python db_to_hf.py
|
43 |
+
```
|
44 |
+
|
45 |
+
### Database Schema
|
46 |
+
|
47 |
+
The script uses the same database schema as the main FutureBench application:
|
48 |
+
- `EventBase` model for events
|
49 |
+
- `Prediction` model for predictions
|
50 |
+
- Uses SQLAlchemy ORM (same as `convert_to_csv.py`)
|
51 |
+
|
52 |
+
No additional database configuration needed - it uses the existing FutureBench database connection.
|
53 |
+
|
54 |
+
### Output Format
|
55 |
+
|
56 |
+
The script produces data in the same format as the original FutureBench dataset:
|
57 |
+
- `event_id`, `question`, `event_type`, `algorithm_name`, `actual_prediction`, `result`, `open_to_bet_until`, `prediction_created_at`
|
58 |
+
|
59 |
+
### Automation
|
60 |
+
|
61 |
+
You can run this as a scheduled job:
|
62 |
+
|
63 |
+
```bash
|
64 |
+
# Add to crontab to run daily at 2 AM
|
65 |
+
0 2 * * * cd /path/to/your/project && python leaderboard/process_data/db_to_hf.py
|
66 |
+
```
|
67 |
+
|
68 |
+
## Files
|
69 |
+
|
70 |
+
- `download_data.py` - Downloads data from HuggingFace repositories
|
71 |
+
- `db_to_hf.py` - Transforms your database to HuggingFace format
|
72 |
+
- `config_db.py` - Configuration for database connection and HF settings
|
73 |
+
- `config.py` - HuggingFace repository configuration
|
74 |
+
- `requirements.txt` - Python dependencies
|
75 |
+
|
76 |
+
## Data Structure
|
77 |
+
|
78 |
+
The main dataset contains:
|
79 |
+
- `event_id`: Unique identifier for each event
|
80 |
+
- `question`: The prediction question
|
81 |
+
- `event_type`: Type of event (polymarket, soccer, etc.)
|
82 |
+
- `answer_options`: Possible answers in JSON format
|
83 |
+
- `result`: Actual outcome (if resolved)
|
84 |
+
- `algorithm_name`: AI model that made the prediction
|
85 |
+
- `actual_prediction`: The prediction made
|
86 |
+
- `open_to_bet_until`: Prediction window deadline
|
87 |
+
- `prediction_created_at`: When prediction was made
|
88 |
+
|
89 |
+
## Output
|
90 |
+
|
91 |
+
The script generates:
|
92 |
+
- Downloaded datasets in local cache folders
|
93 |
+
- `evaluation_queue.csv` with unique events for processing
|
94 |
+
- Console output with data statistics and summary
|
process_data/__init__.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
FutureBench Data Processing
|
3 |
+
|
4 |
+
This package contains utilities for downloading and processing FutureBench datasets from HuggingFace.
|
5 |
+
"""
|
6 |
+
|
7 |
+
from .config import API, DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, OWNER, PREDICTIONS_CSV_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
|
8 |
+
from .download_data import download_datasets, generate_queue, process_data
|
9 |
+
|
10 |
+
__version__ = "0.1.0"
|
11 |
+
__all__ = ["TOKEN", "OWNER", "QUEUE_REPO", "RESULTS_REPO", "DATA_REPO", "REPO_ID", "EVAL_REQUESTS_PATH", "EVAL_RESULTS_PATH", "PREDICTIONS_CSV_PATH", "API", "download_datasets", "process_data", "generate_queue"]
|
process_data/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (805 Bytes). View file
|
|
process_data/__pycache__/config.cpython-312.pyc
ADDED
Binary file (1.02 kB). View file
|
|
process_data/__pycache__/download_data.cpython-312.pyc
ADDED
Binary file (6.45 kB). View file
|
|
process_data/config.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
|
3 |
+
from huggingface_hub import HfApi
|
4 |
+
|
5 |
+
# Configuration for HuggingFace repositories
|
6 |
+
# ------------------------------------------
|
7 |
+
TOKEN = os.environ.get("HF_TOKEN") # A read token for accessing datasets
|
8 |
+
|
9 |
+
OWNER = "futurebench" # Change to your organization
|
10 |
+
# ------------------------------------------
|
11 |
+
|
12 |
+
# HuggingFace repository IDs
|
13 |
+
QUEUE_REPO = f"{OWNER}/requests"
|
14 |
+
RESULTS_REPO = f"{OWNER}/results"
|
15 |
+
DATA_REPO = f"{OWNER}/data"
|
16 |
+
|
17 |
+
# Local cache paths
|
18 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
19 |
+
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
|
20 |
+
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
21 |
+
PREDICTIONS_CSV_PATH = os.path.join(CACHE_PATH, "eval-data")
|
22 |
+
REPO_ID = f"{OWNER}/Future-Bench"
|
23 |
+
|
24 |
+
# HuggingFace API client
|
25 |
+
API = HfApi(token=TOKEN)
|
process_data/config_db.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Configuration for database to HuggingFace pipeline.
|
3 |
+
Update these settings to match your setup.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import os
|
7 |
+
|
8 |
+
# Database Configuration
|
9 |
+
# Note: Database connection is handled by future_bench.database.get_session()
|
10 |
+
# The script uses the same database connection as the main FutureBench app
|
11 |
+
|
12 |
+
|
13 |
+
# HuggingFace Configuration
|
14 |
+
HF_CONFIG = {
|
15 |
+
"token": os.getenv("HF_TOKEN"), # Set this in your environment
|
16 |
+
"data_repo": "futurebench/data",
|
17 |
+
"results_repo": "futurebench/results",
|
18 |
+
"requests_repo": "futurebench/requests", # Optional: for model submissions
|
19 |
+
}
|
20 |
+
|
21 |
+
# Data Processing Settings
|
22 |
+
PROCESSING_CONFIG = {
|
23 |
+
"days_history": 180, # How many days of data to include
|
24 |
+
"min_predictions": 5, # Minimum predictions per model to include
|
25 |
+
"event_types": ["news", "polymarket", "sports"], # Which event types to include
|
26 |
+
"exclude_models": ["test", "debug"], # Models to exclude from public dataset
|
27 |
+
}
|
28 |
+
|
29 |
+
# Note: Schema mapping not needed since we use SQLAlchemy ORM models
|
30 |
+
# The script uses the same models as convert_to_csv.py:
|
31 |
+
# - EventBase (events table)
|
32 |
+
# - Prediction (predictions table)
|
process_data/db_to_hf.py
ADDED
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
Script to transform your production database into HuggingFace dataset format.
|
4 |
+
Follows the same pattern as FutureBench's convert_to_csv.py but simplified.
|
5 |
+
"""
|
6 |
+
|
7 |
+
import os
|
8 |
+
import sys
|
9 |
+
import tempfile
|
10 |
+
from datetime import datetime
|
11 |
+
|
12 |
+
import pandas as pd
|
13 |
+
from huggingface_hub import HfApi
|
14 |
+
|
15 |
+
# Add the parent directory to sys.path to allow imports (same as convert_to_csv.py)
|
16 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
17 |
+
|
18 |
+
# Import FutureBench models and database (same as convert_to_csv.py)
|
19 |
+
# Import configuration
|
20 |
+
from config_db import HF_CONFIG, PROCESSING_CONFIG
|
21 |
+
|
22 |
+
from future_bench.database import get_session
|
23 |
+
from future_bench.models import EventBase, Prediction
|
24 |
+
|
25 |
+
|
26 |
+
def datetime_to_string(dt):
|
27 |
+
"""Convert datetime to string or return empty string if None (same as convert_to_csv.py)"""
|
28 |
+
return dt.isoformat() if dt else ""
|
29 |
+
|
30 |
+
|
31 |
+
def extract_events_and_predictions(session):
|
32 |
+
"""
|
33 |
+
Extract events and predictions from your database.
|
34 |
+
Uses the same SQLAlchemy ORM approach as convert_to_csv.py.
|
35 |
+
"""
|
36 |
+
# Get all events (same as convert_to_csv.py)
|
37 |
+
events = session.query(EventBase).all()
|
38 |
+
if not events:
|
39 |
+
print("No events found in the database.")
|
40 |
+
return pd.DataFrame()
|
41 |
+
|
42 |
+
# Get all predictions (same as convert_to_csv.py)
|
43 |
+
predictions = session.query(Prediction).all()
|
44 |
+
if not predictions:
|
45 |
+
print("No predictions found in the database.")
|
46 |
+
return pd.DataFrame()
|
47 |
+
|
48 |
+
# Create combined view (same logic as convert_to_csv.py)
|
49 |
+
combined_data = []
|
50 |
+
for event in events:
|
51 |
+
if event.result is None: # Skip unresolved events
|
52 |
+
continue
|
53 |
+
|
54 |
+
event_predictions = [p for p in predictions if p.event_id == event.id]
|
55 |
+
for pred in event_predictions:
|
56 |
+
combined_data.append(
|
57 |
+
{
|
58 |
+
"event_id": event.id,
|
59 |
+
"question": event.question,
|
60 |
+
"event_type": event.event_type,
|
61 |
+
"open_to_bet_until": datetime_to_string(event.open_to_bet_until),
|
62 |
+
"result": event.result,
|
63 |
+
"algorithm_name": pred.algorithm_name,
|
64 |
+
"actual_prediction": pred.actual_prediction,
|
65 |
+
"prediction_created_at": datetime_to_string(pred.created_at),
|
66 |
+
}
|
67 |
+
)
|
68 |
+
|
69 |
+
df = pd.DataFrame(combined_data)
|
70 |
+
return df
|
71 |
+
|
72 |
+
|
73 |
+
def transform_to_standard_format(df):
|
74 |
+
"""
|
75 |
+
Transform your raw data into the standard format expected by your leaderboard.
|
76 |
+
This should match the CSV format your leaderboard already expects.
|
77 |
+
"""
|
78 |
+
# Convert date columns with flexible parsing for microseconds
|
79 |
+
df["open_to_bet_until"] = pd.to_datetime(df["open_to_bet_until"], format="mixed")
|
80 |
+
df["prediction_created_at"] = pd.to_datetime(df["prediction_created_at"], format="mixed")
|
81 |
+
|
82 |
+
# Add any additional columns your leaderboard expects
|
83 |
+
df["source"] = "your-app" # Add source identifier
|
84 |
+
|
85 |
+
# Filter to data starting from June 12th
|
86 |
+
cutoff_date = datetime(2025, 6, 12)
|
87 |
+
df = df[df["prediction_created_at"] >= cutoff_date]
|
88 |
+
print(f" Filtered to predictions created from {cutoff_date.strftime('%B %d, %Y')} onwards: {len(df)} records remaining")
|
89 |
+
|
90 |
+
# Filter by event types
|
91 |
+
df = df[df["event_type"].isin(PROCESSING_CONFIG["event_types"])]
|
92 |
+
|
93 |
+
# Exclude test models
|
94 |
+
df = df[~df["algorithm_name"].isin(PROCESSING_CONFIG["exclude_models"])]
|
95 |
+
|
96 |
+
# Calculate accuracy per model (for summary)
|
97 |
+
accuracy_df = df.groupby(["algorithm_name", "event_type"]).agg({"actual_prediction": "count", "result": lambda x: (df.loc[x.index, "actual_prediction"] == x).sum()}).rename(columns={"actual_prediction": "total_predictions", "result": "correct_predictions"}).reset_index()
|
98 |
+
|
99 |
+
accuracy_df["accuracy"] = accuracy_df["correct_predictions"] / accuracy_df["total_predictions"]
|
100 |
+
|
101 |
+
return df, accuracy_df
|
102 |
+
|
103 |
+
|
104 |
+
def upload_to_huggingface(df, accuracy_df, repo_data, repo_results):
|
105 |
+
"""
|
106 |
+
Upload the transformed data to HuggingFace repositories.
|
107 |
+
"""
|
108 |
+
api = HfApi(token=HF_CONFIG["token"])
|
109 |
+
|
110 |
+
# Create temporary directory for files
|
111 |
+
with tempfile.TemporaryDirectory() as tmp_dir:
|
112 |
+
# Save main dataset
|
113 |
+
data_path = os.path.join(tmp_dir, "data.csv")
|
114 |
+
df.to_csv(data_path, index=False)
|
115 |
+
|
116 |
+
# Save accuracy summary
|
117 |
+
results_path = os.path.join(tmp_dir, "results.csv")
|
118 |
+
accuracy_df.to_csv(results_path, index=False)
|
119 |
+
|
120 |
+
# Upload to data repo
|
121 |
+
api.upload_file(path_or_fileobj=data_path, path_in_repo="data.csv", repo_id=repo_data, repo_type="dataset")
|
122 |
+
|
123 |
+
# Upload to results repo
|
124 |
+
api.upload_file(path_or_fileobj=results_path, path_in_repo="results.csv", repo_id=repo_results, repo_type="dataset")
|
125 |
+
|
126 |
+
print(f"✅ Uploaded data to {repo_data}")
|
127 |
+
print(f"✅ Uploaded results to {repo_results}")
|
128 |
+
|
129 |
+
|
130 |
+
def main():
|
131 |
+
"""Main pipeline function"""
|
132 |
+
print("🚀 Starting database to HuggingFace pipeline...")
|
133 |
+
|
134 |
+
# Step 1: Extract from database (same as convert_to_csv.py)
|
135 |
+
print("📊 Extracting data from database...")
|
136 |
+
session = next(get_session())
|
137 |
+
try:
|
138 |
+
df = extract_events_and_predictions(session)
|
139 |
+
print(f" Found {len(df)} event-prediction pairs")
|
140 |
+
finally:
|
141 |
+
session.close()
|
142 |
+
|
143 |
+
if len(df) == 0:
|
144 |
+
print("❌ No data found in database")
|
145 |
+
return
|
146 |
+
|
147 |
+
# Step 2: Transform to standard format
|
148 |
+
print("🔄 Transforming data...")
|
149 |
+
df, accuracy_df = transform_to_standard_format(df)
|
150 |
+
print(f" Processed {len(df)} records")
|
151 |
+
print(f" Generated accuracy stats for {len(accuracy_df)} model-task pairs")
|
152 |
+
|
153 |
+
# Step 3: Upload to HuggingFace
|
154 |
+
if HF_CONFIG["token"]:
|
155 |
+
print("☁️ Uploading to HuggingFace...")
|
156 |
+
upload_to_huggingface(df, accuracy_df, HF_CONFIG["data_repo"], HF_CONFIG["results_repo"])
|
157 |
+
else:
|
158 |
+
print("⚠️ No HF_TOKEN found, saving locally instead...")
|
159 |
+
df.to_csv("data_export.csv", index=False)
|
160 |
+
accuracy_df.to_csv("results_export.csv", index=False)
|
161 |
+
print(" Saved data_export.csv and results_export.csv")
|
162 |
+
|
163 |
+
print("✅ Pipeline completed successfully!")
|
164 |
+
|
165 |
+
|
166 |
+
if __name__ == "__main__":
|
167 |
+
main()
|
process_data/download_data.py
ADDED
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
import os
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
from huggingface_hub import snapshot_download
|
6 |
+
|
7 |
+
from .config import DATA_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, PREDICTIONS_CSV_PATH, QUEUE_REPO, RESULTS_REPO, TOKEN
|
8 |
+
|
9 |
+
|
10 |
+
def download_datasets():
    """Fetch the three HuggingFace dataset repos the leaderboard depends on.

    Downloads are best-effort: a failure on one repo is logged and does not
    stop the remaining downloads, so the app can still start on partial data.
    """

    def _download_repo(repo_id, local_dir, label):
        # Snapshot an entire dataset repo into local_dir. Errors are printed
        # rather than raised so one broken repo doesn't abort the others
        # (same behavior as the original copy-pasted try/except stanzas).
        try:
            print(f"Downloading {label} to {local_dir}")
            snapshot_download(
                repo_id=repo_id,
                local_dir=local_dir,
                repo_type="dataset",
                tqdm_class=None,  # suppress progress bars in server logs
                etag_timeout=30,
                token=TOKEN,
            )
            print(f"✓ {label.capitalize()} downloaded successfully")
        except Exception as e:
            print(f"Error downloading {label}: {e}")

    print("Downloading datasets from HuggingFace...")

    # Eval requests (queue), eval results, and the main prediction dataset.
    _download_repo(QUEUE_REPO, EVAL_REQUESTS_PATH, "eval requests")
    _download_repo(RESULTS_REPO, EVAL_RESULTS_PATH, "eval results")
    _download_repo(DATA_REPO, PREDICTIONS_CSV_PATH, "prediction data")
|
58 |
+
|
59 |
+
|
60 |
+
def process_data():
    """Load the downloaded predictions CSV, normalise it, and summarise it.

    Returns:
        (DataFrame, dict) on success, or (None, None) when the expected
        data.csv is missing from the download directory.
    """
    print("Processing downloaded data...")

    csv_path = os.path.join(PREDICTIONS_CSV_PATH, "data.csv")
    if not os.path.exists(csv_path):
        print(f"Error: data.csv not found at {csv_path}")
        return None, None

    print(f"Loading data from {csv_path}")
    df = pd.read_csv(csv_path)

    # Parse both timestamp columns up front so downstream code can use the
    # .dt accessor directly.
    for column in ("open_to_bet_until", "prediction_created_at"):
        df[column] = pd.to_datetime(df[column])

    print(f"Loaded {len(df)} records")
    print(f"Data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Distinct prediction-window end dates, in chronological order.
    prediction_dates = sorted(df["open_to_bet_until"].dt.date.unique())
    print(f"Prediction dates: {prediction_dates}")

    # Distinct competing models and event categories.
    algorithms = df["algorithm_name"].unique()
    print(f"Algorithms: {algorithms}")

    event_types = df["event_type"].unique()
    print(f"Event types: {event_types}")

    summary = {
        "total_records": len(df),
        "unique_events": df["event_id"].nunique(),
        "unique_algorithms": len(algorithms),
        "unique_event_types": len(event_types),
        "prediction_dates": prediction_dates,
        "algorithms": algorithms.tolist(),
        "event_types": event_types.tolist(),
    }

    print("\n=== Data Summary ===")
    for key, value in summary.items():
        print(f"{key}: {value}")

    return df, summary
|
101 |
+
|
102 |
+
|
103 |
+
def generate_queue(df):
    """Collapse per-prediction rows into one row per event and persist that
    table as the evaluation queue CSV.

    Returns the per-event DataFrame (both pending and resolved events).
    """
    print("Generating evaluation queue...")

    # Keep the first occurrence of each descriptive field per event.
    first_of = {
        "question": "first",
        "event_type": "first",
        "answer_options": "first",
        "result": "first",
        "open_to_bet_until": "first",
    }
    unique_events = df.groupby("event_id").agg(first_of).reset_index()

    # Split on whether an outcome has been recorded yet (counts are only
    # reported; the full table is written either way).
    resolved_mask = unique_events["result"].notna()
    pending_events = unique_events[~resolved_mask]
    resolved_events = unique_events[resolved_mask]

    print(f"Total unique events: {len(unique_events)}")
    print(f"Pending events: {len(pending_events)}")
    print(f"Resolved events: {len(resolved_events)}")

    # Persist the queue next to the downloaded data.
    queue_path = os.path.join(PREDICTIONS_CSV_PATH, "evaluation_queue.csv")
    unique_events.to_csv(queue_path, index=False)
    print(f"✓ Queue saved to {queue_path}")

    return unique_events
|
124 |
+
|
125 |
+
|
126 |
+
def main():
    """Main function to download and process data.

    Orchestrates the local pipeline end to end: pull the dataset repos from
    HuggingFace, load and summarise the predictions CSV, then build the
    per-event evaluation queue.
    """
    print("=== FutureBench Data Download and Processing ===")

    # Download datasets (best-effort; per-repo failures are logged inside)
    download_datasets()

    # Process data; returns (None, None) when data.csv is missing
    df, summary = process_data()

    if df is None:
        print("❌ Failed to process data. Exiting.")
        return

    # Generate queue (also written to evaluation_queue.csv as a side effect)
    queue = generate_queue(df)

    print("\n=== Processing Complete ===")
    print("Data processed and queue generated successfully!")
    print(f"Queue contains {len(queue)} events")
|
146 |
+
|
147 |
+
|
148 |
+
if __name__ == "__main__":
|
149 |
+
main()
|
process_data/requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pandas>=1.5.0
|
2 |
+
huggingface_hub>=0.15.0
|
3 |
+
sqlalchemy
|
4 |
+
psycopg2-binary # For PostgreSQL
|
5 |
+
PyMySQL # For MySQL
|
process_data/run_pipeline.sh
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash

# Database to HuggingFace Pipeline
# Similar to FutureBench's to_csv.sh and to_benchmark.sh but combined

echo "🚀 Starting Database to HuggingFace Pipeline..."

# Check if HF_TOKEN is set
if [ -z "$HF_TOKEN" ]; then
    echo "⚠️ HF_TOKEN not set. Will save files locally instead of uploading."
    echo " To upload to HuggingFace, set: export HF_TOKEN='your_token_here'"
    echo ""
fi

# Change to project root relative to this script's own location, so the
# pipeline works no matter where it is invoked from (the old `cd ../..`
# only worked when the caller's CWD was already this script's directory).
cd "$(dirname "$0")/../.." || { echo "❌ Could not locate project root."; exit 1; }

# Run the pipeline
python3 leaderboard/process_data/db_to_hf.py

# Check if it was successful
if [ $? -eq 0 ]; then
    echo ""
    echo "✅ Pipeline completed successfully!"
    echo ""
    echo "Next steps:"
    echo "1. Check your HuggingFace repositories for updated data"
    echo "2. Your leaderboard will automatically use the new data"
    echo "3. Consider setting up a cron job to run this regularly"
else
    echo ""
    echo "❌ Pipeline failed. Check the error messages above."
    exit 1
fi
|
requirements.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio>=4.0.0
|
2 |
+
pandas>=1.5.0
|
3 |
+
huggingface_hub>=0.15.0
|
4 |
+
apscheduler
|
5 |
+
git+https://github.com/IsThatYou/gradio_rangeslider
|
src/__init__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Simplified leaderboard components for FutureBench
|
3 |
+
"""
|
src/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (230 Bytes). View file
|
|
src/__pycache__/about.cpython-312.pyc
ADDED
Binary file (2.99 kB). View file
|
|
src/__pycache__/display_utils.cpython-312.pyc
ADDED
Binary file (15 kB). View file
|
|
src/__pycache__/leaderboard_utils.cpython-312.pyc
ADDED
Binary file (7.17 kB). View file
|
|
src/about.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
from enum import Enum
|
3 |
+
|
4 |
+
|
5 |
+
@dataclass
class Task:
    """One benchmark task tracked by the leaderboard."""

    # key identifying the task in the underlying data
    benchmark: str
    # metric name used for scoring (e.g. "acc")
    metric: str
    # human-readable column header shown in the leaderboard table
    col_name: str
|
10 |
+
|
11 |
+
|
12 |
+
# Define our evaluation tasks
|
13 |
+
# ---------------------------------------------------
|
14 |
+
class Tasks(Enum):
    """Enumeration of the evaluation tasks shown as leaderboard columns."""

    # each member wraps Task(task_key in the data, metric name, display name)
    news = Task("news", "acc", "News")
    polymarket = Task("polymarket", "acc", "PolyMarket")
|
18 |
+
|
19 |
+
|
20 |
+
# Your leaderboard name
|
21 |
+
TITLE = """<h1 align="center" id="space-title" style="font-size: 4.375rem; font-weight: bold; margin-bottom: 1rem;">🔮 FutureBench Leaderboard</h1>"""
|
22 |
+
|
23 |
+
# What does your leaderboard evaluate?
|
24 |
+
INTRODUCTION_TEXT = """<div class="section-card">
|
25 |
+
<h3 class="section-header"><span class="section-icon">🎯</span> About FutureBench</h3>
|
26 |
+
FutureBench is a benchmarking system for evaluating AI models on predicting future events.
|
27 |
+
This leaderboard shows how well different AI models perform at forecasting real-world outcomes
|
28 |
+
across various domains including news events, sports, and prediction markets.
|
29 |
+
<br><br>
|
30 |
+
📝 <a href="https://www.together.ai/blog/futurebench" target="_blank" style="color: #007acc; text-decoration: none;">Read our blog post</a> for more details about FutureBench.
|
31 |
+
</div>"""
|
32 |
+
|
33 |
+
# Additional information about the benchmark
|
34 |
+
ABOUT_TEXT = """
|
35 |
+
<div class="section-card fade-in-up">
|
36 |
+
<h2 class="section-header"><span class="section-icon">⚙️</span> How it works</h2>
|
37 |
+
|
38 |
+
FutureBench evaluates AI models on their ability to predict future events by:
|
39 |
+
|
40 |
+
- **Ingesting real-world events** from multiple sources (news, sports, prediction markets)
|
41 |
+
- **Collecting AI predictions** before events resolve
|
42 |
+
- **Measuring accuracy** once outcomes are known
|
43 |
+
- **Ranking models** based on their predictive performance
|
44 |
+
</div>
|
45 |
+
|
46 |
+
<div class="section-card fade-in-up stagger-1">
|
47 |
+
<h2 class="section-header"><span class="section-icon">📊</span> Event Types</h2>
|
48 |
+
|
49 |
+
- **News Events**: Predictions about political developments, economic changes, and current events
|
50 |
+
- **PolyMarket**: Predictions on various real-world events traded on prediction markets
|
51 |
+
</div>
|
52 |
+
|
53 |
+
<div class="section-card fade-in-up stagger-2">
|
54 |
+
<h2 class="section-header"><span class="section-icon">📈</span> Metrics</h2>
|
55 |
+
|
56 |
+
Models are evaluated using **accuracy** - the percentage of correct predictions made.
|
57 |
+
The **Average** score shows overall performance across all event types.
|
58 |
+
</div>
|
59 |
+
|
60 |
+
<div class="section-card fade-in-up stagger-3">
|
61 |
+
<h2 class="section-header"><span class="section-icon">🔒</span> Data Integrity</h2>
|
62 |
+
|
63 |
+
All predictions are made before events resolve, ensuring fair evaluation.
|
64 |
+
The leaderboard updates as new events are resolved and model performances are calculated.
|
65 |
+
</div>
|
66 |
+
"""
|
src/display_utils.py
ADDED
@@ -0,0 +1,566 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
|
3 |
+
from .about import Tasks
|
4 |
+
|
5 |
+
|
6 |
+
@dataclass(frozen=True)
class ColumnContent:
    """Metadata describing a single leaderboard column."""

    # column header text
    name: str
    # value type label, e.g. "str" or "number"
    type: str
    # whether the column is shown without user opt-in
    displayed_by_default: bool
    # hidden columns are kept in the data but not rendered
    hidden: bool = False
|
12 |
+
|
13 |
+
|
14 |
+
# Define leaderboard columns
|
15 |
+
# Define leaderboard columns
@dataclass(frozen=True)
class LeaderboardColumn:
    """Namespace of the fixed ColumnContent definitions for the leaderboard.

    NOTE(review): the attributes below carry no type annotations, so the
    @dataclass decorator generates no fields from them — they act as plain
    class-level constants. Confirm whether the decorator is intentional.
    """

    model = ColumnContent("Model", "str", True)
    events = ColumnContent("Events", "number", True)
    average = ColumnContent("Average", "number", True)
    # Task-specific columns will be added dynamically

    # Additional model info (hidden by default)
    correct_predictions = ColumnContent("Correct Predictions", "number", False)
|
24 |
+
|
25 |
+
|
26 |
+
# Get column names for display
|
27 |
+
def get_display_columns():
    """Return the ordered column headers shown by default: the fixed base
    columns followed by one column per task in `Tasks`."""
    columns = ["Rank", "Model", "Events", "Average"]
    for task in Tasks:
        columns.append(task.value.col_name)
    return columns
|
32 |
+
|
33 |
+
|
34 |
+
def get_all_columns():
    """Return every leaderboard column name — the visible display columns
    plus the hidden ones."""
    return get_display_columns() + ["Correct Predictions"]
|
39 |
+
|
40 |
+
|
41 |
+
# Formatting helpers
|
42 |
+
def make_clickable_model(model_name):
    """Turn a hub-style ``org/model`` id into an HTML link to its
    HuggingFace page; names without a slash are returned unchanged."""
    if "/" not in model_name:
        return model_name
    url = f"https://huggingface.co/{model_name}"
    return f'<a target="_blank" href="{url}" style="color: var(--link-text-color); text-decoration: underline;">{model_name}</a>'
|
48 |
+
|
49 |
+
|
50 |
+
def format_percentage(value):
    """Render an accuracy value as ``"XX.X%"``; ``"N/A"`` for missing or
    unparseable input."""
    if value is None or value == "N/A":
        return "N/A"
    try:
        numeric = float(value)
    except (ValueError, TypeError):
        return "N/A"
    return f"{numeric:.1f}%"
|
58 |
+
|
59 |
+
|
60 |
+
def has_valid_scores(df, required_columns):
    """Return a boolean Series, True for each row where every required
    column holds a non-null value."""
    subset = df[required_columns]
    return subset.notna().all(axis=1)
|
63 |
+
|
64 |
+
|
65 |
+
# CSS styling
|
66 |
+
CUSTOM_CSS = """
|
67 |
+
/* Global styling */
|
68 |
+
body {
|
69 |
+
background: linear-gradient(135deg, #1e1e2f 0%, #2d2d44 100%) !important;
|
70 |
+
}
|
71 |
+
|
72 |
+
/* Add consistent margins and centering */
|
73 |
+
.gradio-container,
|
74 |
+
.container,
|
75 |
+
.main {
|
76 |
+
margin: 0 auto !important;
|
77 |
+
max-width: 1400px !important;
|
78 |
+
padding: 0 60px !important;
|
79 |
+
}
|
80 |
+
|
81 |
+
.block {
|
82 |
+
margin: 0 auto !important;
|
83 |
+
max-width: 100% !important;
|
84 |
+
}
|
85 |
+
|
86 |
+
.markdown-text {
|
87 |
+
font-size: 18px !important;
|
88 |
+
line-height: 1.6 !important;
|
89 |
+
}
|
90 |
+
|
91 |
+
/* Larger font for introduction text */
|
92 |
+
.section-card {
|
93 |
+
font-size: 22px !important;
|
94 |
+
line-height: 1.7 !important;
|
95 |
+
}
|
96 |
+
|
97 |
+
.section-card p {
|
98 |
+
font-size: 22px !important;
|
99 |
+
line-height: 1.7 !important;
|
100 |
+
}
|
101 |
+
|
102 |
+
.section-card .markdown-text {
|
103 |
+
font-size: 22px !important;
|
104 |
+
line-height: 1.7 !important;
|
105 |
+
}
|
106 |
+
|
107 |
+
/* Header styling */
|
108 |
+
#space-title {
|
109 |
+
text-shadow: 2px 2px 4px rgba(0,0,0,0.3) !important;
|
110 |
+
margin-bottom: 0.5rem !important;
|
111 |
+
}
|
112 |
+
|
113 |
+
.center-logo {
|
114 |
+
display: flex !important;
|
115 |
+
justify-content: center !important;
|
116 |
+
align-items: center !important;
|
117 |
+
margin: 0.25rem 0 0.5rem 0 !important;
|
118 |
+
}
|
119 |
+
|
120 |
+
.center-logo img {
|
121 |
+
width: 200px !important;
|
122 |
+
height: 200px !important;
|
123 |
+
border-radius: 50% !important;
|
124 |
+
overflow: hidden !important;
|
125 |
+
object-fit: cover !important;
|
126 |
+
box-shadow: 0 8px 32px rgba(0,0,0,0.3) !important;
|
127 |
+
border: 3px solid rgba(255,255,255,0.1) !important;
|
128 |
+
}
|
129 |
+
|
130 |
+
/* Tab styling */
|
131 |
+
.tab-nav {
|
132 |
+
margin: 1rem 0 !important;
|
133 |
+
display: flex !important;
|
134 |
+
justify-content: center !important;
|
135 |
+
}
|
136 |
+
|
137 |
+
.tab-buttons {
|
138 |
+
display: flex !important;
|
139 |
+
justify-content: center !important;
|
140 |
+
flex-wrap: wrap !important;
|
141 |
+
gap: 8px !important;
|
142 |
+
}
|
143 |
+
|
144 |
+
.tab-buttons button {
|
145 |
+
font-size: 22px !important;
|
146 |
+
padding: 16px 32px !important;
|
147 |
+
margin: 0 6px !important;
|
148 |
+
border-radius: 8px !important;
|
149 |
+
border: 2px solid transparent !important;
|
150 |
+
background: rgba(255,255,255,0.1) !important;
|
151 |
+
color: white !important;
|
152 |
+
transition: all 0.3s ease !important;
|
153 |
+
}
|
154 |
+
|
155 |
+
.tab-buttons button:hover {
|
156 |
+
background: rgba(255,255,255,0.2) !important;
|
157 |
+
transform: translateY(-2px) !important;
|
158 |
+
}
|
159 |
+
|
160 |
+
.tab-buttons button.selected {
|
161 |
+
background: linear-gradient(135deg, #6366f1, #8b5cf6) !important;
|
162 |
+
border-color: #6366f1 !important;
|
163 |
+
box-shadow: 0 4px 12px rgba(99, 102, 241, 0.3) !important;
|
164 |
+
}
|
165 |
+
|
166 |
+
/* Leaderboard table styling */
|
167 |
+
#leaderboard-table {
|
168 |
+
margin: 20px 0 !important;
|
169 |
+
border-radius: 12px !important;
|
170 |
+
overflow: hidden !important;
|
171 |
+
box-shadow: 0 8px 32px rgba(0,0,0,0.2) !important;
|
172 |
+
}
|
173 |
+
|
174 |
+
#leaderboard-table table {
|
175 |
+
border-collapse: separate !important;
|
176 |
+
border-spacing: 0 !important;
|
177 |
+
width: 100% !important;
|
178 |
+
}
|
179 |
+
|
180 |
+
#leaderboard-table th {
|
181 |
+
background: linear-gradient(135deg, #4f46e5, #6366f1) !important;
|
182 |
+
color: white !important;
|
183 |
+
padding: 22px !important;
|
184 |
+
font-weight: 600 !important;
|
185 |
+
text-align: left !important;
|
186 |
+
border: none !important;
|
187 |
+
font-size: 16px !important;
|
188 |
+
}
|
189 |
+
|
190 |
+
#leaderboard-table td {
|
191 |
+
padding: 20px 22px !important;
|
192 |
+
border: none !important;
|
193 |
+
font-size: 16px !important;
|
194 |
+
}
|
195 |
+
|
196 |
+
#leaderboard-table tr:nth-child(even) {
|
197 |
+
background: rgba(255,255,255,0.05) !important;
|
198 |
+
}
|
199 |
+
|
200 |
+
#leaderboard-table tr:hover {
|
201 |
+
background: rgba(99, 102, 241, 0.1) !important;
|
202 |
+
transform: scale(1.01) !important;
|
203 |
+
transition: all 0.2s ease !important;
|
204 |
+
}
|
205 |
+
|
206 |
+
/* Rank column styling */
|
207 |
+
#leaderboard-table td:nth-child(1),
|
208 |
+
#leaderboard-table th:nth-child(1) {
|
209 |
+
text-align: center !important;
|
210 |
+
width: 80px !important;
|
211 |
+
min-width: 80px !important;
|
212 |
+
max-width: 80px !important;
|
213 |
+
font-size: 18px !important;
|
214 |
+
font-weight: 600 !important;
|
215 |
+
}
|
216 |
+
|
217 |
+
/* Model column styling */
|
218 |
+
#leaderboard-table td:nth-child(2),
|
219 |
+
#leaderboard-table th:nth-child(2) {
|
220 |
+
min-width: 180px !important;
|
221 |
+
max-width: 300px !important;
|
222 |
+
overflow: hidden !important;
|
223 |
+
white-space: nowrap !important;
|
224 |
+
text-overflow: ellipsis !important;
|
225 |
+
font-size: 16px !important;
|
226 |
+
}
|
227 |
+
|
228 |
+
/* Events column styling (numeric) */
|
229 |
+
#leaderboard-table td:nth-child(3),
|
230 |
+
#leaderboard-table th:nth-child(3) {
|
231 |
+
text-align: center !important;
|
232 |
+
width: 90px !important;
|
233 |
+
min-width: 90px !important;
|
234 |
+
max-width: 90px !important;
|
235 |
+
font-size: 16px !important;
|
236 |
+
font-weight: 600 !important;
|
237 |
+
}
|
238 |
+
|
239 |
+
/* Average column styling (percentage) */
|
240 |
+
#leaderboard-table td:nth-child(4),
|
241 |
+
#leaderboard-table th:nth-child(4) {
|
242 |
+
text-align: center !important;
|
243 |
+
width: 110px !important;
|
244 |
+
min-width: 110px !important;
|
245 |
+
max-width: 110px !important;
|
246 |
+
font-size: 17px !important;
|
247 |
+
font-weight: 700 !important;
|
248 |
+
color: #10b981 !important;
|
249 |
+
}
|
250 |
+
|
251 |
+
/* Task-specific columns (News, PolyMarket) - compact percentage columns */
|
252 |
+
#leaderboard-table td:nth-child(n+5),
|
253 |
+
#leaderboard-table th:nth-child(n+5) {
|
254 |
+
text-align: center !important;
|
255 |
+
width: 100px !important;
|
256 |
+
min-width: 100px !important;
|
257 |
+
max-width: 100px !important;
|
258 |
+
font-size: 16px !important;
|
259 |
+
font-weight: 600 !important;
|
260 |
+
}
|
261 |
+
|
262 |
+
/* Dropdown styling */
|
263 |
+
.dropdown {
|
264 |
+
margin: 20px 0 !important;
|
265 |
+
width: 100% !important;
|
266 |
+
}
|
267 |
+
|
268 |
+
.dropdown select {
|
269 |
+
background: rgba(255,255,255,0.1) !important;
|
270 |
+
border: 2px solid rgba(255,255,255,0.2) !important;
|
271 |
+
border-radius: 8px !important;
|
272 |
+
padding: 12px 18px !important;
|
273 |
+
color: white !important;
|
274 |
+
font-size: 16px !important;
|
275 |
+
width: 100% !important;
|
276 |
+
max-width: 300px !important;
|
277 |
+
}
|
278 |
+
|
279 |
+
/* Button styling */
|
280 |
+
#refresh-button, .refresh-btn {
|
281 |
+
background: linear-gradient(135deg, #10b981, #059669) !important;
|
282 |
+
color: white !important;
|
283 |
+
border: none !important;
|
284 |
+
padding: 14px 28px !important;
|
285 |
+
border-radius: 8px !important;
|
286 |
+
cursor: pointer !important;
|
287 |
+
font-size: 18px !important;
|
288 |
+
font-weight: 500 !important;
|
289 |
+
transition: all 0.3s ease !important;
|
290 |
+
box-shadow: 0 4px 12px rgba(16, 185, 129, 0.3) !important;
|
291 |
+
}
|
292 |
+
|
293 |
+
#refresh-button:hover, .refresh-btn:hover {
|
294 |
+
background: linear-gradient(135deg, #059669, #047857) !important;
|
295 |
+
transform: translateY(-2px) !important;
|
296 |
+
box-shadow: 0 6px 16px rgba(16, 185, 129, 0.4) !important;
|
297 |
+
}
|
298 |
+
|
299 |
+
/* Cards and sections */
|
300 |
+
.section-card {
|
301 |
+
background: rgba(255,255,255,0.05) !important;
|
302 |
+
border-radius: 12px !important;
|
303 |
+
padding: 25px !important;
|
304 |
+
margin: 15px 0 !important;
|
305 |
+
border: 1px solid rgba(255,255,255,0.1) !important;
|
306 |
+
box-shadow: 0 4px 16px rgba(0,0,0,0.1) !important;
|
307 |
+
max-width: 100% !important;
|
308 |
+
}
|
309 |
+
|
310 |
+
/* Metrics and stats */
|
311 |
+
.metric-highlight {
|
312 |
+
color: #10b981 !important;
|
313 |
+
font-weight: 600 !important;
|
314 |
+
}
|
315 |
+
|
316 |
+
.model-rank-1 {
|
317 |
+
background: linear-gradient(135deg, #fbbf24, #f59e0b) !important;
|
318 |
+
color: #1f2937 !important;
|
319 |
+
font-weight: 600 !important;
|
320 |
+
}
|
321 |
+
|
322 |
+
.model-rank-2 {
|
323 |
+
background: linear-gradient(135deg, #e5e7eb, #d1d5db) !important;
|
324 |
+
color: #1f2937 !important;
|
325 |
+
font-weight: 600 !important;
|
326 |
+
}
|
327 |
+
|
328 |
+
.model-rank-3 {
|
329 |
+
background: linear-gradient(135deg, #cd7c2f, #a16207) !important;
|
330 |
+
color: white !important;
|
331 |
+
font-weight: 600 !important;
|
332 |
+
}
|
333 |
+
|
334 |
+
/* Performance badges */
|
335 |
+
.rank-badge {
|
336 |
+
display: inline-block !important;
|
337 |
+
padding: 4px 8px !important;
|
338 |
+
border-radius: 20px !important;
|
339 |
+
font-size: 10px !important;
|
340 |
+
font-weight: 600 !important;
|
341 |
+
margin-right: 8px !important;
|
342 |
+
}
|
343 |
+
|
344 |
+
.rank-1 .rank-badge {
|
345 |
+
background: linear-gradient(135deg, #fbbf24, #f59e0b) !important;
|
346 |
+
color: #1f2937 !important;
|
347 |
+
}
|
348 |
+
|
349 |
+
.rank-2 .rank-badge {
|
350 |
+
background: linear-gradient(135deg, #e5e7eb, #d1d5db) !important;
|
351 |
+
color: #1f2937 !important;
|
352 |
+
}
|
353 |
+
|
354 |
+
.rank-3 .rank-badge {
|
355 |
+
background: linear-gradient(135deg, #cd7c2f, #a16207) !important;
|
356 |
+
color: white !important;
|
357 |
+
}
|
358 |
+
|
359 |
+
/* Progress bars for accuracy */
|
360 |
+
.accuracy-bar {
|
361 |
+
width: 100% !important;
|
362 |
+
height: 6px !important;
|
363 |
+
background: rgba(255,255,255,0.1) !important;
|
364 |
+
border-radius: 3px !important;
|
365 |
+
margin-top: 4px !important;
|
366 |
+
overflow: hidden !important;
|
367 |
+
}
|
368 |
+
|
369 |
+
.accuracy-progress {
|
370 |
+
height: 100% !important;
|
371 |
+
background: linear-gradient(90deg, #10b981, #059669) !important;
|
372 |
+
border-radius: 3px !important;
|
373 |
+
transition: width 0.8s ease !important;
|
374 |
+
}
|
375 |
+
|
376 |
+
/* Enhanced summary section */
|
377 |
+
.summary-stats {
|
378 |
+
display: grid !important;
|
379 |
+
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)) !important;
|
380 |
+
gap: 20px !important;
|
381 |
+
margin: 20px 0 !important;
|
382 |
+
}
|
383 |
+
|
384 |
+
.stat-card {
|
385 |
+
background: rgba(255,255,255,0.08) !important;
|
386 |
+
border-radius: 12px !important;
|
387 |
+
padding: 20px !important;
|
388 |
+
border: 1px solid rgba(255,255,255,0.1) !important;
|
389 |
+
text-align: center !important;
|
390 |
+
transition: transform 0.3s ease !important;
|
391 |
+
}
|
392 |
+
|
393 |
+
.stat-card:hover {
|
394 |
+
transform: translateY(-4px) !important;
|
395 |
+
}
|
396 |
+
|
397 |
+
.stat-value {
|
398 |
+
font-size: 1.875rem !important;
|
399 |
+
font-weight: 700 !important;
|
400 |
+
color: #10b981 !important;
|
401 |
+
margin-bottom: 8px !important;
|
402 |
+
}
|
403 |
+
|
404 |
+
.stat-label {
|
405 |
+
font-size: 0.775rem !important;
|
406 |
+
color: rgba(255,255,255,0.7) !important;
|
407 |
+
text-transform: uppercase !important;
|
408 |
+
letter-spacing: 0.5px !important;
|
409 |
+
}
|
410 |
+
|
411 |
+
/* Better section headers */
|
412 |
+
.section-header {
|
413 |
+
display: flex !important;
|
414 |
+
align-items: center !important;
|
415 |
+
gap: 12px !important;
|
416 |
+
margin: 0 0 15px 0 !important;
|
417 |
+
font-size: 1.675rem !important;
|
418 |
+
font-weight: 600 !important;
|
419 |
+
}
|
420 |
+
|
421 |
+
.section-icon {
|
422 |
+
font-size: 1.375rem !important;
|
423 |
+
}
|
424 |
+
|
425 |
+
/* Improved table styling */
|
426 |
+
#leaderboard-table tr:first-child td:first-child {
|
427 |
+
position: relative !important;
|
428 |
+
}
|
429 |
+
|
430 |
+
#leaderboard-table tr:nth-child(1) {
|
431 |
+
background: rgba(251, 191, 36, 0.1) !important;
|
432 |
+
}
|
433 |
+
|
434 |
+
#leaderboard-table tr:nth-child(2) {
|
435 |
+
background: rgba(229, 231, 235, 0.1) !important;
|
436 |
+
}
|
437 |
+
|
438 |
+
#leaderboard-table tr:nth-child(3) {
|
439 |
+
background: rgba(205, 124, 47, 0.1) !important;
|
440 |
+
}
|
441 |
+
|
442 |
+
/* Loading animations */
|
443 |
+
@keyframes fadeInUp {
|
444 |
+
from {
|
445 |
+
opacity: 0;
|
446 |
+
transform: translateY(20px);
|
447 |
+
}
|
448 |
+
to {
|
449 |
+
opacity: 1;
|
450 |
+
transform: translateY(0);
|
451 |
+
}
|
452 |
+
}
|
453 |
+
|
454 |
+
.fade-in-up {
|
455 |
+
animation: fadeInUp 0.6s ease-out !important;
|
456 |
+
}
|
457 |
+
|
458 |
+
/* Staggered animations */
|
459 |
+
.stagger-1 { animation-delay: 0.1s !important; }
|
460 |
+
.stagger-2 { animation-delay: 0.2s !important; }
|
461 |
+
.stagger-3 { animation-delay: 0.3s !important; }
|
462 |
+
.stagger-4 { animation-delay: 0.4s !important; }
|
463 |
+
|
464 |
+
/* Enhanced buttons */
|
465 |
+
.icon-button {
|
466 |
+
display: inline-flex !important;
|
467 |
+
align-items: center !important;
|
468 |
+
gap: 8px !important;
|
469 |
+
}
|
470 |
+
|
471 |
+
.icon-button::before {
|
472 |
+
font-size: 1.0em !important;
|
473 |
+
}
|
474 |
+
|
475 |
+
/* Improved markdown styling */
|
476 |
+
.markdown-text h1 {
|
477 |
+
color: #10b981 !important;
|
478 |
+
border-bottom: 2px solid rgba(16, 185, 129, 0.3) !important;
|
479 |
+
padding-bottom: 8px !important;
|
480 |
+
}
|
481 |
+
|
482 |
+
.markdown-text h2 {
|
483 |
+
color: #6366f1 !important;
|
484 |
+
margin-top: 2rem !important;
|
485 |
+
}
|
486 |
+
|
487 |
+
.markdown-text h3 {
|
488 |
+
color: #8b5cf6 !important;
|
489 |
+
}
|
490 |
+
|
491 |
+
.markdown-text ul {
|
492 |
+
padding-left: 20px !important;
|
493 |
+
}
|
494 |
+
|
495 |
+
.markdown-text li {
|
496 |
+
margin: 8px 0 !important;
|
497 |
+
list-style-type: none !important;
|
498 |
+
position: relative !important;
|
499 |
+
}
|
500 |
+
|
501 |
+
.markdown-text li::before {
|
502 |
+
content: "▸" !important;
|
503 |
+
color: #10b981 !important;
|
504 |
+
position: absolute !important;
|
505 |
+
left: -16px !important;
|
506 |
+
font-weight: bold !important;
|
507 |
+
}
|
508 |
+
|
509 |
+
/* Responsive design */
|
510 |
+
@media (max-width: 768px) {
|
511 |
+
/* Adjust container margins for mobile */
|
512 |
+
.gradio-container,
|
513 |
+
.container,
|
514 |
+
.main {
|
515 |
+
padding: 0 30px !important;
|
516 |
+
}
|
517 |
+
|
518 |
+
#space-title {
|
519 |
+
font-size: 2.375rem !important;
|
520 |
+
}
|
521 |
+
|
522 |
+
.center-logo img {
|
523 |
+
width: 150px !important;
|
524 |
+
height: 150px !important;
|
525 |
+
}
|
526 |
+
|
527 |
+
.tab-buttons button {
|
528 |
+
font-size: 18px !important;
|
529 |
+
padding: 14px 24px !important;
|
530 |
+
}
|
531 |
+
|
532 |
+
.summary-stats {
|
533 |
+
grid-template-columns: 1fr !important;
|
534 |
+
}
|
535 |
+
|
536 |
+
.stat-value {
|
537 |
+
font-size: 1.375rem !important;
|
538 |
+
}
|
539 |
+
|
540 |
+
/* Maintain readable font sizes on mobile */
|
541 |
+
#leaderboard-table th {
|
542 |
+
font-size: 14px !important;
|
543 |
+
padding: 16px 12px !important;
|
544 |
+
}
|
545 |
+
|
546 |
+
#leaderboard-table td {
|
547 |
+
font-size: 14px !important;
|
548 |
+
padding: 16px 12px !important;
|
549 |
+
}
|
550 |
+
|
551 |
+
/* Adjust column widths for mobile */
|
552 |
+
#leaderboard-table td:nth-child(1),
|
553 |
+
#leaderboard-table th:nth-child(1) {
|
554 |
+
width: 60px !important;
|
555 |
+
min-width: 60px !important;
|
556 |
+
max-width: 60px !important;
|
557 |
+
}
|
558 |
+
|
559 |
+
#leaderboard-table td:nth-child(n+5),
|
560 |
+
#leaderboard-table th:nth-child(n+5) {
|
561 |
+
width: 90px !important;
|
562 |
+
min-width: 90px !important;
|
563 |
+
max-width: 90px !important;
|
564 |
+
}
|
565 |
+
}
|
566 |
+
"""
|
src/leaderboard_utils.py
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import timedelta
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
|
5 |
+
from .about import Tasks
|
6 |
+
from .display_utils import format_percentage, make_clickable_model
|
7 |
+
|
8 |
+
|
9 |
+
def clean_model_name(model_name: str) -> str:
    """Map internal algorithm identifiers to human-friendly display names.

    Known machine prefixes are swapped for readable labels; any other
    name is returned unchanged.
    """
    # (internal prefix, display label) pairs, checked in order.
    prefix_labels = (
        ("smolagents-tavily-web-visit-", "Agent Baseline "),
        ("language-model-", "Language Model "),
    )
    for prefix, label in prefix_labels:
        if model_name.startswith(prefix):
            return label + model_name.removeprefix(prefix)
    return model_name
|
16 |
+
|
17 |
+
|
18 |
+
def get_available_weeks(predictions_df):
    """Return week-filter options derived from the prediction data.

    Each option is a ``(label, (monday, sunday))`` tuple covering one
    Monday-to-Sunday week that contains at least one betting deadline,
    sorted chronologically, preceded by an ``("All Time", None)`` entry.
    Returns an empty list when there is no data.
    """
    if predictions_df is None or predictions_df.empty:
        return []

    # Drop null timestamps (NaT) up front: NaT has no usable weekday(),
    # so the Monday computation below would raise on unresolved rows.
    dates = predictions_df["open_to_bet_until"].dropna().dt.date.unique()
    weeks = {}

    for date in dates:
        # Snap the date back to the Monday of its week.
        monday = date - timedelta(days=date.weekday())
        week_end = monday + timedelta(days=6)
        week_key = f"{monday} to {week_end}"
        weeks[week_key] = (monday, week_end)

    # Sort the week options chronologically by their start date.
    sorted_weeks = sorted(weeks.items(), key=lambda x: x[1][0])

    return [("All Time", None)] + sorted_weeks
|
39 |
+
|
40 |
+
|
41 |
+
def filter_data_by_week(predictions_df, week_range):
    """Restrict *predictions_df* to rows whose betting deadline falls in *week_range*.

    ``week_range`` is a ``(start_date, end_date)`` pair (inclusive on both
    ends). A ``None`` range, ``None`` frame, or empty frame is passed
    through unchanged.
    """
    if predictions_df is None or predictions_df.empty or week_range is None:
        return predictions_df

    start_date, end_date = week_range

    # Keep rows whose open_to_bet_until date lies inside the week window.
    deadline = predictions_df["open_to_bet_until"].dt.date
    in_week = (deadline >= start_date) & (deadline <= end_date)

    return predictions_df[in_week]
|
52 |
+
|
53 |
+
|
54 |
+
def create_leaderboard_df(predictions_df, week_filter=None):
    """Build the formatted leaderboard table from raw prediction records.

    Aggregates per-algorithm accuracy on resolved events (rows where
    ``result`` is non-null), one column per task, plus an ``Average``
    across the tasks the model actually attempted. The returned frame is
    sorted by average accuracy, given a medal/number ``Rank`` column, and
    has its percentage columns rendered as display strings.
    Returns an empty frame when there is no (filtered) data.
    """
    if predictions_df is None or predictions_df.empty:
        return pd.DataFrame()

    # Optionally narrow the data down to a single week.
    if week_filter is not None:
        predictions_df = filter_data_by_week(predictions_df, week_filter)

    if predictions_df.empty:
        return pd.DataFrame()

    rows = []

    # One leaderboard row per algorithm.
    for algorithm in predictions_df["algorithm_name"].unique():
        algo_data = predictions_df[predictions_df["algorithm_name"] == algorithm]

        # Only resolved events (non-null result) can be scored.
        resolved = algo_data[algo_data["result"].notna()]
        if resolved.empty:
            continue

        display_name = make_clickable_model(clean_model_name(algorithm))
        row = {"Model": display_name, "Events": len(resolved), "Correct Predictions": 0}
        per_task_accuracy = []

        for task in Tasks:
            task_rows = resolved[resolved["event_type"] == task.value.benchmark]

            if task_rows.empty:
                # Model made no predictions for this task.
                row[task.value.col_name] = None
                continue

            # Score by case-insensitive, whitespace-trimmed string match.
            # Could be enhanced for more complex prediction formats.
            matches = sum(
                str(pred).lower().strip() == str(actual).lower().strip()
                for pred, actual in zip(task_rows["actual_prediction"], task_rows["result"])
            )

            accuracy = (matches / len(task_rows)) * 100
            row[task.value.col_name] = accuracy
            per_task_accuracy.append(accuracy)
            row["Correct Predictions"] += matches

        # Average only over tasks where the model made predictions.
        row["Average"] = sum(per_task_accuracy) / len(per_task_accuracy) if per_task_accuracy else 0
        rows.append(row)

    df = pd.DataFrame(rows)

    # Best average accuracy first.
    if "Average" in df.columns:
        df = df.sort_values("Average", ascending=False)
    df = df.reset_index(drop=True)

    # Medals for the podium, plain numbers for everyone else.
    medals = {0: "🥇", 1: "🥈", 2: "🥉"}
    ranks = [medals.get(i, f"#{i + 1}") for i in range(len(df))]
    df.insert(0, "Rank", ranks)

    # Render accuracy columns as percentage strings for display.
    for task in Tasks:
        if task.value.col_name in df.columns:
            df[task.value.col_name] = df[task.value.col_name].apply(format_percentage)

    if "Average" in df.columns:
        df["Average"] = df["Average"].apply(format_percentage)

    return df
|
157 |
+
|
158 |
+
|
159 |
+
def get_leaderboard_summary(df):
    """Compute headline stats for a formatted leaderboard frame.

    Returns a dict with ``total_models``, ``total_predictions`` (sum of
    the ``Events`` column), and ``avg_accuracy`` (mean of the parseable
    ``Average`` percentage strings, ignoring "N/A" and malformed values).
    """
    if df is None or df.empty:
        return {"total_models": 0, "total_predictions": 0, "avg_accuracy": 0}

    summary = {
        "total_models": len(df),
        "total_predictions": df["Events"].sum() if "Events" in df.columns else 0,
        "avg_accuracy": 0,
    }

    if "Average" in df.columns:
        # Parse the display strings (e.g. "85.0%") back into floats.
        scores = []
        for value in df["Average"]:
            if value == "N/A":
                continue
            try:
                scores.append(float(value.replace("%", "")))
            except Exception:
                continue

        if scores:
            summary["avg_accuracy"] = sum(scores) / len(scores)

    return summary
|
183 |
+
|
184 |
+
|
185 |
+
def filter_leaderboard(df, min_predictions=0):
    """Drop models with fewer than *min_predictions* scored events.

    Frames without an ``Events`` column (and ``None``/empty frames) are
    returned unchanged.
    """
    if df is None or df.empty:
        return df
    if "Events" not in df.columns:
        return df
    return df[df["Events"] >= min_predictions]
|