import gradio as gr
import os
import json
from huggingface_hub import HfApi
from datasets import load_dataset
import requests
import datetime

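# Configuration comes from environment variables: an HF access token, the org
# that owns the community results dataset, and the URL of the processing
# backend that runs the contamination checks.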
TOKEN = os.environ.get("HF_TOKEN")
OWNER = os.environ.get("OWNER")
RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
api = HfApi()

URL = os.environ.get("URL")


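# Load benchmark entries: "core" results are bundled with the Space in
# data.json, while "community" results live in the shared HF dataset.
# refresh=True forces a re-download so newly processed submissions show up.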
def load_data(source, refresh=False):
    if source == "core":
        with open("data.json", "r") as f:
            data = json.load(f)
    else:
        if refresh:
            ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
        else:
            ds = load_dataset(RESULTS_COMMUNITY)
        data = list(ds['train'])
    return data


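# Render the bulletin as a sortable HTML table. Core rows carry a Category
# column; community rows carry a Contributor column.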
def build_table(source, refresh=False):
    data = load_data(source, refresh)

    if source == "core":
        headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]
    else:
        headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]

    html = """
    <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
    <thead>
    <tr>
    """
    for col in headers:
        html += f'''
        <th onclick="sortTable(this)" style="cursor: pointer; border: 1px solid #ddd; padding: 8px; text-align: right;">
            {col}
            <span class="tri-container">
                <span class="triangle-up"></span>
                <span class="triangle-down"></span>
            </span>
        </th>
        '''
    html += "</tr></thead><tbody>"

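    # One table row per benchmark entry; numeric cells are shown to one
    # decimal place and missing values (default -1) render as N/A.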
    for entry in data:
        name = entry.get("Benchmark", "")
        url = entry.get("URL", "#")
        hyperlink = f'<a href="{url}" target="_blank">{name}</a>' if url else name

        row = {
            "Benchmark": hyperlink,
            "Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
            "DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1),
            "CC-2025-05 Dirty (%)": entry.get("CC202505 Dirty", -1),
            "CC-2025-08 Dirty (%)": entry.get("CC202508 Dirty", -1)
        }

        if source == "core":
            row["Category"] = entry.get("Category", "")
        elif source == "community":
            row["Contributor"] = entry.get("Contributor", "")

        html += "<tr>"
        for col in headers:
            val = row.get(col, "")
            if isinstance(val, (int, float)) and val >= 0:
                val_display = f"{val:5.1f}"
                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val_display}</td>'
            elif isinstance(val, (int, float)):
                html += '<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">N/A</td>'
            else:
                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val}</td>'
        html += "</tr>\n"

    html += "</tbody></table>"

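    # Client-side sorting: clicking a column header toggles the sort
    # direction for that column and highlights the matching arrow indicator.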
    html += """
    <script>
        let sortDirection = {};

        function sortTable(header) {
            const table = document.getElementById("benchmarkTable");
            const rows = Array.from(table.tBodies[0].rows);
            const columnIndex = Array.from(header.parentNode.children).indexOf(header);
            const isAscending = sortDirection[columnIndex] === 'ascending';
            sortDirection[columnIndex] = isAscending ? 'descending' : 'ascending';

            Array.from(header.parentNode.children).forEach(th => {
                const up = th.querySelector('.triangle-up');
                const down = th.querySelector('.triangle-down');
                if (up) up.classList.remove('active');
                if (down) down.classList.remove('active');
            });

            if (sortDirection[columnIndex] === 'ascending') {
                header.querySelector('.triangle-up').classList.add('active');
            } else {
                header.querySelector('.triangle-down').classList.add('active');
            }

            rows.sort((rowA, rowB) => {
                const cellA = rowA.cells[columnIndex].innerText;
                const cellB = rowB.cells[columnIndex].innerText;
                if (isNaN(cellA)) {
                    return isAscending ? cellA.localeCompare(cellB) : cellB.localeCompare(cellA);
                }
                return isAscending ? parseFloat(cellA) - parseFloat(cellB) : parseFloat(cellB) - parseFloat(cellA);
            });

            rows.forEach(row => table.tBodies[0].appendChild(row));
        }
    </script>
    """

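    # Styling for the header row and the up/down sort arrows.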
    html += """
    <style>
        thead tr {
            background-color: #f0f0f0;
        }
        .tri-container {
            display: inline-block;
            margin-left: 4px;
            vertical-align: middle;
        }
        .triangle-up, .triangle-down {
            display: block;
            width: 0;
            height: 0;
            margin: 1px auto;
            border-left: 5px solid transparent;
            border-right: 5px solid transparent;
        }
        .triangle-up {
            border-bottom: 5px solid #999;
        }
        .triangle-down {
            border-top: 5px solid #999;
        }
        .triangle-up.active {
            border-bottom: 5px solid #000;
        }
        .triangle-down.active {
            border-top: 5px solid #000;
        }
    </style>
    """

    return html


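# Handle a submission from the "Add New Benchmarks" tab: reject very new
# accounts, validate the inputs (either an uploaded .jsonl file or a public
# HF dataset), then forward the request to the processing backend.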
def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split, field_name, hf_config, profile: gr.OAuthProfile):
    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
    creation_date = user_data.json()["createdAt"]
    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
        return "❌ This account is not authorized to submit (it must be at least 10 days old)."
    
    if not benchmark_name or not benchmark_name.strip():
        return "❌ Please provide a benchmark name."
    
    if not field_name or not field_name.strip():
        return "❌ Please provide a field name."
    
    has_jsonl = jsonl_file is not None
    has_hf = hf_path and hf_path.strip()
    
    if not has_jsonl and not has_hf:
        return "❌ Please provide either a .jsonl file or a HuggingFace dataset path."
    
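    # Spot-check the first five lines of an uploaded file: each must be valid
    # JSON and contain the requested field.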
    if has_jsonl:
        try:
            with open(jsonl_file.name, 'r', encoding='utf-8') as f:
                line_count = 0
                for line in f:
                    line_count += 1
                    if line_count > 5:
                        break
                    
                    try:
                        entry = json.loads(line.strip())
                        if field_name.strip() not in entry:
                            available_fields = list(entry.keys())
                            return f"❌ Field '{field_name.strip()}' not found in JSONL file. Available fields: {', '.join(available_fields)}"
                    except json.JSONDecodeError as e:
                        return f"❌ Invalid JSON format in line {line_count}: {str(e)}"
                
                if line_count == 0:
                    return "❌ The uploaded file is empty."
                    
        except Exception as e:
            return f"❌ Error reading file: {str(e)}"
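    # For HF datasets, stream the first example and verify the requested
    # field exists before accepting the submission.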
    elif has_hf:
        if not hf_split or not hf_split.strip():
            return "❌ Please provide a dataset split for the HuggingFace dataset."
        
        try:
            if hf_config:
                dataset_info = load_dataset(hf_path.strip(), hf_config.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
            else:
                dataset_info = load_dataset(hf_path.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
            first_item = next(iter(dataset_info))
            if field_name.strip() not in first_item:
                available_fields = list(first_item.keys())
                return f"❌ Field '{field_name.strip()}' not found in dataset. Available fields: {', '.join(available_fields)}"
        except Exception as e:
            return f"❌ Could not access HuggingFace dataset: {str(e)}"
    
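    # Send the validated submission (metadata plus the optional file) to the
    # backend, which queues the contamination check.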
    try:
        data = {
            'name': benchmark_name.strip(),
            'contributor': contributor.strip(),
            'type': 'jsonl' if has_jsonl else 'hf',
            'split': hf_split.strip() if has_hf else '',
            'field_name': field_name.strip(),
            'hf_path': hf_path.strip() if has_hf else '',
            'hf_config': hf_config.strip() if has_hf else ''
        }
        print(json.dumps(data))
        files = {}
        if has_jsonl:
            files['file'] = (benchmark_name.strip() + '.jsonl', open(jsonl_file.name, 'rb'), 'application/json')
        
        response = requests.post(f"{URL}/", data={"payload": json.dumps(data)}, files=files, timeout=30)
        
        if files:
            files['file'][1].close()
        
        if response.status_code == 200:
            result = response.json()
            if result.get("status") == "success":
                return result.get('message', 'Submission successful!')
            elif result.get("status") == "info":
                return f"❌ {result.get('message', 'Submission already exists')}"
            else:
                return f"❌ {result.get('message', 'Unknown error occurred')}"
        else:
            return f"❌ Server error: {response.status_code} - {response.text}"
            
    except Exception as e:
        return f"❌ Error submitting benchmark: {str(e)}"


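# UI: a "Bulletin" tab that shows the contamination table and an
# "Add New Benchmarks" tab for community submissions.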
with gr.Blocks() as interface:
    gr.HTML(
            '''<h1 style="text-align: center;">📖 Benchmark Contamination Monitoring System</h1>

            <p style='font-size: 16px;'>This system monitors potential contamination in benchmark datasets used for evaluating language models across various open-source corpora 🧐.</p>
            <p style='font-size: 16px;'>The system is released along with our paper <a href="https://arxiv.org/abs/2506.12229">Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index</a>, which documents the methodology and findings in detail.</p>
            <p style='font-size: 16px;'>We welcome the community to submit new benchmarks for contamination analysis using the <b>"Add New Benchmarks"</b> tab.</p>
            '''
        )

    with gr.Tabs():
        with gr.Tab(label="Bulletin"):
            gr.Markdown("## Benchmark Contamination Bulletin")
            with gr.Accordion(label='Click to view instructions', open=False):
                gr.Markdown('''
                The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.

                - Benchmarks analyzed in our paper are under the **core** source. Community-submitted benchmarks appear under the **community** source.
                - The contamination rate represents the percentage of *dirty* benchmark entries.
                - The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
                ''')

            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core"
            )

            leaderboard_html = gr.HTML(build_table("core", refresh=False))

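            # Switching the source re-renders from cached data; the Refresh
            # button below forces a fresh download of community results.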
            def update_table(source):
                return build_table(source, refresh=True)

            source_radio.change(
                fn=build_table,
                inputs=source_radio,
                outputs=leaderboard_html
            )

            refresh_button = gr.Button("Refresh")
            refresh_button.click(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_html
            )

        with gr.Tab(label="Add New Benchmarks"):
            gr.Markdown('''
            ## Add Your Own Benchmarks for Contamination Checking

            You can use this form to submit a benchmark for contamination checking. Submissions may include either a direct upload or a reference to a publicly available dataset on Hugging Face.

            ### Submission Guidelines:
            - **Benchmark Name**: Provide a name for your benchmark.
            - **Contributor**: Enter your name or affiliation.
            - **Data Source**:
                - Upload a `.jsonl` file containing your benchmark entries (one JSON object per line; see the example below), or
                - Specify a Hugging Face dataset path (`author/benchmark-name`) along with the appropriate split (e.g., `test`, `validation`).
            - **Field Name**: Indicate the field to analyze for contamination:
                - For question-answering datasets: use the question field.
                - For language understanding tasks: use the context or passage field.

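            For example, a single line of an uploaded `.jsonl` file might look like the following (the `question` and `answer` field names are only illustrative; use whatever fields your benchmark actually contains):

            ```
            {"question": "Which element has the atomic number 6?", "answer": "Carbon"}
            ```
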
            ### What Happens Next:
            Once submitted, your benchmark will be queued for analysis. Results will be published in the **community** section of the bulletin.
            
            Processing time may vary depending on the dataset format and size. You can check the results by navigating to the **Bulletin** tab and selecting the **community** source, then clicking **Refresh**.
            ''')


            with gr.Row():
                benchmark_name_input = gr.Textbox(label="Benchmark Name")
                contributor_input = gr.Textbox(label="Contributor")

            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                with gr.Column():
                    hf_path_input = gr.Textbox(label="HuggingFace Dataset Path", placeholder="e.g., author/benchmark-name")
                    hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
                    hf_config_input = gr.Textbox(label="Dataset Config (optional)", placeholder="name of dataset config")
            field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")

            with gr.Row():
                gr.LoginButton()
                submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status", interactive=False)

            submit_button.click(
                fn=record_submission,
                inputs=[benchmark_name_input, contributor_input, jsonl_input, hf_path_input, hf_split_input, field_name_input, hf_config_input],
                outputs=result_output,
            )

interface.launch()