import gradio as gr
import os
import json
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download
from datasets import load_dataset
import requests
import datetime

TOKEN = os.environ.get("HF_TOKEN")
OWNER = os.environ.get("OWNER")
RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
api = HfApi()
URL = os.environ.get("URL")


def load_data(source, refresh=False):
    # Core benchmarks ship with the Space; community results live in a HF dataset.
    if source == "core":
        with open("data.json", "r") as f:
            data = json.load(f)
    else:
        if refresh:
            ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
        else:
            ds = load_dataset(RESULTS_COMMUNITY)
        data = [entry for entry in ds["train"]]
    return data


def build_table(source, refresh=False):
    data = load_data(source, refresh)
    if source == "core":
        headers = ["Benchmark", "Category", "Pile-train Dirty (%)",
                   "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)",
                   "CC-2025-08 Dirty (%)"]
    else:
        headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)",
                   "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)",
                   "CC-2025-08 Dirty (%)"]
    # Render the data as an HTML table for gr.HTML. Assumption: each row is a
    # dict keyed by the header names above, e.g.
    # {"Benchmark": "MMLU", "Category": "...", "Pile-train Dirty (%)": 1.23, ...}.
    html = "<table><thead><tr>"
    for col in headers:
        html += f"<th>{col}</th>"
    html += "</tr></thead><tbody>"
    for row in data:
        html += "<tr>"
        for col in headers:
            val = row.get(col)
            if val is None:
                # Missing statistics (e.g., a dump not yet checked) show as N/A.
                html += "<td>N/A</td>"
            elif isinstance(val, float):
                # Show contamination percentages with two decimal places.
                val_display = f"{val:.2f}"
                html += f"<td>{val_display}</td>"
            else:
                html += f"<td>{val}</td>"
        html += "</tr>"
    html += "</tbody></table>"
    return html
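# `record_submission` is referenced by the submit button below, but its body is
# missing from this section. The following is a minimal sketch: the queue repo
# (SUBMISSIONS_COMMUNITY), the validation rules, and the file layout are
# assumptions, not the original implementation.
SUBMISSIONS_COMMUNITY = f"{OWNER}/benchmark_submissions"  # assumed queue repo


def record_submission(benchmark_name, contributor, jsonl_file,
                      hf_path, hf_split, field_name, hf_config):
    # Basic validation: a name, a contributor, and at least one data source.
    if not benchmark_name or not contributor:
        return "Please provide both a benchmark name and a contributor."
    if jsonl_file is None and not hf_path:
        return "Please upload a .jsonl file or provide a HuggingFace dataset path."

    # Bundle the form fields into a JSON record and queue it for processing.
    timestamp = datetime.datetime.utcnow().strftime("%Y%m%d_%H%M%S")
    record = {
        "benchmark_name": benchmark_name,
        "contributor": contributor,
        "hf_path": hf_path,
        "hf_split": hf_split,
        "hf_config": hf_config,
        "field_name": field_name,
        "timestamp": timestamp,
    }
    local_path = f"/tmp/submission_{timestamp}.json"
    with open(local_path, "w") as f:
        json.dump(record, f)
    api.upload_file(
        path_or_fileobj=local_path,
        path_in_repo=f"submissions/{timestamp}.json",
        repo_id=SUBMISSIONS_COMMUNITY,
        repo_type="dataset",
        token=TOKEN,
    )

    # If a file was uploaded, push it alongside the metadata. Depending on the
    # Gradio version, gr.File yields either a filepath string or a tempfile
    # object with a .name attribute.
    if jsonl_file is not None:
        jsonl_path = jsonl_file if isinstance(jsonl_file, str) else jsonl_file.name
        api.upload_file(
            path_or_fileobj=jsonl_path,
            path_in_repo=f"submissions/{timestamp}.jsonl",
            repo_id=SUBMISSIONS_COMMUNITY,
            repo_type="dataset",
            token=TOKEN,
        )
    return "Submission received! Results will appear under the 'community' source once processed."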
with gr.Blocks() as interface:
    gr.Markdown(
        '''
This system monitors potential contamination in benchmark datasets used for evaluating language models across various open-source corpora 🧐.
The system is released along with our paper *Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index*, which documents the methodology and findings in detail.
We welcome the community to submit new benchmarks for contamination analysis using the "Add New Benchmarks" tab.
'''
    )

    with gr.Tabs():
        with gr.Tab(label="Bulletin"):
            gr.Markdown("## Benchmark Contamination Bulletin")
            with gr.Accordion(label='Click to view instructions', open=False):
                gr.Markdown('''
The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.
- Benchmarks analyzed in our paper are under the **core** source. Community-submitted benchmarks appear under the **community** source.
- The contamination rate represents the percentage of *dirty* benchmark entries.
- The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
''')
            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core",
            )
            leaderboard_html = gr.HTML(build_table("core", refresh=False))

            def update_table(source):
                # Force a re-download of community results so new submissions show up.
                return build_table(source, refresh=True)

            # Switching sources reuses the cached data; Refresh forces a redownload.
            source_radio.change(
                fn=build_table,
                inputs=source_radio,
                outputs=leaderboard_html,
            )
            refresh_button = gr.Button("Refresh")
            refresh_button.click(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_html,
            )

        with gr.Tab(label="Add New Benchmarks"):
            gr.Markdown('''
## Add Your Own Benchmarks for Contamination Checking

You can use this form to submit a benchmark for contamination checking. Submissions may include either a direct upload or a reference to a publicly available dataset on Hugging Face.

### Submission Guidelines:
- **Benchmark Name**: Provide a name for your benchmark.
- **Contributor**: Enter your name or affiliation.
- **Data Source**:
    - Upload a `.jsonl` file containing your benchmark entries, or
    - Specify a Hugging Face dataset path (`author/benchmark-name`) along with the appropriate split (e.g., `test`, `validation`).
- **Field Name**: Indicate the field to analyze for contamination:
    - For question-answering datasets: use the question field.
    - For language understanding tasks: use the context or passage field.

### What Happens Next:
Once submitted, your benchmark will be queued for analysis. Results will be published in the **community** section of the bulletin. Processing time may vary depending on the dataset format and size. You can check the results by navigating to the **Bulletin** tab and selecting the **community** source, then clicking **Refresh**.
''')
            with gr.Row():
                benchmark_name_input = gr.Textbox(label="Benchmark Name")
                contributor_input = gr.Textbox(label="Contributor")
            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                with gr.Column():
                    hf_path_input = gr.Textbox(
                        label="HuggingFace Dataset Path",
                        placeholder="e.g., author/benchmark-name",
                    )
                    hf_split_input = gr.Textbox(
                        label="Dataset Split (only if providing a HuggingFace dataset)",
                        placeholder="e.g., validation, test",
                    )
                    hf_config_input = gr.Textbox(
                        label="Dataset Config (optional)",
                        placeholder="name of dataset config",
                    )
            field_name_input = gr.Textbox(
                label="Context or Question Field Name",
                placeholder="e.g., context, question, ...",
            )
            with gr.Row():
                gr.LoginButton()
                submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status", interactive=False)

            submit_button.click(
                fn=record_submission,
                inputs=[benchmark_name_input, contributor_input, jsonl_input,
                        hf_path_input, hf_split_input, field_name_input,
                        hf_config_input],
                outputs=result_output,
            )

interface.launch()
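# A hypothetical example of the upload format the submission form expects: each
# line of the .jsonl file is one JSON object, and "Context or Question Field
# Name" names the key to check for contamination (here, "question"):
#
#   {"question": "What is the capital of France?", "answer": "Paris"}
#   {"question": "Who wrote Hamlet?", "answer": "Shakespeare"}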