"""Gradio Space for the Benchmark Contamination Monitoring System.

Shows a contamination bulletin (core + community benchmarks) and accepts
new benchmark submissions, which are forwarded to a processing backend.
"""

import datetime
import json
import os

import gradio as gr
import pandas as pd
import requests
from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_download

# Configuration comes from the environment so the Space carries no secrets.
TOKEN = os.environ.get("HF_TOKEN")
OWNER = os.environ.get("OWNER")
# Hub dataset holding results for community-submitted benchmarks.
RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
api = HfApi()
# Backend endpoint that receives new benchmark submissions.
URL = os.environ.get("URL")


def load_data(source, refresh=False):
    """Return contamination entries for the requested source.

    Args:
        source: "core" loads the bundled ``data.json``; any other value loads
            the community results dataset from the Hugging Face Hub.
        refresh: when True, force a re-download of the community dataset so
            freshly processed submissions become visible.

    Returns:
        list[dict]: one dict per benchmark.
    """
    if source == "core":
        with open("data.json", "r") as f:
            data = json.load(f)
    else:
        if refresh:
            ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
        else:
            ds = load_dataset(RESULTS_COMMUNITY)
        data = list(ds["train"])
    return data


def build_table(source, refresh=False):
    """Render the contamination bulletin for ``source`` as an HTML string.

    Float values < 0 act as a "not measured" sentinel and render as "N/A".
    """
    data = load_data(source, refresh)
    if source == "core":
        headers = [
            "Benchmark", "Category", "Pile-train Dirty (%)",
            "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)",
            "CC-2025-08 Dirty (%)",
        ]
    else:
        headers = [
            "Benchmark", "Contributor", "Pile-train Dirty (%)",
            "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)",
            "CC-2025-08 Dirty (%)",
        ]

    # NOTE(review): the original table markup was lost in extraction; this
    # rebuilds a plain HTML table with the same cell contents — confirm
    # styling/CSS classes against the deployed app.
    html = "<table><thead><tr>"
    for col in headers:
        html += f"<th>{col}</th>"
    html += "</tr></thead><tbody>"

    for entry in data:
        name = entry.get("Benchmark", "")
        url = entry.get("URL", "#")
        # Link the benchmark name when a URL is available.
        hyperlink = f'<a href="{url}" target="_blank">{name}</a>' if url else name
        row = {
            "Benchmark": hyperlink,
            "Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
            "DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1),
            "CC-2025-05 Dirty (%)": entry.get("CC202505 Dirty", -1),
            "CC-2025-08 Dirty (%)": entry.get("CC202508 Dirty", -1),
        }
        if source == "core":
            row["Category"] = entry.get("Category", "")
        elif source == "community":
            row["Contributor"] = entry.get("Contributor", "")

        html += "<tr>"
        for col in headers:
            val = row.get(col, "")
            if isinstance(val, float) and val >= 0:
                html += f"<td>{val:5.1f}</td>"
            elif isinstance(val, float):
                # Negative sentinel: contamination not measured for this dump.
                html += "<td>N/A</td>"
            else:
                html += f"<td>{val}</td>"
        html += "</tr>\n"

    html += "</tbody></table>"
    return html


def _validate_jsonl(jsonl_file, field):
    """Spot-check the first lines of an uploaded .jsonl file.

    Returns a user-facing error string, or None when the file looks valid.
    """
    try:
        with open(jsonl_file.name, "r", encoding="utf-8") as f:
            line_count = 0
            for line in f:
                line_count += 1
                if line_count > 5:
                    # Only sample the head of the file; full processing
                    # happens server-side.
                    break
                try:
                    entry = json.loads(line.strip())
                    if field not in entry:
                        available_fields = list(entry.keys())
                        return (f"❌ Field '{field}' not found in JSONL file. "
                                f"Available fields: {', '.join(available_fields)}")
                except json.JSONDecodeError as e:
                    return f"❌ Invalid JSON format in line {line_count}: {str(e)}"
            if line_count == 0:
                return "❌ The uploaded file is empty."
    except Exception as e:
        return f"❌ Error reading file: {str(e)}"
    return None


def _validate_hf_dataset(hf_path, hf_split, hf_config, field):
    """Stream the first item of a Hub dataset and verify ``field`` exists.

    Returns a user-facing error string, or None when the dataset is usable.
    """
    try:
        if hf_config:
            dataset_info = load_dataset(
                hf_path.strip(), hf_config.strip(), split=hf_split.strip(),
                streaming=True, trust_remote_code=True,
            )
        else:
            dataset_info = load_dataset(
                hf_path.strip(), split=hf_split.strip(),
                streaming=True, trust_remote_code=True,
            )
        first_item = next(iter(dataset_info))
        if field not in first_item:
            available_fields = list(first_item.keys())
            return (f"❌ Field '{field}' not found in dataset. "
                    f"Available fields: {', '.join(available_fields)}")
    except Exception as e:
        return f"❌ Could not access HuggingFace dataset: {str(e)}"
    return None


def record_submission(benchmark_name, contributor, jsonl_file, hf_path,
                      hf_split, field_name, hf_config,
                      profile: gr.OAuthProfile):
    """Validate a benchmark submission and forward it to the backend.

    Either a .jsonl upload or a (path, split) pair for a Hub dataset must be
    provided. Returns a user-facing status string ("❌ ..." on failure).
    """
    # Reject accounts younger than 10 days to deter throwaway-account spam.
    user_data = requests.get(
        f"https://huggingface.co/api/users/{profile.username}/overview"
    )
    creation_date = json.loads(user_data.content)["createdAt"]
    account_age = datetime.datetime.now() - datetime.datetime.strptime(
        creation_date, "%Y-%m-%dT%H:%M:%S.%fZ"
    )
    if account_age < datetime.timedelta(days=10):
        # FIX: the original called an undefined `format_error` (NameError at
        # runtime); return a plain status string like every other error path.
        return "❌ This account is not authorized to submit."

    if not benchmark_name or not benchmark_name.strip():
        return "❌ Please provide a benchmark name."
    if not field_name or not field_name.strip():
        return "❌ Please provide a field name."

    has_jsonl = jsonl_file is not None
    has_hf = hf_path and hf_path.strip()
    if not has_jsonl and not has_hf:
        return "❌ Please provide either a .jsonl file or a HuggingFace dataset path."

    if has_jsonl:
        error = _validate_jsonl(jsonl_file, field_name.strip())
        if error:
            return error
    elif has_hf:
        if not hf_split or not hf_split.strip():
            return "❌ Please provide a dataset split for the HuggingFace dataset."
        error = _validate_hf_dataset(hf_path, hf_split, hf_config,
                                     field_name.strip())
        if error:
            return error

    try:
        data = {
            'name': benchmark_name.strip(),
            'contributor': contributor.strip(),
            'type': 'jsonl' if has_jsonl else 'hf',
            'split': hf_split.strip() if has_hf else '',
            'field_name': field_name.strip(),
            'hf_path': hf_path.strip() if has_hf else '',
            'hf_config': hf_config.strip() if has_hf else '',
        }
        print(json.dumps(data))

        files = {}
        file_handle = None
        try:
            if has_jsonl:
                file_handle = open(jsonl_file.name, 'rb')
                files['file'] = (benchmark_name.strip() + '.jsonl',
                                 file_handle, 'application/json')
            response = requests.post(
                f"{URL}/",
                data={"payload": json.dumps(data)},
                files=files,
                timeout=30,
            )
        finally:
            # FIX: close the upload handle even when the POST raises
            # (the original leaked it on error).
            if file_handle is not None:
                file_handle.close()

        if response.status_code == 200:
            result = response.json()
            if result.get("status") == "success":
                return result.get('message', 'Submission successful!')
            elif result.get("status") == "info":
                return f"❌ {result.get('message', 'Submission already exists')}"
            else:
                return f"❌ {result.get('message', 'Unknown error occurred')}"
        else:
            return f"❌ Server error: {response.status_code} - {response.text}"
    except Exception as e:
        return f"❌ Error submitting benchmark: {str(e)}"


with gr.Blocks() as interface:
    # NOTE(review): the original inline HTML was stripped by extraction; this
    # reconstructs equivalent structure — confirm styling against the live app.
    gr.HTML(
        '''
        <div>
          <h2>📖 Benchmark Contamination Monitoring System</h2>
          <p>This system monitors potential contamination in benchmark datasets
          used for evaluating language models across various open-source
          corpora 🧐.</p>
          <p>The system is released along with our paper <b>Infini-gram mini:
          Exact n-gram Search at the Internet Scale with FM-Index</b>, which
          documents the methodology and findings in detail.</p>
          <p>We welcome the community to submit new benchmarks for
          contamination analysis using the "Add New Benchmarks" tab.</p>
        </div>
        '''
    )
    with gr.Tabs():
        with gr.Tab(label="Bulletin"):
            gr.Markdown("## Benchmark Contamination Bulletin")
            with gr.Accordion(label='Click to view instructions', open=False):
                gr.Markdown('''
                The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.
                - Benchmarks analyzed in our paper are under the **core** source. Community-submitted benchmarks appear under the **community** source.
                - The contamination rate represents the percentage of *dirty* benchmark entries.
                - The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
                ''')
            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core",
            )
            leaderboard_html = gr.HTML(build_table("core", refresh=False))

            def update_table(source):
                # Force a fresh download so newly processed submissions show up.
                return build_table(source, refresh=True)

            # Switching source uses the cached dataset (refresh=False default);
            # the Refresh button forces a re-download via update_table.
            source_radio.change(
                fn=build_table,
                inputs=source_radio,
                outputs=leaderboard_html,
            )
            refresh_button = gr.Button("Refresh")
            refresh_button.click(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_html,
            )

        with gr.Tab(label="Add New Benchmarks"):
            gr.Markdown('''
            ## Add Your Own Benchmarks for Contamination Checking

            You can use this form to submit a benchmark for contamination checking. Submissions may include either a direct upload or a reference to a publicly available dataset on Hugging Face.

            ### Submission Guidelines:
            - **Benchmark Name**: Provide a name for your benchmark.
            - **Contributor**: Enter your name or affiliation.
            - **Data Source**:
                - Upload a `.jsonl` file containing your benchmark entries, or
                - Specify a Hugging Face dataset path (`author/benchmark-name`) along with the appropriate split (e.g., `test`, `validation`).
            - **Field Name**: Indicate the field to analyze for contamination:
                - For question-answering datasets: use the question field.
                - For language understanding tasks: use the context or passage field.

            ### What Happens Next:
            Once submitted, your benchmark will be queued for analysis. Results will be published in the **community** section of the bulletin. Processing time may vary depending on the dataset format and size. You can check the results by navigating to the **Bulletin** tab and selecting the **community** source, then clicking **Refresh**.
            ''')
            with gr.Row():
                benchmark_name_input = gr.Textbox(label="Benchmark Name")
                contributor_input = gr.Textbox(label="Contributor")
            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File",
                                      file_types=[".jsonl"])
                with gr.Column():
                    hf_path_input = gr.Textbox(
                        label="HuggingFace Dataset Path",
                        placeholder="e.g., author/benchmark-name",
                    )
                    hf_split_input = gr.Textbox(
                        label="Dataset split (only if providing HuggingFace Dataset)",
                        placeholder="e.g., validation, test",
                    )
                    hf_config_input = gr.Textbox(
                        label="Dataset Config (optional)",
                        placeholder="name of dataset config",
                    )
            field_name_input = gr.Textbox(
                label="Context or Question Field Name",
                placeholder="e.g., context, question, ...",
            )
            with gr.Row():
                gr.LoginButton()
            submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status",
                                       interactive=False)
            submit_button.click(
                fn=record_submission,
                inputs=[benchmark_name_input, contributor_input, jsonl_input,
                        hf_path_input, hf_split_input, field_name_input,
                        hf_config_input],
                outputs=result_output,
            )

interface.launch()