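"""Gradio app for the Benchmark Contamination Monitoring System.

Shows contamination statistics for core and community benchmarks and lets users
submit new benchmarks for contamination checking.
"""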
import gradio as gr
import os
import json
import pandas as pd
from huggingface_hub import HfApi, hf_hub_download
from datasets import load_dataset
import requests
import datetime

TOKEN = os.environ.get("HF_TOKEN")
OWNER = os.environ.get("OWNER")
RESULTS_COMMUNITY = f"{OWNER}/benchmark_results"
api = HfApi()

URL = os.environ.get("URL")


def load_data(source, refresh=False):
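    """Load benchmark entries either from the bundled core data.json file or
    from the community results dataset on the Hugging Face Hub."""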
    if source == "core":
        with open("data.json", "r") as f:
            data = json.load(f)
    else:
        if refresh:
            ds = load_dataset(RESULTS_COMMUNITY, download_mode="force_redownload")
        else:
            ds = load_dataset(RESULTS_COMMUNITY)
        data = []
        for entry in ds['train']:
            data.append(entry)
    return data


def build_table(source, refresh=False):
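    """Render the contamination bulletin as a sortable HTML table for the given
    source ("core" or "community")."""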
    data = load_data(source, refresh)

    if source == "core":
        headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]
    else:
        headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]

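    # Build the table header; each column header is clickable and triggers the client-side sort.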
html = """ |
|
<table id="benchmarkTable" style="border-collapse: collapse; width: 100%;"> |
|
<thead> |
|
<tr> |
|
""" |
|
for col in headers: |
|
html += f''' |
|
<th onclick="sortTable(this)" style="cursor: pointer; border: 1px solid #ddd; padding: 8px; text-align: right;"> |
|
{col} |
|
<span class="tri-container"> |
|
<span class="triangle-up"></span> |
|
<span class="triangle-down"></span> |
|
</span> |
|
</th> |
|
''' |
|
html += "</tr></thead><tbody>" |
|
|
|
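    # One row per benchmark entry; negative values (the default for missing scores) render as N/A.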
    for entry in data:
        name = entry.get("Benchmark", "")
        url = entry.get("URL", "#")
        hyperlink = f'<a href="{url}" target="_blank">{name}</a>' if url else name

        row = {
            "Benchmark": hyperlink,
            "Pile-train Dirty (%)": entry.get("Pile Dirty", -1),
            "DCLM-baseline Dirty (%)": entry.get("DCLM Dirty", -1),
            "CC-2025-05 Dirty (%)": entry.get("CC202505 Dirty", -1),
            "CC-2025-08 Dirty (%)": entry.get("CC202508 Dirty", -1)
        }

        if source == "core":
            row["Category"] = entry.get("Category", "")
        elif source == "community":
            row["Contributor"] = entry.get("Contributor", "")

        html += "<tr>"
        for col in headers:
            val = row.get(col, "")
            if isinstance(val, float) and val >= 0:
                val_display = f"{val:5.1f}"
                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val_display}</td>'
            elif isinstance(val, float):
                html += '<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">N/A</td>'
            else:
                html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val}</td>'
        html += "</tr>\n"

    html += "</tbody></table>"

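    # Client-side script that sorts rows when a column header is clicked.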
html += """ |
|
<script> |
|
let sortDirection = {}; |
|
|
|
function sortTable(header) { |
|
const table = document.getElementById("benchmarkTable"); |
|
const rows = Array.from(table.tBodies[0].rows); |
|
const columnIndex = Array.from(header.parentNode.children).indexOf(header); |
|
const isAscending = sortDirection[columnIndex] === 'ascending'; |
|
sortDirection[columnIndex] = isAscending ? 'descending' : 'ascending'; |
|
|
|
Array.from(header.parentNode.children).forEach(th => { |
|
const up = th.querySelector('.triangle-up'); |
|
const down = th.querySelector('.triangle-down'); |
|
if (up) up.classList.remove('active'); |
|
if (down) down.classList.remove('active'); |
|
}); |
|
|
|
if (sortDirection[columnIndex] === 'ascending') { |
|
header.querySelector('.triangle-up').classList.add('active'); |
|
} else { |
|
header.querySelector('.triangle-down').classList.add('active'); |
|
} |
|
|
|
rows.sort((rowA, rowB) => { |
|
const cellA = rowA.cells[columnIndex].innerText; |
|
const cellB = rowB.cells[columnIndex].innerText; |
|
if (isNaN(cellA)) { |
|
return isAscending ? cellA.localeCompare(cellB) : cellB.localeCompare(cellA); |
|
} |
|
return isAscending ? parseFloat(cellA) - parseFloat(cellB) : parseFloat(cellB) - parseFloat(cellA); |
|
}); |
|
|
|
rows.forEach(row => table.tBodies[0].appendChild(row)); |
|
} |
|
</script> |
|
""" |
|
|
|
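    # Styling for the header row and the sort-direction triangles.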
html += """ |
|
<style> |
|
thead tr { |
|
background-color: #f0f0f0; |
|
} |
|
.tri-container { |
|
display: inline-block; |
|
margin-left: 4px; |
|
vertical-align: middle; |
|
} |
|
.triangle-up, .triangle-down { |
|
display: block; |
|
width: 0; |
|
height: 0; |
|
margin: 1px auto; |
|
border-left: 5px solid transparent; |
|
border-right: 5px solid transparent; |
|
} |
|
.triangle-up { |
|
border-bottom: 5px solid #999; |
|
} |
|
.triangle-down { |
|
border-top: 5px solid #999; |
|
} |
|
.triangle-up.active { |
|
border-bottom: 5px solid #000; |
|
} |
|
.triangle-down.active { |
|
border-top: 5px solid #000; |
|
} |
|
</style> |
|
""" |
|
|
|
    return html


def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split, field_name, hf_config, profile: gr.OAuthProfile):
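    """Validate a benchmark submission and forward it to the processing backend.

    `profile` is injected by Gradio's OAuth login; accounts younger than ten
    days are rejected.
    """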
    user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
    creation_date = json.loads(user_data.content)["createdAt"]
    if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=10):
        return "❌ This account is not authorized to submit."

    if not benchmark_name or not benchmark_name.strip():
        return "❌ Please provide a benchmark name."

    if not field_name or not field_name.strip():
        return "❌ Please provide a field name."

    has_jsonl = jsonl_file is not None
    has_hf = hf_path and hf_path.strip()

    if not has_jsonl and not has_hf:
        return "❌ Please provide either a .jsonl file or a HuggingFace dataset path."

    if has_jsonl:
        try:
            with open(jsonl_file.name, 'r', encoding='utf-8') as f:
                line_count = 0
                for line in f:
                    line_count += 1
                    if line_count > 5:
                        break

                    try:
                        entry = json.loads(line.strip())
                        if field_name.strip() not in entry:
                            available_fields = list(entry.keys())
                            return f"❌ Field '{field_name.strip()}' not found in JSONL file. Available fields: {', '.join(available_fields)}"
                    except json.JSONDecodeError as e:
                        return f"❌ Invalid JSON format in line {line_count}: {str(e)}"

                if line_count == 0:
                    return "❌ The uploaded file is empty."

        except Exception as e:
            return f"❌ Error reading file: {str(e)}"
    elif has_hf:
        if not hf_split or not hf_split.strip():
            return "❌ Please provide a dataset split for the HuggingFace dataset."

        try:
            if hf_config:
                dataset_info = load_dataset(hf_path.strip(), hf_config.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
            else:
                dataset_info = load_dataset(hf_path.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
            first_item = next(iter(dataset_info))
            if field_name.strip() not in first_item:
                available_fields = list(first_item.keys())
                return f"❌ Field '{field_name.strip()}' not found in dataset. Available fields: {', '.join(available_fields)}"
        except Exception as e:
            return f"❌ Could not access HuggingFace dataset: {str(e)}"

    try:
        data = {
            'name': benchmark_name.strip(),
            'contributor': contributor.strip(),
            'type': 'jsonl' if has_jsonl else 'hf',
            'split': hf_split.strip() if has_hf else '',
            'field_name': field_name.strip(),
            'hf_path': hf_path.strip() if has_hf else '',
            'hf_config': hf_config.strip() if has_hf else ''
        }
        print(json.dumps(data))
        files = {}
        if has_jsonl:
            files['file'] = (benchmark_name.strip() + '.jsonl', open(jsonl_file.name, 'rb'), 'application/json')

        response = requests.post(f"{URL}/", data={"payload": json.dumps(data)}, files=files, timeout=30)

        if files:
            files['file'][1].close()

        if response.status_code == 200:
            result = response.json()
            if result.get("status") == "success":
                return result.get('message', 'Submission successful!')
            elif result.get("status") == "info":
                return f"ℹ️ {result.get('message', 'Submission already exists')}"
            else:
                return f"❌ {result.get('message', 'Unknown error occurred')}"
        else:
            return f"❌ Server error: {response.status_code} - {response.text}"

    except Exception as e:
        return f"❌ Error submitting benchmark: {str(e)}"


with gr.Blocks() as interface:
    gr.HTML(
        '''<h1 style="text-align: center;">Benchmark Contamination Monitoring System</h1>

        <p style='font-size: 16px;'>This system monitors potential contamination in benchmark datasets used for evaluating language models across various open-source corpora.</p>
        <p style='font-size: 16px;'>The system is released along with our paper <a href="https://arxiv.org/abs/2506.12229">Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index</a>, which documents the methodology and findings in detail.</p>
        <p style='font-size: 16px;'>We welcome the community to submit new benchmarks for contamination analysis using the <b>"Add New Benchmarks"</b> tab.</p>
        '''
    )

    with gr.Tabs():
        with gr.Tab(label="Bulletin"):
            gr.Markdown("## Benchmark Contamination Bulletin")
            with gr.Accordion(label='Click to view instructions', open=False):
                gr.Markdown('''
The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.

- Benchmarks analyzed in our paper are under the **core** source. Community-submitted benchmarks appear under the **community** source.
- The contamination rate represents the percentage of *dirty* benchmark entries.
- The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
''')

            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core"
            )

            leaderboard_html = gr.HTML(build_table("core", refresh=False))

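            # Switching the source re-renders the table without forcing a re-download;
            # the Refresh button below forces a fresh download of the community results.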
            def update_table(source):
                return build_table(source, refresh=True)

            source_radio.change(
                fn=build_table,
                inputs=source_radio,
                outputs=leaderboard_html
            )

            refresh_button = gr.Button("Refresh")
            refresh_button.click(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_html
            )

with gr.Tab(label="Add New Benchmarks"): |
|
gr.Markdown(''' |
|
## Add Your Own Benchmarks for Contamination Checking |
|
|
|
You can use this form to submit a benchmark for contamination checking. Submissions may include either a direct upload or a reference to a publicly available dataset on Hugging Face. |
|
|
|
### Submission Guidelines: |
|
- **Benchmark Name**: Provide a name for your benchmark. |
|
- **Contributor**: Enter your name or affiliation. |
|
- **Data Source**: |
|
- Upload a `.jsonl` file containing your benchmark entries, or |
|
- Specify a Hugging Face dataset path (`author/benchmark-name`) along with the appropriate split (e.g., `test`, `validation`). |
|
- **Field Name**: Indicate the field to analyze for contamination: |
|
- For question-answering datasets: use the question field. |
|
- For language understanding tasks: use the context or passage field. |
|
|
|
### What Happens Next: |
|
Once submitted, your benchmark will be queued for analysis. Results will be published in the **community** section of the bulletin. |
|
|
|
Processing time may vary depending on the dataset format and size. You can check the results by navigating to the **Bulletin** tab and selecting the **community** source, then clicking **Refresh**. |
|
''') |
|
|
|
|
|
            with gr.Row():
                benchmark_name_input = gr.Textbox(label="Benchmark Name")
                contributor_input = gr.Textbox(label="Contributor")

            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                with gr.Column():
                    hf_path_input = gr.Textbox(label="HuggingFace Dataset Path", placeholder="e.g., author/benchmark-name")
                    hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
                    hf_config_input = gr.Textbox(label="Dataset Config (optional)", placeholder="name of dataset config")
                    field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")

            with gr.Row():
                gr.LoginButton()
                submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status", interactive=False)

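            # The `profile` argument of record_submission is supplied automatically by
            # Gradio's OAuth login (gr.LoginButton), so it is not listed in `inputs`.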
            submit_button.click(
                fn=record_submission,
                inputs=[benchmark_name_input, contributor_input, jsonl_input, hf_path_input, hf_split_input, field_name_input, hf_config_input],
                outputs=result_output,
            )


interface.launch()