|
import gradio as gr |
|
import os |
|
import json |
|
import pandas as pd |
|
|
|
def load_data(source):
    """Load the raw benchmark records for the selected leaderboard source.

    Args:
        source: ``"core"`` reads ``data.json``; ``"community"`` reads
            ``community_results.json``. Both paths are resolved against
            the current working directory.

    Returns:
        The decoded JSON content of the selected file (callers iterate it
        as a sequence of per-benchmark dicts), or an empty list when
        ``source`` is not one of the two known names.

    Raises:
        FileNotFoundError: If the selected file does not exist.
        json.JSONDecodeError: If the selected file is not valid JSON.
    """
    paths = {
        "core": "data.json",
        "community": "community_results.json",
    }
    path = paths.get(source)
    if path is None:
        return []

    # Decode explicitly as UTF-8: without it, open() uses the platform's
    # locale encoding and non-ASCII benchmark names may be mis-decoded.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)
|
|
|
def build_table(source):
    """Build the leaderboard table for one benchmark source.

    Each record returned by ``load_data(source)`` becomes one row. Missing
    fields are shown as an empty string.

    Args:
        source: Source name forwarded to ``load_data``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Benchmark``,
        ``Category``, ``Pile Dirty (%)``, ``DCLM Dirty (%)`` and
        ``CC202505 Dirty (%)``, sorted by ``Pile Dirty (%)`` in descending
        order. When there are no records the frame is empty but still has
        these columns.
    """
    # Display header -> key in the raw record. The headers follow the
    # "<record key> (%)" pattern for all three rate columns.
    columns = {
        "Benchmark": "Benchmark",
        "Category": "Category",
        "Pile Dirty (%)": "Pile Dirty",
        "DCLM Dirty (%)": "DCLM Dirty",
        "CC202505 Dirty (%)": "CC202505 Dirty",
    }

    rows = [
        {header: entry.get(key, "") for header, key in columns.items()}
        for entry in load_data(source)
    ]

    # Passing the column list explicitly keeps the sort column present even
    # when there are no rows, so sort_values does not raise KeyError.
    table = pd.DataFrame(rows, columns=list(columns))

    # Sort on a numeric view of the column: the "" placeholder for missing
    # values (or numbers stored as strings) would otherwise make the
    # comparison fail or fall back to lexicographic order. Values that do
    # not parse become NaN and are placed last.
    return table.sort_values(
        by="Pile Dirty (%)",
        ascending=False,
        key=lambda col: pd.to_numeric(col, errors="coerce"),
    )
|
|
|
def record_submission(jsonl_file, hf_path, field_name):
    """Append a dataset submission to the on-disk pending queue.

    Args:
        jsonl_file: Uploaded file from the form, or ``None``. Its path is
            taken from the ``name`` attribute when present; otherwise the
            value itself is stored as a string.
        hf_path: HuggingFace dataset path typed by the user. Blank or
            whitespace-only input counts as not provided.
        field_name: Name of the dataset field to check; stored as given.

    Returns:
        A status message for display: a request to supply an input when
        neither is given, otherwise a confirmation.

    Raises:
        json.JSONDecodeError: If the existing queue file is not valid JSON.
    """
    # Normalise first so "   " is not queued as a real dataset path.
    hf_path = (hf_path or "").strip()

    if jsonl_file is None and not hf_path:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."

    # The HuggingFace path takes precedence when both inputs are supplied.
    if hf_path:
        source, kind = hf_path, "hf"
    else:
        # Accept both file-like uploads (with .name) and plain path strings.
        source = getattr(jsonl_file, "name", None) or str(jsonl_file)
        kind = "jsonl"

    entry = {
        "source": source,
        "type": kind,
        "field_name": field_name,
    }

    queue_file = "pending_submissions.json"
    pending = []
    if os.path.exists(queue_file):
        with open(queue_file, "r", encoding="utf-8") as f:
            pending = json.load(f)

    pending.append(entry)
    with open(queue_file, "w", encoding="utf-8") as f:
        json.dump(pending, f, indent=2)

    # U+2705 (check-mark emoji) is written as an escape so the literal
    # survives a mis-encoded copy of this file.
    return "\u2705 Submission received! You'll be notified when processing is complete."
|
|
|
# Two-tab Gradio UI: a read-only leaderboard of dirty rates and a form that
# queues datasets for contamination checking.
with gr.Blocks() as interface:
    # NOTE(review): this heading used to start with a stray "π", which looks
    # like a mis-decoded emoji. The intended glyph cannot be recovered from
    # the file, so it is omitted here; restore it if the original is known.
    gr.Markdown("# Benchmark Contamination Bulletin")

    with gr.Tabs():
        with gr.Tab(label="Leaderboard"):
            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core",
            )

            # Build the initial table once and take the headers from it, so
            # the header list cannot drift from the columns build_table
            # actually produces.
            initial_table = build_table("core")
            table_columns = list(initial_table.columns)

            leaderboard_table = gr.Dataframe(
                value=initial_table,
                headers=table_columns,
                interactive=False,
                wrap=True,
                label="Dirty Rates",
            )

            def update_table(source):
                """Rebuild the leaderboard for the newly selected source."""
                return build_table(source)

            # Refresh the table whenever a different source is selected.
            source_radio.change(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_table,
            )

        with gr.Tab(label="Submission"):
            gr.Markdown("## Submit Your Dataset for Contamination Checking")

            # Either input is sufficient; record_submission decides which
            # one is used when both are filled in.
            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")

            field_name_input = gr.Textbox(
                label="Context or Question Field Name",
                placeholder="e.g., context, question, ...",
            )

            submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status", interactive=False)

            submit_button.click(
                fn=record_submission,
                inputs=[jsonl_input, hf_path_input, field_name_input],
                outputs=result_output,
            )

interface.launch()
|
|