Hao Xu
leaderboard UI
3c856c0
raw
history blame
3.32 kB
import gradio as gr
import os
import json
import pandas as pd
def load_data(source):
    """Load the raw benchmark results for a leaderboard source.

    Args:
        source: "core" reads ``data.json``; "community" reads
            ``community_results.json``. Both paths are relative to the
            current working directory.

    Returns:
        The parsed JSON content of the selected file, or an empty list when
        ``source`` is neither "core" nor "community".

    Raises:
        FileNotFoundError: If the file for a known source does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    source_files = {
        "core": "data.json",
        "community": "community_results.json",
    }
    path = source_files.get(source)
    if path is None:
        return []
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # encoding, which may not be UTF-8.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)
def build_table(source):
    """Build the leaderboard table for the given benchmark source.

    Each entry returned by ``load_data(source)`` becomes one row. Keys that
    are missing from an entry are filled with an empty string.

    Args:
        source: Source identifier forwarded to ``load_data``.

    Returns:
        A ``pandas.DataFrame`` with the columns "Benchmark", "Category",
        "Pile Dirty (%)", "DCLM Dirty (%)" and "CC Dirty (%)", sorted by
        "Pile Dirty (%)" in descending order. When there are no entries the
        frame is empty but still carries those columns.
    """
    # Displayed column name -> key in the raw result entries.
    column_keys = {
        "Benchmark": "Benchmark",
        "Category": "Category",
        "Pile Dirty (%)": "Pile Dirty",
        "DCLM Dirty (%)": "DCLM Dirty",
        "CC Dirty (%)": "CC202505 Dirty",
    }
    rows = [
        {column: entry.get(key, "") for column, key in column_keys.items()}
        for entry in load_data(source)
    ]
    # Passing the columns explicitly keeps the schema stable when `rows` is
    # empty; a column-less frame would make the sort below raise KeyError.
    table = pd.DataFrame(rows, columns=list(column_keys))
    if table.empty:
        return table
    # Order by a numeric view of the column so that the "" placeholder for a
    # missing value (or a numeric string) cannot trigger a str/float
    # comparison error; values that are not numbers sort last. The displayed
    # values themselves are left untouched.
    return table.sort_values(
        by="Pile Dirty (%)",
        ascending=False,
        key=lambda column: pd.to_numeric(column, errors="coerce"),
    )
def record_submission(jsonl_file, hf_path, field_name):
    """Append a contamination-check request to the pending-submissions queue.

    Args:
        jsonl_file: The uploaded file, or ``None`` when nothing was uploaded.
            Either an object exposing a ``name`` attribute or a plain path
            string is accepted.
        hf_path: HuggingFace dataset path. Takes precedence over
            ``jsonl_file`` when non-blank.
        field_name: Name of the dataset field to check; stored as given.

    Returns:
        A status message: a prompt to provide an input when neither a file
        nor a dataset path was supplied, otherwise a confirmation.

    Raises:
        json.JSONDecodeError: If the existing queue file is not valid JSON.
    """
    # A whitespace-only path is the same as no path at all.
    hf_path = (hf_path or "").strip()
    if jsonl_file is None and not hf_path:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."

    if hf_path:
        source, source_type = hf_path, "hf"
    else:
        # Depending on the Gradio version/configuration the upload arrives as
        # a file wrapper (with `.name`) or as a plain path string.
        source = getattr(jsonl_file, "name", jsonl_file)
        source_type = "jsonl"
    entry = {
        "source": source,
        "type": source_type,
        "field_name": field_name,
    }

    queue_file = "pending_submissions.json"
    try:
        with open(queue_file, "r", encoding="utf-8") as f:
            pending = json.load(f)
    except FileNotFoundError:
        pending = []
    pending.append(entry)

    # Write to a sibling file and swap it in, so an interrupted write cannot
    # leave a truncated queue behind.
    tmp_file = queue_file + ".tmp"
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(pending, f, indent=2)
    os.replace(tmp_file, queue_file)

    return "✅ Submission received! You'll be notified when processing is complete."
# Two-tab UI: a leaderboard of dirty rates and a dataset submission form.
with gr.Blocks() as interface:
    gr.Markdown("# 📖 Benchmark Contamination Bulletin")

    with gr.Tabs():
        # Leaderboard Tab
        with gr.Tab(label="Leaderboard"):
            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core"
            )

            # Must match the column names produced by build_table().
            table_columns = ["Benchmark", "Category", "Pile Dirty (%)", "DCLM Dirty (%)", "CC Dirty (%)"]
            leaderboard_table = gr.Dataframe(
                value=build_table("core"),
                headers=table_columns,
                interactive=False,
                wrap=True,
                label="Dirty Rates"
            )

            def update_table(source):
                """Rebuild the leaderboard for the newly selected source."""
                return build_table(source)

            # Refresh the table whenever the source selection changes.
            source_radio.change(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_table
            )

        # Submission Tab
        with gr.Tab(label="Submission"):
            gr.Markdown("## Submit Your Dataset for Contamination Checking")

            # NOTE(review): the file's indentation was not recoverable; the
            # row is assumed to hold the two alternative dataset inputs.
            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")

            field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
            submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status", interactive=False)

            # Queue the request and show the returned status message.
            submit_button.click(
                fn=record_submission,
                inputs=[jsonl_input, hf_path_input, field_name_input],
                outputs=result_output
            )

interface.launch()