File size: 3,320 Bytes
3c856c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import gradio as gr
import os
import json
import pandas as pd
def load_data(source):
    """Load the benchmark results for ``source`` from its JSON file.

    Args:
        source: ``"core"`` reads ``data.json``; ``"community"`` reads
            ``community_results.json``. Both paths are relative to the
            current working directory.

    Returns:
        The parsed JSON content, or an empty list when ``source`` is neither
        ``"core"`` nor ``"community"``.

    Raises:
        FileNotFoundError: If the file for a known source does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if source == "core":
        path = "data.json"
    elif source == "community":
        path = "community_results.json"
    else:
        return []

    # JSON is UTF-8 by specification; do not depend on the platform's
    # default locale encoding when decoding it.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)
def build_table(source):
    """Build the leaderboard table for ``source``.

    Each record returned by ``load_data(source)`` is projected onto five
    display columns; a key missing from a record becomes an empty string.

    Args:
        source: Passed through unchanged to ``load_data``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Benchmark``, ``Category``,
        ``Pile Dirty (%)``, ``DCLM Dirty (%)`` and ``CC Dirty (%)``, sorted by
        ``Pile Dirty (%)`` in descending order. Rows whose Pile value is
        missing or non-numeric are placed last. An empty input produces an
        empty table that still carries all five columns.
    """
    columns = [
        "Benchmark",
        "Category",
        "Pile Dirty (%)",
        "DCLM Dirty (%)",
        "CC Dirty (%)",
    ]
    rows = [
        {
            "Benchmark": entry.get("Benchmark", ""),
            "Category": entry.get("Category", ""),
            "Pile Dirty (%)": entry.get("Pile Dirty", ""),
            "DCLM Dirty (%)": entry.get("DCLM Dirty", ""),
            "CC Dirty (%)": entry.get("CC202505 Dirty", ""),
        }
        for entry in load_data(source)
    ]

    # Passing the columns explicitly keeps the sort column present even when
    # there are no rows; otherwise sort_values raises KeyError.
    table = pd.DataFrame(rows, columns=columns)

    # Sort on a numeric view of the column so the "" placeholder for missing
    # values cannot be compared against numbers (TypeError). The displayed
    # values themselves are left untouched.
    return table.sort_values(
        by="Pile Dirty (%)",
        ascending=False,
        key=lambda col: pd.to_numeric(col, errors="coerce"),
        na_position="last",
    )
def record_submission(jsonl_file, hf_path, field_name):
    """Append a contamination-check request to the pending-submissions queue.

    The queue is the JSON file ``pending_submissions.json`` in the current
    working directory. It holds a list of entries and is rewritten in full
    on every accepted submission.

    Args:
        jsonl_file: The uploaded file, or ``None`` when nothing was uploaded.
            May be a path string or an object exposing the path as ``.name``.
        hf_path: HuggingFace dataset path. When non-blank it takes
            precedence over ``jsonl_file``.
        field_name: Name of the dataset field to check; stored as given.

    Returns:
        A status message: a prompt to supply an input when neither a file
        nor a dataset path was given, otherwise a confirmation.

    Raises:
        json.JSONDecodeError: If the existing queue file is not valid JSON.
    """
    # Treat a whitespace-only path the same as an empty one.
    hf_path = (hf_path or "").strip()
    if jsonl_file is None and not hf_path:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."

    if hf_path:
        source, kind = hf_path, "hf"
    else:
        # Depending on the Gradio version/configuration the upload arrives
        # either as a plain path string or as a file wrapper with ``.name``.
        source, kind = getattr(jsonl_file, "name", jsonl_file), "jsonl"

    entry = {
        "source": source,
        "type": kind,
        "field_name": field_name,
    }

    queue_file = "pending_submissions.json"
    existing = []
    if os.path.exists(queue_file):
        with open(queue_file, "r", encoding="utf-8") as f:
            existing = json.load(f)
    existing.append(entry)
    with open(queue_file, "w", encoding="utf-8") as f:
        json.dump(existing, f, indent=2)

    # U+2705 is the check-mark emoji. It is written as an escape so the
    # message survives a mis-decoded source file.
    return "\u2705 Submission received! You'll be notified when processing is complete."
# Gradio UI: a leaderboard tab that shows the dirty-rate table for the
# selected source, and a submission tab that queues datasets for checking.
with gr.Blocks() as interface:
    # NOTE(review): the leading "π" looks like a mis-decoded emoji; confirm
    # the intended glyph before changing it.
    gr.Markdown("# π Benchmark Contamination Bulletin")

    with gr.Tabs():
        with gr.Tab(label="Leaderboard"):
            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core",
            )

            # Take the headers from the table itself so they cannot drift
            # from the column names that build_table produces.
            initial_table = build_table("core")
            table_columns = list(initial_table.columns)
            leaderboard_table = gr.Dataframe(
                value=initial_table,
                headers=table_columns,
                interactive=False,
                wrap=True,
                label="Dirty Rates",
            )

            def update_table(source):
                """Return the leaderboard table for the selected source."""
                return build_table(source)

            # Rebuild the table whenever a different source is selected.
            source_radio.change(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_table,
            )

        # Submission Tab
        with gr.Tab(label="Submission"):
            gr.Markdown("## Submit Your Dataset for Contamination Checking")

            # NOTE(review): original indentation was lost; the row is assumed
            # to hold only the two alternative dataset inputs — confirm.
            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")

            field_name_input = gr.Textbox(
                label="Context or Question Field Name",
                placeholder="e.g., context, question, ...",
            )
            submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status", interactive=False)

            # The status message returned by record_submission is shown in
            # the read-only textbox.
            submit_button.click(
                fn=record_submission,
                inputs=[jsonl_input, hf_path_input, field_name_input],
                outputs=result_output,
            )

interface.launch()
|