Hao Xu
committed on
Commit
·
3c856c0
1
Parent(s):
b58437b
leaderboard UI
Browse files- app.py +101 -0
- community_results.json +3 -0
- data.json +31 -0
- requirements.txt +2 -0
app.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
def load_data(source):
    """Load leaderboard entries for the given source.

    Parses ``data.json`` for ``"core"`` and ``community_results.json``
    for ``"community"``; any other source yields an empty list.
    """
    path_by_source = {
        "core": "data.json",
        "community": "community_results.json",
    }
    path = path_by_source.get(source)
    if path is None:
        return []
    with open(path, "r") as fh:
        return json.load(fh)
|
15 |
+
|
16 |
+
def build_table(source):
    """Build the leaderboard DataFrame for the given source.

    Loads entries via ``load_data`` and returns one row per benchmark,
    sorted by Pile dirty rate (descending).

    Fixes vs. original:
    - An empty source (e.g. ``community_results.json`` is ``[]``) used to
      raise ``KeyError`` in ``sort_values`` because an empty DataFrame has
      no columns; constructing with explicit ``columns`` keeps the frame
      well-formed and sortable.
    - The CC column is named "CC202505 Dirty (%)" to match the headers the
      UI declares (it was "CC Dirty (%)", inconsistent with the UI).
    """
    columns = [
        "Benchmark",
        "Category",
        "Pile Dirty (%)",
        "DCLM Dirty (%)",
        "CC202505 Dirty (%)",
    ]
    data = load_data(source)

    entries = [
        {
            "Benchmark": entry.get("Benchmark", ""),
            "Category": entry.get("Category", ""),
            "Pile Dirty (%)": entry.get("Pile Dirty", ""),
            "DCLM Dirty (%)": entry.get("DCLM Dirty", ""),
            "CC202505 Dirty (%)": entry.get("CC202505 Dirty", ""),
        }
        for entry in data
    ]

    frame = pd.DataFrame(entries, columns=columns)
    return frame.sort_values(by="Pile Dirty (%)", ascending=False)
|
30 |
+
|
31 |
+
def record_submission(jsonl_file, hf_path, field_name):
    """Queue a contamination-check submission.

    Accepts either an uploaded .jsonl file or a HuggingFace dataset path
    (the HF path wins when both are given) and appends the request to the
    local ``pending_submissions.json`` queue.

    Returns a human-readable status message for the UI.
    """
    if jsonl_file is None and not hf_path:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."

    entry = {
        "source": hf_path if hf_path else jsonl_file.name,
        "type": "hf" if hf_path else "jsonl",
        "field_name": field_name,
    }

    queue_file = "pending_submissions.json"
    existing = []
    if os.path.exists(queue_file):
        with open(queue_file, "r") as f:
            try:
                existing = json.load(f)
            except json.JSONDecodeError:
                # A corrupt queue file should not crash the UI callback;
                # start a fresh queue instead.
                existing = []
    existing.append(entry)
    with open(queue_file, "w") as f:
        json.dump(existing, f, indent=2)

    # NOTE(review): original literal was mojibake ("β…"); restored as a
    # check-mark success message.
    return "✅ Submission received! You'll be notified when processing is complete."
|
51 |
+
|
52 |
+
# ---- UI layout -------------------------------------------------------------
with gr.Blocks() as interface:
    gr.Markdown("# π Benchmark Contamination Bulletin")

    with gr.Tabs():
        # Leaderboard tab: dirty-rate table with a core/community switch.
        with gr.Tab(label="Leaderboard"):
            source_selector = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core",
            )

            rate_columns = [
                "Benchmark",
                "Category",
                "Pile Dirty (%)",
                "DCLM Dirty (%)",
                "CC202505 Dirty (%)",
            ]
            rates_table = gr.Dataframe(
                value=build_table("core"),
                headers=rate_columns,
                interactive=False,
                wrap=True,
                label="Dirty Rates",
            )

            # Rebuild the table whenever the source selection changes.
            # build_table already has the (source) -> DataFrame signature,
            # so no wrapper function is needed.
            source_selector.change(
                fn=build_table,
                inputs=source_selector,
                outputs=rates_table,
            )

        # Submission tab: queue a dataset for contamination checking.
        with gr.Tab(label="Submission"):
            gr.Markdown("## Submit Your Dataset for Contamination Checking")

            with gr.Row():
                file_upload = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                dataset_path_box = gr.Textbox(label="HuggingFace Dataset Path")

            field_name_box = gr.Textbox(
                label="Context or Question Field Name",
                placeholder="e.g., context, question, ...",
            )

            check_button = gr.Button("Submit for Contamination Check")
            status_box = gr.Textbox(label="Submission Status", interactive=False)

            check_button.click(
                fn=record_submission,
                inputs=[file_upload, dataset_path_box, field_name_box],
                outputs=status_box,
            )

interface.launch()
|
community_results.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
|
3 |
+
]
|
data.json
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 0, "CC202505 Dirty": 0},
|
3 |
+
{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": 6.87, "CC202505 Dirty": 6.87},
|
4 |
+
{"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.44, "DCLM Dirty": 0.44, "CC202505 Dirty": 0.44},
|
5 |
+
{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": 0.46, "CC202505 Dirty": 0.46},
|
6 |
+
{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
|
7 |
+
|
8 |
+
{"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
|
9 |
+
{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
|
10 |
+
{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
|
11 |
+
{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
|
12 |
+
|
13 |
+
{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
|
14 |
+
{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
|
15 |
+
{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
|
16 |
+
{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.20},
|
17 |
+
{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
|
18 |
+
|
19 |
+
{"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": 0},
|
20 |
+
{"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": 0},
|
21 |
+
{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": 0},
|
22 |
+
{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": 0, "CC202505 Dirty": 0},
|
23 |
+
{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": 0},
|
24 |
+
{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
|
25 |
+
{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": 0.0, "CC202505 Dirty": 0},
|
26 |
+
{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
|
27 |
+
|
28 |
+
{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": 0, "CC202505 Dirty": 0},
|
29 |
+
{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": 0, "CC202505 Dirty": 0}
|
30 |
+
]
|
31 |
+
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
huggingface_hub==0.14.1
|
2 |
+
pandas
gradio
|