Hao Xu committed · Commit 27c9b8f · 1 Parent(s): 3c856c0

data format update

app.py CHANGED
@@ -28,16 +28,24 @@ def build_table(source):
 
     return pd.DataFrame(entries).sort_values(by="Pile Dirty (%)", ascending=False)
 
-def record_submission(jsonl_file, hf_path, field_name):
+def record_submission(benchmark_name, jsonl_file, hf_path, hf_split, field_name):
     if jsonl_file is None and not hf_path:
         return "Please provide either a .jsonl file or a HuggingFace dataset path."
+    if hf_path and not hf_split:
+        return "Please provide a dataset split for the HuggingFace dataset."
+    if not field_name:
+        return "Please provide a field name."
+
 
     entry = {
-        "
-        "type": "
+        "name": benchmark_name,
+        "type": "jsonl" if jsonl_file else "hf",
+        "path": jsonl_file.name if jsonl_file else hf_path,
+        "split": hf_split if hf_path else None,
         "field_name": field_name,
     }
 
+
     queue_file = "pending_submissions.json"
     existing = []
     if os.path.exists(queue_file):

@@ -83,10 +91,13 @@ with gr.Blocks() as interface:
     with gr.Tab(label="Submission"):
         gr.Markdown("## Submit Your Dataset for Contamination Checking")
 
+        benchmark_name_input = gr.Textbox(label="Benchmark Name")
+
         with gr.Row():
             jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
             hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
 
+        hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
         field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
 
         submit_button = gr.Button("Submit for Contamination Check")

@@ -94,7 +105,7 @@ with gr.Blocks() as interface:
 
         submit_button.click(
             fn=record_submission,
-            inputs=[jsonl_input, hf_path_input, field_name_input],
+            inputs=[benchmark_name_input, jsonl_input, hf_path_input, hf_split_input, field_name_input],
             outputs=result_output
         )
 
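The first hunk cuts off just as the queue file is opened, so the rest of record_submission is not visible in this diff. A minimal sketch of how the queue step presumably continues, assuming the entry is appended to pending_submissions.json and a confirmation string is returned (the helper name and message text are illustrative, not from the source):

import json
import os

def append_submission(entry, queue_file="pending_submissions.json"):
    # Hypothetical continuation of record_submission: load any existing
    # queue, append the new entry, and rewrite the file in place.
    existing = []
    if os.path.exists(queue_file):
        with open(queue_file, "r") as f:
            existing = json.load(f)
    existing.append(entry)
    with open(queue_file, "w") as f:
        json.dump(existing, f, indent=2)
    return "Submission recorded. It will be processed in the next batch."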
data.json CHANGED

@@ -1,31 +1,30 @@
 [
-    … (30 removed lines; the previous contents of data.json are not preserved in this view)
+    {"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 28.81, "CC202505 Dirty": -1},
+    {"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": -1, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": -1, "CC202505 Dirty": 2.49},
+    {"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": -1},
+
+    {"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
+    {"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
+    {"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
+    {"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
+
+    {"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+    {"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+    {"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": 0.00},
+    {"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": 0.20},
+    {"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
+
+    {"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": -1},
+    {"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": -1},
+    {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": -1},
+    {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": -1},
+    {"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+
+    {"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": -1, "CC202505 Dirty": -1}
+]
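Note that build_table sorts by a "Pile Dirty (%)" column while the new data.json stores plain "Pile Dirty" keys, so the loader presumably renames the columns on the way in; -1 also looks like a sentinel for pairs that have not been measured yet. A minimal loading sketch under those two assumptions (the function name, the rename, and the NA handling are inferences, not confirmed by the diff):

import json
import pandas as pd

def load_leaderboard(path="data.json"):
    # Assumed loader: treat -1 as "not yet measured" and rename columns to
    # the "(%)" headers that build_table sorts by.
    with open(path) as f:
        entries = json.load(f)
    df = pd.DataFrame(entries).replace(-1, pd.NA)
    df = df.rename(columns={
        "Pile Dirty": "Pile Dirty (%)",
        "DCLM Dirty": "DCLM Dirty (%)",
        "CC202505 Dirty": "CC202505 Dirty (%)",
    })
    # Unmeasured entries sort to the bottom by default (na_position="last").
    return df.sort_values(by="Pile Dirty (%)", ascending=False)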