Hao Xu committed
Commit 27c9b8f · 1 Parent(s): 3c856c0

data format update

Files changed (2)
  1. app.py +15 -4
  2. data.json +29 -30
app.py CHANGED
@@ -28,16 +28,24 @@ def build_table(source):
 
     return pd.DataFrame(entries).sort_values(by="Pile Dirty (%)", ascending=False)
 
-def record_submission(jsonl_file, hf_path, field_name):
+def record_submission(benchmark_name, jsonl_file, hf_path, hf_split, field_name):
     if jsonl_file is None and not hf_path:
         return "Please provide either a .jsonl file or a HuggingFace dataset path."
+    if hf_path and not hf_split:
+        return "Please provide a dataset split for the HuggingFace dataset."
+    if not field_name:
+        return "Please provide a field name."
+
 
     entry = {
-        "source": hf_path if hf_path else jsonl_file.name,
-        "type": "hf" if hf_path else "jsonl",
+        "name": benchmark_name,
+        "type": "jsonl" if jsonl_file else "hf",
+        "path": jsonl_file.name if jsonl_file else hf_path,
+        "split": hf_split if hf_path else None,
         "field_name": field_name,
     }
 
+
     queue_file = "pending_submissions.json"
     existing = []
     if os.path.exists(queue_file):
@@ -83,10 +91,13 @@ with gr.Blocks() as interface:
     with gr.Tab(label="Submission"):
         gr.Markdown("## Submit Your Dataset for Contamination Checking")
 
+        benchmark_name_input = gr.Textbox(label="Benchmark Name")
+
         with gr.Row():
             jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
             hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
 
+        hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
         field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
 
         submit_button = gr.Button("Submit for Contamination Check")
@@ -94,7 +105,7 @@ with gr.Blocks() as interface:
 
         submit_button.click(
             fn=record_submission,
-            inputs=[jsonl_input, hf_path_input, field_name_input],
+            inputs=[benchmark_name_input, jsonl_input, hf_path_input, hf_split_input, field_name_input],
             outputs=result_output
         )
 
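The first hunk cuts off right where record_submission starts reading the pending queue, so the write-back is not visible above. A minimal sketch of how the updated function plausibly completes, assuming the untouched tail still loads pending_submissions.json, appends the entry, and returns a confirmation string (the json/os imports and the exact wording of the return message are assumptions, not shown in this commit):

import json
import os

def record_submission(benchmark_name, jsonl_file, hf_path, hf_split, field_name):
    # Input validation added by this commit.
    if jsonl_file is None and not hf_path:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."
    if hf_path and not hf_split:
        return "Please provide a dataset split for the HuggingFace dataset."
    if not field_name:
        return "Please provide a field name."

    # New record layout: explicit benchmark name, source type, path, and split.
    entry = {
        "name": benchmark_name,
        "type": "jsonl" if jsonl_file else "hf",
        "path": jsonl_file.name if jsonl_file else hf_path,
        "split": hf_split if hf_path else None,
        "field_name": field_name,
    }

    # Assumed continuation (not shown in the diff): append the entry to the queue file.
    queue_file = "pending_submissions.json"
    existing = []
    if os.path.exists(queue_file):
        with open(queue_file) as f:
            existing = json.load(f)
    existing.append(entry)
    with open(queue_file, "w") as f:
        json.dump(existing, f, indent=2)
    return f"Submission '{benchmark_name}' added to the pending queue."

Validating hf_split and field_name up front keeps incomplete entries out of the queue, and the explicit name/type/path/split fields give the downstream contamination check everything it needs to locate the right column in the right dataset.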
data.json CHANGED
@@ -1,31 +1,30 @@
 [
-{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 0, "CC202505 Dirty": 0},
-{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": 6.87, "CC202505 Dirty": 6.87},
-{"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.44, "DCLM Dirty": 0.44, "CC202505 Dirty": 0.44},
-{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": 0.46, "CC202505 Dirty": 0.46},
-{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-
-{"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
-{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
-{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
-{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
-
-{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.20},
-{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
-
-{"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": 0},
-{"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": 0},
-{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": 0},
-{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": 0, "CC202505 Dirty": 0},
-{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": 0},
-{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
-{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": 0.0, "CC202505 Dirty": 0},
-{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
-
-{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": 0, "CC202505 Dirty": 0},
-{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": 0, "CC202505 Dirty": 0}
-]
-
+{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 28.81, "CC202505 Dirty": -1},
+{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+{"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": -1, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": -1, "CC202505 Dirty": 2.49},
+{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": -1},
+
+{"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
+{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
+{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
+{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
+
+{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": 0.00},
+{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": 0.20},
+{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
+
+{"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": -1},
+{"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": -1},
+{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": -1},
+{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": -1},
+{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+
+{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": -1, "CC202505 Dirty": -1}
+]
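
In the new rows, most figures that were 0 now read -1 (while MMLU's DCLM value becomes a measured 28.81), so -1 looks like a "not yet measured" sentinel rather than a real contamination rate. This commit does not show how build_table handles the sentinel; the following is only a sketch of one way to keep it out of the displayed percentages, where the renamed "(%)" columns are inferred from the sort key in app.py and everything else is an assumption:

import pandas as pd

def build_table(source):
    # data.json is a plain JSON array of row objects, so read_json loads it directly.
    entries = pd.read_json(source)

    # Assumed display names: app.py sorts on "Pile Dirty (%)", so the raw columns
    # presumably gain a "(%)" suffix at some point before sorting.
    renames = {c: f"{c} (%)" for c in ("Pile Dirty", "DCLM Dirty", "CC202505 Dirty")}
    entries = entries.rename(columns=renames)

    # Treat negative values (the -1 sentinel) as missing: NaN renders as an empty
    # cell and sorts after every real percentage.
    score_cols = list(renames.values())
    entries[score_cols] = entries[score_cols].mask(entries[score_cols] < 0)

    return entries.sort_values(by="Pile Dirty (%)", ascending=False)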