Hao Xu committed on
Commit ed84703 · 1 Parent(s): 27c9b8f

contamination results update

Files changed (1)
  1. data.json +24 -23
data.json CHANGED
@@ -1,30 +1,31 @@
  [
- {"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 28.81, "CC202505 Dirty": -1},
- {"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": -1, "CC202505 Dirty": -1},
- {"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": -1, "DCLM Dirty": -1, "CC202505 Dirty": -1},
- {"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": -1, "CC202505 Dirty": 2.49},
- {"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": -1},
-
- {"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
- {"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
+ {"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.20, "DCLM Dirty": 28.40, "CC202505 Dirty": 13.50},
+ {"Benchmark": "MMLU-pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.50, "DCLM Dirty": 16.20, "CC202505 Dirty": 7.10},
+ {"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.10, "CC202505 Dirty": 1.40},
+ {"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.80, "DCLM Dirty": 3.10, "CC202505 Dirty": 2.70},
+ {"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.89},
+ {"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.30, "CC202505 Dirty": 0.10},
+
+ {"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
+ {"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 5.00},
  {"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
  {"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
-
+
  {"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
  {"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
- {"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": 0.00},
- {"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": 0.20},
+ {"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+ {"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.20},
  {"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
-
- {"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": -1},
- {"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": -1},
- {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": -1},
- {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": -1, "CC202505 Dirty": -1},
- {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": -1},
- {"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": -1},
- {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": -1, "CC202505 Dirty": -1},
- {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": -1},
-
- {"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": -1, "CC202505 Dirty": -1},
- {"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": -1, "CC202505 Dirty": -1}
+
+ {"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.80, "DCLM Dirty": 34.10, "CC202505 Dirty": 11.90},
+ {"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.30, "DCLM Dirty": 31.70, "CC202505 Dirty": 5.40},
+ {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": 1.00, "CC202505 Dirty": 0.10},
+ {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+ {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": 14.60},
+ {"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+ {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.50, "CC202505 Dirty": 0.20},
+ {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+
+ {"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.00, "DCLM Dirty": 18.40, "CC202505 Dirty": 7.40},
+ {"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.80, "DCLM Dirty": 40.10, "CC202505 Dirty": 2.70}
  ]
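For reference, a minimal sketch of how the updated data.json could be loaded and grouped by category. It assumes only the schema visible in this diff (a "Benchmark" name, a "Category", and per-corpus "Pile Dirty" / "DCLM Dirty" / "CC202505 Dirty" percentages); treating -1 in the previous revision as a "not measured" placeholder is an assumption, not something the commit states.

```python
import json
from collections import defaultdict

# Load the contamination results committed in this revision.
with open("data.json") as f:
    records = json.load(f)

# Group benchmark rows by their "Category" field.
by_category = defaultdict(list)
for row in records:
    by_category[row["Category"]].append(row)

def fmt(value):
    # -1 (used in the previous revision of the file) is assumed to mean
    # the corpus was not measured for that benchmark.
    return "  n/a" if value == -1 else f"{value:5.2f}"

# Print per-corpus "dirty" percentages for each benchmark.
for category, rows in by_category.items():
    print(category)
    for row in rows:
        print(f'  {row["Benchmark"]:<15} '
              f'Pile {fmt(row["Pile Dirty"])}  '
              f'DCLM {fmt(row["DCLM Dirty"])}  '
              f'CC202505 {fmt(row["CC202505 Dirty"])}')
```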