Hao Xu committed
Commit ed84703 · 1 Parent(s): 27c9b8f
contamination results update
data.json CHANGED

@@ -1,30 +1,31 @@
 [
-{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty":
-{"Benchmark": "MMLU-
-{"Benchmark": "
-{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.
-{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty":
-
-
-{"Benchmark": "
+{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.20, "DCLM Dirty": 28.40, "CC202505 Dirty": 13.50},
+{"Benchmark": "MMLU-pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.50, "DCLM Dirty": 16.20, "CC202505 Dirty": 7.10},
+{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.10, "CC202505 Dirty": 1.40},
+{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.80, "DCLM Dirty": 3.10, "CC202505 Dirty": 2.70},
+{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.89},
+{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.30, "CC202505 Dirty": 0.10},
+
+{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
+{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 5.00},
 {"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
 {"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
-
+
 {"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
 {"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty":
-{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty":
+{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.20},
 {"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
-
-{"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.
-{"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.
-{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty":
-{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty":
-{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.
-{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty":
-
-{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.
-{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.
+
+{"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.80, "DCLM Dirty": 34.10, "CC202505 Dirty": 11.90},
+{"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.30, "DCLM Dirty": 31.70, "CC202505 Dirty": 5.40},
+{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": 1.00, "CC202505 Dirty": 0.10},
+{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": 14.60},
+{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.50, "CC202505 Dirty": 0.20},
+{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+
+{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.00, "DCLM Dirty": 18.40, "CC202505 Dirty": 7.40},
+{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.80, "DCLM Dirty": 40.10, "CC202505 Dirty": 2.70}
 ]
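For reference, a minimal sketch of how the updated data.json could be loaded and inspected. The relative file path and the use of pandas are assumptions for illustration only; they are not part of this commit. The field names ("Benchmark", "Category", "Pile Dirty", "DCLM Dirty", "CC202505 Dirty") come from the file itself.

import json

import pandas as pd  # assumed here only for a quick tabular view

# Load the committed contamination results (path assumed relative to the repo root).
with open("data.json") as f:
    records = json.load(f)

# Each record holds a benchmark name, its category, and three contamination
# rates (in %): "Pile Dirty", "DCLM Dirty", and "CC202505 Dirty".
df = pd.DataFrame(records)

# Example: list the benchmarks with the highest DCLM contamination.
print(df.sort_values("DCLM Dirty", ascending=False).head())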