Hao Xu committed
Commit e55b0a5 · Parent: d56bc01

add cc-2025-08 result

Files changed (2)
  1. app.py +2 -2
  2. data.json +24 -24
app.py CHANGED
@@ -33,9 +33,9 @@ def build_table(source, refresh=False):
     data = load_data(source, refresh)
 
     if source == "core":
-        headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)"]
+        headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]
     else:
-        headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)"]
+        headers = ["Benchmark", "Contributor", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)", "CC-2025-08 Dirty (%)"]
 
     html = """
     <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
data.json CHANGED
@@ -1,30 +1,30 @@
 [
-{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "URL": "https://huggingface.co/datasets/cais/mmlu"},
-{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
-{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
-{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
-{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
-{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/cais/hle"},
+{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "CC202508 Dirty": 9.0, "URL": "https://huggingface.co/datasets/cais/mmlu"},
+{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "CC202508 Dirty": 5.4, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
+{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "CC202508 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
+{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "CC202508 Dirty": 3.6, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
+{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "CC202508 Dirty": 2.0, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
+{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/cais/hle"},
 
-{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
-{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
-{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
-{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
+{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "CC202508 Dirty": 3.3, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
+{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "CC202508 Dirty": 0.8, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
+{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "CC202508 Dirty": 7.8, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
+{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "CC202508 Dirty": 1.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
 
-{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
-{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
-{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
-{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
-{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
+{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.6, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
+{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.6, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
+{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
+{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "CC202508 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
+{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "CC202508 Dirty": 1.4, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
 
-{"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
-{"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
-{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
-{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
-{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
-{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
-{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
+{"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "CC202508 Dirty": 4.0, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "CC202508 Dirty": 9.5, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "CC202508 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
+{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
+{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "CC202508 Dirty": 30.2, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
+{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "CC202508 Dirty": 4.4, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
+{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "CC202508 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
 
-{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
-{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
+{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "CC202508 Dirty": 8.8, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
+{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "CC202508 Dirty": 33.0, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
 ]
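
Neither hunk shows the part of build_table that turns a data.json entry into a table row, so the link between the new "CC202508 Dirty" key and the new "CC-2025-08 Dirty (%)" header is implicit. A minimal sketch of that mapping, assuming the row builder reads the "... Dirty" fields in column order (COLUMN_KEYS and render_rows are hypothetical names, not taken from app.py):

import json

# Hypothetical sketch, not the actual app.py code: each "... Dirty" key in
# data.json is assumed to feed the matching "... Dirty (%)" header, so adding
# a crawl snapshot means one new key per entry plus one new header string.
COLUMN_KEYS = ["Pile Dirty", "DCLM Dirty", "CC202505 Dirty", "CC202508 Dirty"]

def render_rows(entries):
    rows = []
    for entry in entries:
        # Link each benchmark name to its source dataset.
        name = f'<a href="{entry["URL"]}">{entry["Benchmark"]}</a>'
        cells = [name, entry["Category"]] + [str(entry[k]) for k in COLUMN_KEYS]
        rows.append("<tr>" + "".join(f"<td>{c}</td>" for c in cells) + "</tr>")
    return "\n".join(rows)

with open("data.json") as f:
    print(render_rows(json.load(f)))

Under that assumption, the only contract between the two files is that the key order here matches the header order in build_table.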