[

{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "URL": "https://huggingface.co/datasets/cais/mmlu"},

{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},

{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},

{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},

{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},

{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/cais/hle"},



{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},

{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "URL": "https://huggingface.co/datasets/openai/gsm8k"},

{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},

{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},



{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},

{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},

{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},

{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},

{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},



{"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},

{"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},

{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},

{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},

{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},

{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": ""},

{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},

{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},



{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},

{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}

]