Hao Xu
committed on
Commit · 7357a15
1 Parent(s): 30c3967
data update
app.py
CHANGED
@@ -188,12 +188,12 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
         'field_name': field_name.strip(),
         'hf_path': hf_path.strip() if has_hf else ''
     }
-
+    print(json.dumps(data))
     files = {}
     if has_jsonl:
         files['file'] = (benchmark_name.strip() + '.jsonl', open(jsonl_file.name, 'rb'), 'application/json')
 
-    response = requests.post(URL…
+    response = requests.post(f"{URL}/submit", data=data, files=files, timeout=30)
 
     if files:
         files['file'][1].close()
@@ -204,13 +204,13 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
             message = result.get('message', 'Submission successful!')
 
             full_message = f"{message}\n\n" \
-                           f"…
-                           f"…
-                           f"…
+                           f"✅ Your submission has been saved and will be processed automatically.\n" \
+                           f"Results will appear in the main leaderboard when ready.\n" \
+                           f"You can refresh the leaderboard to check for updates."
 
             return full_message
         elif result.get("status") == "info":
-            return f"…
+            return f"ℹ️ {result.get('message', 'Submission already exists')}"
         else:
             return f"❌ {result.get('message', 'Unknown error occurred')}"
     else:
@@ -258,9 +258,9 @@ with gr.Blocks() as interface:
 
     with gr.Row():
         jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
-        …
-        …
-        …
+        with gr.Column():
+            hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
+            hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
         field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
 
     submit_button = gr.Button("Submit for Contamination Check")
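
For reference, a minimal, self-contained sketch of the submission pattern this change moves to: a multipart POST of the form fields plus an optional .jsonl upload to a `/submit` endpoint, mirroring the `record_submission` logic above. The endpoint path, form keys, and response fields are taken from the diff; the base URL, the helper name `submit_benchmark`, and the error handling are illustrative assumptions, not the app's actual implementation.

```python
import json
import requests

# Illustrative placeholder; the real endpoint URL is defined elsewhere in app.py.
URL = "https://example-contamination-checker.example.com"


def submit_benchmark(benchmark_name, field_name, jsonl_path=None, hf_path=""):
    """Sketch of the /submit call pattern used by record_submission (assumed helper)."""
    data = {
        "benchmark_name": benchmark_name.strip(),
        "field_name": field_name.strip(),
        "hf_path": hf_path.strip(),
    }
    print(json.dumps(data))  # mirrors the debug print added in this commit

    files = {}
    if jsonl_path:
        # Multipart upload tuple: (filename, file object, content type)
        files["file"] = (benchmark_name.strip() + ".jsonl", open(jsonl_path, "rb"), "application/json")

    try:
        response = requests.post(f"{URL}/submit", data=data, files=files, timeout=30)
    finally:
        # Release the upload handle whether or not the request succeeds.
        if files:
            files["file"][1].close()

    if response.status_code == 200:
        result = response.json()
        return result.get("message", "Submission successful!")
    return f"HTTP {response.status_code}: {response.text}"
```

The try/finally here is just a defensive variant of the close-after-request step shown in the diff.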
data.json
CHANGED
@@ -1,31 +1,31 @@
 [
-{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.…
-{"Benchmark": "MMLU-…
-{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.…
-{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.…
-{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.…
-{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.…
-
-{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.…
-{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.…
-{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.…
-{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.…
-
-{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.…
-{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.…
-{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.…
-{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.…
-{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.…
-
-{"Benchmark": "ARC-…
-{"Benchmark": "ARC-…
-{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.…
-{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.…
-{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.…
-{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.…
-{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.…
-{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.…
-
-{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.…
-{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.…
+{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "URL": "https://huggingface.co/datasets/cais/mmlu"},
+{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
+{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
+{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
+{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
+{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/cais/hle"},
+
+{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
+{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
+{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
+{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
+
+{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
+{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
+{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
+{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
+{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
+
+{"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
+{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
+{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
+{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": ""},
+{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
+{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
+
+{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
+{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
 ]
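
The updated data.json now records three contamination rates per benchmark ("Pile Dirty", "DCLM Dirty", "CC202505 Dirty") plus a source URL, with blank lines separating categories. A minimal sketch of reading such a file into a table for display, assuming a pandas-based setup alongside the Gradio app above; the file name and column names come from this commit, while the sorting and column selection are illustrative choices.

```python
import json
import pandas as pd

# Load the leaderboard entries; blank separator lines inside the array are just whitespace to json.load.
with open("data.json") as f:
    rows = json.load(f)

df = pd.DataFrame(rows)

# Sort within each category by Pile contamination, highest first (an illustrative ordering).
df = df.sort_values(["Category", "Pile Dirty"], ascending=[True, False])

display_cols = ["Benchmark", "Category", "Pile Dirty", "DCLM Dirty", "CC202505 Dirty"]
print(df[display_cols].to_string(index=False))
```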