Hao Xu committed
Commit 7357a15 · 1 Parent(s): 30c3967

data update

Files changed (2):
  1. app.py +9 -9
  2. data.json +29 -29
app.py CHANGED
@@ -188,12 +188,12 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
         'field_name': field_name.strip(),
         'hf_path': hf_path.strip() if has_hf else ''
     }
-
+    print(json.dumps(data))
     files = {}
     if has_jsonl:
         files['file'] = (benchmark_name.strip() + '.jsonl', open(jsonl_file.name, 'rb'), 'application/json')
 
-    response = requests.post(URL + "/submit", data=data, files=files)
+    response = requests.post(f"{URL}/submit", data=data, files=files, timeout=30)
 
     if files:
         files['file'][1].close()
@@ -204,13 +204,13 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
             message = result.get('message', 'Submission successful!')
 
             full_message = f"{message}\n\n" \
-                           f"📋 Your submission has been saved and will be processed automatically.\n" \
-                           f"💡 Results will appear in the main leaderboard when ready.\n" \
-                           f"🔄 You can refresh the leaderboard to check for updates."
+                           f"✅ Your submission has been saved and will be processed automatically.\n" \
+                           f"Results will appear in the main leaderboard when ready.\n" \
+                           f"You can refresh the leaderboard to check for updates."
 
             return full_message
         elif result.get("status") == "info":
-            return f"ℹ️ {result.get('message', 'Submission already exists')}"
+            return f"❌ {result.get('message', 'Submission already exists')}"
         else:
             return f"❌ {result.get('message', 'Unknown error occurred')}"
     else:
@@ -258,9 +258,9 @@ with gr.Blocks() as interface:
 
     with gr.Row():
         jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
-        hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
-
-        hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
+        with gr.Column():
+            hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
+            hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
        field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
 
    submit_button = gr.Button("Submit for Contamination Check")
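
For reviewers who want to exercise the updated /submit call outside the Gradio app, a minimal sketch of the new request follows. The field_name and hf_path keys, the file tuple, and the 30-second timeout are taken from the diff above; the base URL, the benchmark values, and the benchmark_name/contributor key names are placeholders or assumptions.

import json
import requests

URL = "http://localhost:8000"  # placeholder; the real base URL is defined in app.py

# 'field_name' and 'hf_path' match the data dict in the diff; the other two
# key names are assumptions for illustration only.
data = {
    "benchmark_name": "demo_benchmark",
    "contributor": "example-user",
    "field_name": "question",
    "hf_path": "",
}
print(json.dumps(data))  # same debug print the commit adds

# A context manager closes the upload even if the request raises;
# app.py opens the handle itself and closes it manually after posting.
with open("demo_benchmark.jsonl", "rb") as fh:
    files = {"file": ("demo_benchmark.jsonl", fh, "application/json")}
    response = requests.post(f"{URL}/submit", data=data, files=files, timeout=30)

print(response.status_code, response.json().get("message"))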
data.json CHANGED
@@ -1,31 +1,31 @@
 [
-{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.20, "DCLM Dirty": 28.40, "CC202505 Dirty": 13.50},
-{"Benchmark": "MMLU-pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.50, "DCLM Dirty": 16.20, "CC202505 Dirty": 7.10},
-{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.10, "CC202505 Dirty": 1.40},
-{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.80, "DCLM Dirty": 3.10, "CC202505 Dirty": 2.70},
-{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.89},
-{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.30, "CC202505 Dirty": 0.10},
-
-{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
-{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 5.00},
-{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
-{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
-
-{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.20},
-{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
-
-{"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.80, "DCLM Dirty": 34.10, "CC202505 Dirty": 11.90},
-{"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.30, "DCLM Dirty": 31.70, "CC202505 Dirty": 5.40},
-{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": 1.00, "CC202505 Dirty": 0.10},
-{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": 14.60},
-{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.50, "CC202505 Dirty": 0.20},
-{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
-
-{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.00, "DCLM Dirty": 18.40, "CC202505 Dirty": 7.40},
-{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.80, "DCLM Dirty": 40.10, "CC202505 Dirty": 2.70}
+{"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 13.2, "DCLM Dirty": 28.4, "CC202505 Dirty": 13.5, "URL": "https://huggingface.co/datasets/cais/mmlu"},
+{"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 5.5, "DCLM Dirty": 16.2, "CC202505 Dirty": 7.1, "URL": "https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro"},
+{"Benchmark": "BBH", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.1, "CC202505 Dirty": 1.4, "URL": "https://github.com/suzgunmirac/BIG-Bench-Hard/tree/main/bbh"},
+{"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.8, "DCLM Dirty": 3.1, "CC202505 Dirty": 2.7, "URL": "https://github.com/ruixiangcui/AGIEval/tree/main/data/v1_1"},
+{"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.89, "URL": "https://huggingface.co/datasets/Idavidrein/gpqa"},
+{"Benchmark": "HLE", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.0, "DCLM Dirty": 0.3, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/cais/hle"},
+
+{"Benchmark": "AIME_2024", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 10.0, "URL": "https://huggingface.co/datasets/Maxwell-Jia/AIME_2024"},
+{"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 5.0, "URL": "https://huggingface.co/datasets/openai/gsm8k"},
+{"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.6, "DCLM Dirty": 3.2, "CC202505 Dirty": 0.6, "URL": "https://huggingface.co/datasets/HuggingFaceH4/MATH-500"},
+{"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 5.6, "URL": "https://huggingface.co/datasets/juletxara/mgsm"},
+
+{"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/openai/openai_humaneval"},
+{"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/evalplus/humanevalplus"},
+{"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/livecodebench/code_generation"},
+{"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/princeton-nlp/SWE-bench_Verified"},
+{"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.0, "DCLM Dirty": 0.4, "CC202505 Dirty": 1.0, "URL": "https://huggingface.co/datasets/google-research-datasets/mbpp"},
+
+{"Benchmark": "ARC-Challenge", "Category": "Commonsense Understanding", "Pile Dirty": 1.8, "DCLM Dirty": 34.1, "CC202505 Dirty": 11.9, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "ARC-Easy", "Category": "Commonsense Understanding", "Pile Dirty": 1.3, "DCLM Dirty": 31.7, "CC202505 Dirty": 5.4, "URL": "https://huggingface.co/datasets/allenai/ai2_arc"},
+{"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
+{"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
+{"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
+{"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": ""},
+{"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
+{"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
+
+{"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.0, "DCLM Dirty": 18.4, "CC202505 Dirty": 7.4, "URL": "https://huggingface.co/datasets/stanfordnlp/coqa"},
+{"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.8, "DCLM Dirty": 40.1, "CC202505 Dirty": 2.7, "URL": "https://huggingface.co/datasets/rajpurkar/squad"}
 ]
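
A small sketch of how the updated data.json, which now carries a per-benchmark "URL" field, could be loaded and displayed with linked benchmark names. The markdown-link rendering is an assumption about how the leaderboard might use the new field; this commit itself only changes the data file.

import json

import pandas as pd

with open("data.json") as f:
    rows = json.load(f)

df = pd.DataFrame(rows)

# Turn each benchmark name into a markdown link when a URL is present;
# entries with an empty URL (e.g., PIQA above) keep the plain name.
df["Benchmark"] = [
    f"[{name}]({url})" if url else name
    for name, url in zip(df["Benchmark"], df["URL"])
]

print(df[["Benchmark", "Category", "Pile Dirty", "DCLM Dirty", "CC202505 Dirty"]].head())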