Hao Xu committed · Commit 27c9b8f · 1 Parent(s): 3c856c0

data format update

app.py CHANGED
@@ -28,16 +28,24 @@ def build_table(source):
 
     return pd.DataFrame(entries).sort_values(by="Pile Dirty (%)", ascending=False)
 
-def record_submission(jsonl_file, hf_path, field_name):
+def record_submission(benchmark_name, jsonl_file, hf_path, hf_split, field_name):
     if jsonl_file is None and not hf_path:
         return "Please provide either a .jsonl file or a HuggingFace dataset path."
+    if hf_path and not hf_split:
+        return "Please provide a dataset split for the HuggingFace dataset."
+    if not field_name:
+        return "Please provide a field name."
+
 
     entry = {
-        "
-        "type": "
+        "name": benchmark_name,
+        "type": "jsonl" if jsonl_file else "hf",
+        "path": jsonl_file.name if jsonl_file else hf_path,
+        "split": hf_split if hf_path else None,
         "field_name": field_name,
     }
 
+
     queue_file = "pending_submissions.json"
     existing = []
     if os.path.exists(queue_file):

@@ -83,10 +91,13 @@ with gr.Blocks() as interface:
     with gr.Tab(label="Submission"):
         gr.Markdown("## Submit Your Dataset for Contamination Checking")
 
+        benchmark_name_input = gr.Textbox(label="Benchmark Name")
+
         with gr.Row():
             jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
             hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")
 
+        hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
         field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
 
         submit_button = gr.Button("Submit for Contamination Check")

@@ -94,7 +105,7 @@ with gr.Blocks() as interface:
 
         submit_button.click(
             fn=record_submission,
-            inputs=[jsonl_input, hf_path_input, field_name_input],
+            inputs=[benchmark_name_input, jsonl_input, hf_path_input, hf_split_input, field_name_input],
             outputs=result_output
         )
 
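The first hunk cuts off just as the queue file is opened, so the rest of record_submission is not visible in this diff. A minimal sketch of how the queue step presumably continues, assuming the entry is appended to pending_submissions.json and a confirmation string is returned (the helper name and message text are illustrative, not from the source):

import json
import os

def append_submission(entry, queue_file="pending_submissions.json"):
    # Hypothetical continuation of record_submission: load any existing
    # queue, append the new entry, and rewrite the file in place.
    existing = []
    if os.path.exists(queue_file):
        with open(queue_file, "r") as f:
            existing = json.load(f)
    existing.append(entry)
    with open(queue_file, "w") as f:
        json.dump(existing, f, indent=2)
    return "Submission recorded. It will be processed in the next batch."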
data.json CHANGED

@@ -1,31 +1,30 @@
 [
-    … (30 removed lines; the previous contents of data.json are not preserved in this view)
+    {"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 28.81, "CC202505 Dirty": -1},
+    {"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": -1, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": -1, "CC202505 Dirty": 2.49},
+    {"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": -1},
+
+    {"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
+    {"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
+    {"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
+    {"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
+
+    {"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+    {"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
+    {"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": 0.00},
+    {"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": 0.20},
+    {"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
+
+    {"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": -1},
+    {"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": -1},
+    {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": -1},
+    {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": -1},
+    {"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+
+    {"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": -1, "CC202505 Dirty": -1},
+    {"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": -1, "CC202505 Dirty": -1}
+]
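Note that build_table sorts by a "Pile Dirty (%)" column while the new data.json stores plain "Pile Dirty" keys, so the loader presumably renames the columns on the way in; -1 also looks like a sentinel for pairs that have not been measured yet. A minimal loading sketch under those two assumptions (the function name, the rename, and the NA handling are inferences, not confirmed by the diff):

import json
import pandas as pd

def load_leaderboard(path="data.json"):
    # Assumed loader: treat -1 as "not yet measured" and rename columns to
    # the "(%)" headers that build_table sorts by.
    with open(path) as f:
        entries = json.load(f)
    df = pd.DataFrame(entries).replace(-1, pd.NA)
    df = df.rename(columns={
        "Pile Dirty": "Pile Dirty (%)",
        "DCLM Dirty": "DCLM Dirty (%)",
        "CC202505 Dirty": "CC202505 Dirty (%)",
    })
    # Unmeasured entries sort to the bottom by default (na_position="last").
    return df.sort_values(by="Pile Dirty (%)", ascending=False)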