Hao Xu committed on
Commit
3c856c0
·
1 Parent(s): b58437b

leaderboard UI

Browse files
Files changed (4) hide show
  1. app.py +101 -0
  2. community_results.json +3 -0
  3. data.json +31 -0
  4. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ import pandas as pd
5
+
6
def load_data(source):
    """Load leaderboard entries for the given benchmark source.

    Args:
        source: Either "core" (bundled results in data.json) or
            "community" (user-submitted results in community_results.json).

    Returns:
        The parsed JSON list for a known source, or an empty list for
        any unrecognized source name.
    """
    # Dispatch table instead of an if/elif chain; unknown sources fall
    # through to the empty-list default, matching the original behavior.
    source_files = {
        "core": "data.json",
        "community": "community_results.json",
    }
    path = source_files.get(source)
    if path is None:
        return []
    with open(path, "r") as fh:
        return json.load(fh)
15
+
16
def build_table(source):
    """Build the leaderboard DataFrame for the given source.

    Args:
        source: Benchmark source name, passed through to load_data
            ("core" or "community").

    Returns:
        A pandas DataFrame sorted by "Pile Dirty (%)" descending. When
        the source has no entries (community_results.json starts out as
        an empty list), an empty DataFrame with the expected columns is
        returned instead of raising — pd.DataFrame([]) has no columns,
        so the original sort_values call crashed with KeyError.
    """
    columns = [
        "Benchmark",
        "Category",
        "Pile Dirty (%)",
        "DCLM Dirty (%)",
        # Renamed from "CC Dirty (%)" to match the `table_columns`
        # headers declared for the gr.Dataframe in the UI.
        "CC202505 Dirty (%)",
    ]

    data = load_data(source)
    entries = [
        {
            "Benchmark": entry.get("Benchmark", ""),
            "Category": entry.get("Category", ""),
            "Pile Dirty (%)": entry.get("Pile Dirty", ""),
            "DCLM Dirty (%)": entry.get("DCLM Dirty", ""),
            "CC202505 Dirty (%)": entry.get("CC202505 Dirty", ""),
        }
        for entry in data
    ]

    if not entries:
        # Empty sources must still render a well-formed (headered) table.
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(entries).sort_values(by="Pile Dirty (%)", ascending=False)
30
+
31
def record_submission(jsonl_file, hf_path, field_name):
    """Queue a dataset for contamination checking.

    Args:
        jsonl_file: An uploaded file object (or None). Only its .name
            attribute is read.
        hf_path: A HuggingFace dataset path string; takes precedence
            over jsonl_file when both are provided.
        field_name: Name of the context/question field in the dataset.

    Returns:
        A human-readable status message. Side effect: appends the
        submission to pending_submissions.json.
    """
    # Reject a submission that names no dataset at all.
    if not hf_path and jsonl_file is None:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."

    # HF path wins over the uploaded file, mirroring the original logic.
    if hf_path:
        source, kind = hf_path, "hf"
    else:
        source, kind = jsonl_file.name, "jsonl"

    submission = {
        "source": source,
        "type": kind,
        "field_name": field_name,
    }

    # Append to the on-disk queue, creating it on first use.
    queue_file = "pending_submissions.json"
    pending = []
    if os.path.exists(queue_file):
        with open(queue_file, "r") as fh:
            pending = json.load(fh)
    pending.append(submission)
    with open(queue_file, "w") as fh:
        json.dump(pending, fh, indent=2)

    return "βœ… Submission received! You'll be notified when processing is complete."
51
+
52
# Top-level Gradio UI: one Blocks app with a leaderboard tab and a
# submission tab. Runs at import time; interface.launch() at the bottom
# starts the server (standard layout for a HuggingFace Space).
with gr.Blocks() as interface:
    gr.Markdown("# πŸ“– Benchmark Contamination Bulletin")

    with gr.Tabs():
        # --- Leaderboard tab: radio selects the data source, table shows rates.
        with gr.Tab(label="Leaderboard"):

            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core"  # default matches the initial table value below
            )

            # NOTE(review): `headers` is only used by gr.Dataframe when the
            # value has no columns of its own; build_table emits a column
            # named "CC Dirty (%)", not "CC202505 Dirty (%)" — confirm the
            # intended column label.
            table_columns = ["Benchmark", "Category", "Pile Dirty (%)", "DCLM Dirty (%)", "CC202505 Dirty (%)"]
            leaderboard_table = gr.Dataframe(
                value=build_table("core"),
                headers=table_columns,
                interactive=False,
                wrap=True,
                label="Dirty Rates"
            )

            # Thin wrapper so the radio's change event rebuilds the table
            # for the newly selected source.
            def update_table(source):
                return build_table(source)

            source_radio.change(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_table
            )

        # Submission Tab
        with gr.Tab(label="Submission"):
            gr.Markdown("## Submit Your Dataset for Contamination Checking")

            # Either input is acceptable; record_submission prefers the
            # HF path when both are filled in.
            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")

            field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")

            submit_button = gr.Button("Submit for Contamination Check")
            # Read-only box that displays record_submission's status string.
            result_output = gr.Textbox(label="Submission Status", interactive=False)

            submit_button.click(
                fn=record_submission,
                inputs=[jsonl_input, hf_path_input, field_name_input],
                outputs=result_output
            )

interface.launch()
community_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [
2
+
3
+ ]
data.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {"Benchmark": "MMLU", "Category": "Knowledge and Reasoning", "Pile Dirty": 14.57, "DCLM Dirty": 0, "CC202505 Dirty": 0},
3
+ {"Benchmark": "MMLU-Pro", "Category": "Knowledge and Reasoning", "Pile Dirty": 6.87, "DCLM Dirty": 6.87, "CC202505 Dirty": 6.87},
4
+ {"Benchmark": "Big-Bench-Hard", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.44, "DCLM Dirty": 0.44, "CC202505 Dirty": 0.44},
5
+ {"Benchmark": "AGIEval", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.46, "DCLM Dirty": 0.46, "CC202505 Dirty": 0.46},
6
+ {"Benchmark": "GPQA", "Category": "Knowledge and Reasoning", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
7
+
8
+ {"Benchmark": "AIME-2024", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 10.00},
9
+ {"Benchmark": "GSM8K", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.38, "CC202505 Dirty": 5.76},
10
+ {"Benchmark": "MATH-500", "Category": "Math", "Pile Dirty": 0.60, "DCLM Dirty": 3.20, "CC202505 Dirty": 0.60},
11
+ {"Benchmark": "MGSM", "Category": "Math", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 5.60},
12
+
13
+ {"Benchmark": "HumanEval", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
14
+ {"Benchmark": "HumanEval+", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
15
+ {"Benchmark": "LiveCodeBench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.00},
16
+ {"Benchmark": "SWE-bench", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0.20},
17
+ {"Benchmark": "MBPP", "Category": "Code", "Pile Dirty": 0.00, "DCLM Dirty": 0.40, "CC202505 Dirty": 1.00},
18
+
19
+ {"Benchmark": "ARC-C", "Category": "Commonsense Understanding", "Pile Dirty": 1.79, "DCLM Dirty": 34.30, "CC202505 Dirty": 0},
20
+ {"Benchmark": "ARC-E", "Category": "Commonsense Understanding", "Pile Dirty": 1.64, "DCLM Dirty": 32.38, "CC202505 Dirty": 0},
21
+ {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.09, "DCLM Dirty": 0.88, "CC202505 Dirty": 0},
22
+ {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.01, "DCLM Dirty": 0, "CC202505 Dirty": 0},
23
+ {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.80, "DCLM Dirty": 15.60, "CC202505 Dirty": 0},
24
+ {"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
25
+ {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.10, "DCLM Dirty": 0.0, "CC202505 Dirty": 0},
26
+ {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.00, "DCLM Dirty": 0.00, "CC202505 Dirty": 0},
27
+
28
+ {"Benchmark": "CoQA", "Category": "Reading Comprehension", "Pile Dirty": 8.20, "DCLM Dirty": 0, "CC202505 Dirty": 0},
29
+ {"Benchmark": "SQuAD", "Category": "Reading Comprehension", "Pile Dirty": 2.97, "DCLM Dirty": 0, "CC202505 Dirty": 0}
30
+ ]
31
+
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ huggingface_hub==0.14.1
2
+ pandas