Hao Xu committed on
Commit
d56bc01
·
1 Parent(s): 490d0b1
Files changed (2) hide show
  1. app.py +58 -38
  2. data.json +0 -1
app.py CHANGED
@@ -31,7 +31,6 @@ def load_data(source, refresh=False):
31
 
32
  def build_table(source, refresh=False):
33
  data = load_data(source, refresh)
34
- entries = []
35
 
36
  if source == "core":
37
  headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)"]
@@ -40,19 +39,25 @@ def build_table(source, refresh=False):
40
 
41
  html = """
42
  <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
43
- <thead><tr>
 
44
  """
45
  for col in headers:
46
- html += f'<th style="border: 1px solid #ddd; padding: 8px; text-align: right;" onclick="sortTable(this)">{col} <span class="triangle"></span></th>'
47
- html += '</tr></thead>\n<tbody>\n'
 
 
 
 
 
 
 
 
48
 
49
  for entry in data:
50
  name = entry.get("Benchmark", "")
51
  url = entry.get("URL", "#")
52
- if url:
53
- hyperlink = f'<a href="{url}" target="_blank">{name}</a>'
54
- else:
55
- hyperlink = name
56
 
57
  row = {
58
  "Benchmark": hyperlink,
@@ -70,10 +75,12 @@ def build_table(source, refresh=False):
70
  for col in headers:
71
  val = row.get(col, "")
72
  if isinstance(val, float) and val >= 0:
73
- val = f"{val:5.1f}"
 
74
  elif isinstance(val, float):
75
- val = "N/A"
76
- html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val}</td>'
 
77
  html += "</tr>\n"
78
 
79
  html += "</tbody></table>"
@@ -83,52 +90,67 @@ def build_table(source, refresh=False):
83
  let sortDirection = {};
84
 
85
  function sortTable(header) {
86
- var table = document.getElementById("benchmarkTable");
87
- var rows = Array.from(table.rows).slice(1);
88
- var columnIndex = Array.from(header.parentNode.children).indexOf(header);
89
- var isAscending = sortDirection[columnIndex] === 'ascending';
90
-
91
  sortDirection[columnIndex] = isAscending ? 'descending' : 'ascending';
92
 
93
- var allHeaders = header.parentNode.children;
94
- Array.from(allHeaders).forEach(th => {
95
- th.querySelector('.triangle').classList.remove('ascending', 'descending');
 
 
96
  });
97
 
98
- header.querySelector('.triangle').classList.add(sortDirection[columnIndex]);
99
-
100
- rows.sort(function(rowA, rowB) {
101
- var cellA = rowA.cells[columnIndex].innerText;
102
- var cellB = rowB.cells[columnIndex].innerText;
103
 
 
 
 
104
  if (isNaN(cellA)) {
105
  return isAscending ? cellA.localeCompare(cellB) : cellB.localeCompare(cellA);
106
  }
107
  return isAscending ? parseFloat(cellA) - parseFloat(cellB) : parseFloat(cellB) - parseFloat(cellA);
108
  });
109
 
110
- for (var i = 0; i < rows.length; i++) {
111
- table.appendChild(rows[i]);
112
- }
113
  }
114
  </script>
115
  """
116
 
117
  html += """
118
  <style>
119
- .triangle {
 
 
 
120
  display: inline-block;
 
 
 
 
 
121
  width: 0;
122
  height: 0;
 
123
  border-left: 5px solid transparent;
124
  border-right: 5px solid transparent;
125
- margin-left: 5px;
126
- transition: transform 0.2s;
127
  }
128
- .ascending {
 
 
 
 
 
 
129
  border-bottom: 5px solid #000;
130
  }
131
- .descending {
132
  border-top: 5px solid #000;
133
  }
134
  </style>
@@ -231,9 +253,9 @@ with gr.Blocks() as interface:
231
  gr.HTML(
232
  '''<h1 text-align="center">📖 Benchmark Contamination Monitoring System</h1>
233
 
234
- <p style='font-size: 16px;'>This system monitors potential contamination in benchmark datasets used for evaluating language models across various open-source corpora.</p>
235
  <p style='font-size: 16px;'>The system is released along with our paper Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index, which documents the methodology and findings in detail.</p>
236
- <p style='font-size: 16px;'>We invite the community to contribute by submitting new benchmarks for contamination analysis using the form available in the <b>"Add New Benchmarks"</b> tab.</p>
237
  '''
238
  )
239
 
@@ -244,11 +266,9 @@ with gr.Blocks() as interface:
244
  gr.Markdown('''
245
  The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.
246
 
247
- - Benchmarks analyzed in our accompanying paper are listed under the **core** source.
248
- - User-submitted benchmarks appear under the **community** source.
249
- - The contamination rate represents the percentage of benchmark entries identified as *dirty* based on our detection criteria.
250
  - The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
251
- - You can sort the results by clicking on the column headers.
252
  ''')
253
 
254
  source_radio = gr.Radio(
 
31
 
32
  def build_table(source, refresh=False):
33
  data = load_data(source, refresh)
 
34
 
35
  if source == "core":
36
  headers = ["Benchmark", "Category", "Pile-train Dirty (%)", "DCLM-baseline Dirty (%)", "CC-2025-05 Dirty (%)"]
 
39
 
40
  html = """
41
  <table id="benchmarkTable" style="border-collapse: collapse; width: 100%;">
42
+ <thead>
43
+ <tr>
44
  """
45
  for col in headers:
46
+ html += f'''
47
+ <th onclick="sortTable(this)" style="cursor: pointer; border: 1px solid #ddd; padding: 8px; text-align: right;">
48
+ {col}
49
+ <span class="tri-container">
50
+ <span class="triangle-up"></span>
51
+ <span class="triangle-down"></span>
52
+ </span>
53
+ </th>
54
+ '''
55
+ html += "</tr></thead><tbody>"
56
 
57
  for entry in data:
58
  name = entry.get("Benchmark", "")
59
  url = entry.get("URL", "#")
60
+ hyperlink = f'<a href="{url}" target="_blank">{name}</a>' if url else name
 
 
 
61
 
62
  row = {
63
  "Benchmark": hyperlink,
 
75
  for col in headers:
76
  val = row.get(col, "")
77
  if isinstance(val, float) and val >= 0:
78
+ val_display = f"{val:5.1f}"
79
+ html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val_display}</td>'
80
  elif isinstance(val, float):
81
+ html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">N/A</td>'
82
+ else:
83
+ html += f'<td style="border: 1px solid #ddd; padding: 8px; text-align: right;">{val}</td>'
84
  html += "</tr>\n"
85
 
86
  html += "</tbody></table>"
 
90
  let sortDirection = {};
91
 
92
  function sortTable(header) {
93
+ const table = document.getElementById("benchmarkTable");
94
+ const rows = Array.from(table.tBodies[0].rows);
95
+ const columnIndex = Array.from(header.parentNode.children).indexOf(header);
96
+ const isAscending = sortDirection[columnIndex] === 'ascending';
 
97
  sortDirection[columnIndex] = isAscending ? 'descending' : 'ascending';
98
 
99
+ Array.from(header.parentNode.children).forEach(th => {
100
+ const up = th.querySelector('.triangle-up');
101
+ const down = th.querySelector('.triangle-down');
102
+ if (up) up.classList.remove('active');
103
+ if (down) down.classList.remove('active');
104
  });
105
 
106
+ if (sortDirection[columnIndex] === 'ascending') {
107
+ header.querySelector('.triangle-up').classList.add('active');
108
+ } else {
109
+ header.querySelector('.triangle-down').classList.add('active');
110
+ }
111
 
112
+ rows.sort((rowA, rowB) => {
113
+ const cellA = rowA.cells[columnIndex].innerText;
114
+ const cellB = rowB.cells[columnIndex].innerText;
115
  if (isNaN(cellA)) {
116
  return isAscending ? cellA.localeCompare(cellB) : cellB.localeCompare(cellA);
117
  }
118
  return isAscending ? parseFloat(cellA) - parseFloat(cellB) : parseFloat(cellB) - parseFloat(cellA);
119
  });
120
 
121
+ rows.forEach(row => table.tBodies[0].appendChild(row));
 
 
122
  }
123
  </script>
124
  """
125
 
126
  html += """
127
  <style>
128
+ thead tr {
129
+ background-color: #f0f0f0;
130
+ }
131
+ .tri-container {
132
  display: inline-block;
133
+ margin-left: 4px;
134
+ vertical-align: middle;
135
+ }
136
+ .triangle-up, .triangle-down {
137
+ display: block;
138
  width: 0;
139
  height: 0;
140
+ margin: 1px auto;
141
  border-left: 5px solid transparent;
142
  border-right: 5px solid transparent;
 
 
143
  }
144
+ .triangle-up {
145
+ border-bottom: 5px solid #999;
146
+ }
147
+ .triangle-down {
148
+ border-top: 5px solid #999;
149
+ }
150
+ .triangle-up.active {
151
  border-bottom: 5px solid #000;
152
  }
153
+ .triangle-down.active {
154
  border-top: 5px solid #000;
155
  }
156
  </style>
 
253
  gr.HTML(
254
  '''<h1 text-align="center">📖 Benchmark Contamination Monitoring System</h1>
255
 
256
+ <p style='font-size: 16px;'>This system monitors potential contamination in benchmark datasets used for evaluating language models across various open-source corpora 🧐.</p>
257
  <p style='font-size: 16px;'>The system is released along with our paper Infini-gram mini: Exact n-gram Search at the Internet Scale with FM-Index, which documents the methodology and findings in detail.</p>
258
+ <p style='font-size: 16px;'>We welcome the community to submit new benchmarks for contamination analysis using the <b>"Add New Benchmarks"</b> tab.</p>
259
  '''
260
  )
261
 
 
266
  gr.Markdown('''
267
  The **Benchmark Contamination Bulletin** presents contamination statistics for evaluation benchmarks across different data sources.
268
 
269
+ - Benchmarks analyzed in our paper are under the **core** source. Community-submitted benchmarks appear under the **community** source.
270
+ - The contamination rate represents the percentage of *dirty* benchmark entries.
 
271
  - The bulletin will be updated regularly to include contamination checks on newly released Common Crawl dumps.
 
272
  ''')
273
 
274
  source_radio = gr.Radio(
data.json CHANGED
@@ -22,7 +22,6 @@
22
  {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
23
  {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
24
  {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
25
- {"Benchmark": "PIQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": ""},
26
  {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
27
  {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
28
 
 
22
  {"Benchmark": "CSQA", "Category": "Commonsense Understanding", "Pile Dirty": 0.1, "DCLM Dirty": 1.0, "CC202505 Dirty": 0.1, "URL": "https://huggingface.co/datasets/tau/commonsense_qa"},
23
  {"Benchmark": "HellaSwag", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/Rowan/hellaswag"},
24
  {"Benchmark": "OpenbookQA", "Category": "Commonsense Understanding", "Pile Dirty": 10.8, "DCLM Dirty": 15.6, "CC202505 Dirty": 14.6, "URL": "https://huggingface.co/datasets/allenai/openbookqa"},
 
25
  {"Benchmark": "Social IQa", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.5, "CC202505 Dirty": 0.2, "URL": "https://huggingface.co/datasets/allenai/social_i_qa"},
26
  {"Benchmark": "WinoGrande", "Category": "Commonsense Understanding", "Pile Dirty": 0.0, "DCLM Dirty": 0.0, "CC202505 Dirty": 0.0, "URL": "https://huggingface.co/datasets/allenai/winogrande"},
27