File size: 3,320 Bytes
3c856c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
import os
import json
import pandas as pd

def load_data(source):
    """Load the raw benchmark records for one leaderboard source.

    Args:
        source: ``"core"`` reads ``data.json``; ``"community"`` reads
            ``community_results.json``. Both paths are resolved relative to
            the current working directory.

    Returns:
        The decoded JSON content of the selected file, or an empty list when
        ``source`` is not one of the known names.

    Raises:
        FileNotFoundError: If the file for a known source does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    source_files = {
        "core": "data.json",
        "community": "community_results.json",
    }
    path = source_files.get(source)
    if path is None:
        return []

    # JSON is UTF-8 by specification; relying on the locale default encoding
    # breaks non-ASCII benchmark names on hosts whose locale is not UTF-8.
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)

def build_table(source):
    """Build the leaderboard table for the given source.

    Each record returned by ``load_data(source)`` is projected onto the five
    display columns; a key missing from a record becomes an empty string.

    Args:
        source: Source name forwarded unchanged to ``load_data``.

    Returns:
        A ``pandas.DataFrame`` with the columns "Benchmark", "Category",
        "Pile Dirty (%)", "DCLM Dirty (%)" and "CC Dirty (%)", ordered by
        "Pile Dirty (%)" from highest to lowest. Rows whose Pile value is
        missing or not numeric are placed last.
    """
    # Display label -> key in the raw record.
    column_keys = {
        "Benchmark": "Benchmark",
        "Category": "Category",
        "Pile Dirty (%)": "Pile Dirty",
        "DCLM Dirty (%)": "DCLM Dirty",
        "CC Dirty (%)": "CC202505 Dirty",
    }
    rows = [
        {label: entry.get(key, "") for label, key in column_keys.items()}
        for entry in load_data(source)
    ]

    # Passing the columns explicitly keeps the sort column present even when
    # there are no rows; otherwise sort_values raises KeyError on an empty
    # frame.
    table = pd.DataFrame(rows, columns=list(column_keys))

    # Sort through a numeric view of the column so the "" placeholder used
    # for missing values cannot trigger a str/float comparison error. The
    # stored values themselves are left untouched.
    # NOTE(review): assumes the dirty rates are stored as numbers (or numeric
    # strings) in the JSON files -- confirm against the data.
    return table.sort_values(
        by="Pile Dirty (%)",
        key=lambda column: pd.to_numeric(column, errors="coerce"),
        ascending=False,
    )

def record_submission(jsonl_file, hf_path, field_name):
    """Append a contamination-check request to the pending-submissions queue.

    Args:
        jsonl_file: The uploaded ``.jsonl`` file, or ``None`` when nothing
            was uploaded. Either a path string or an object exposing the
            path as ``.name`` is accepted.
        hf_path: HuggingFace dataset path. When non-blank it takes precedence
            over ``jsonl_file``.
        field_name: Name of the dataset field to check; stored with the
            request (surrounding whitespace is removed).

    Returns:
        A status message for the UI: a prompt to supply an input when neither
        a file nor a dataset path was given, otherwise a confirmation.

    Raises:
        json.JSONDecodeError: If ``pending_submissions.json`` exists, is not
            empty, and does not contain valid JSON.
    """
    # Treat a whitespace-only path the same as no path at all.
    if isinstance(hf_path, str):
        hf_path = hf_path.strip()
    if isinstance(field_name, str):
        field_name = field_name.strip()

    if jsonl_file is None and not hf_path:
        return "Please provide either a .jsonl file or a HuggingFace dataset path."

    if hf_path:
        source, source_type = hf_path, "hf"
    else:
        # Depending on the Gradio version / `type` setting of gr.File, the
        # upload arrives as a plain path string or as a file-like object whose
        # path is in `.name`; support both instead of assuming `.name`.
        source = str(getattr(jsonl_file, "name", jsonl_file))
        source_type = "jsonl"

    entry = {
        "source": source,
        "type": source_type,
        "field_name": field_name,
    }

    queue_file = "pending_submissions.json"
    existing = []
    if os.path.exists(queue_file):
        with open(queue_file, "r", encoding="utf-8") as f:
            raw = f.read()
        # An empty queue file is an empty queue, not a decoding error.
        if raw.strip():
            existing = json.loads(raw)
    existing.append(entry)
    with open(queue_file, "w", encoding="utf-8") as f:
        json.dump(existing, f, indent=2)

    return "✅ Submission received! You'll be notified when processing is complete."

# Two-tab Gradio UI: a read-only leaderboard of dirty rates and a form that
# queues datasets for contamination checking.
with gr.Blocks() as interface:
    gr.Markdown("# 📖 Benchmark Contamination Bulletin")

    with gr.Tabs():
        # Leaderboard Tab
        with gr.Tab(label="Leaderboard"):

            source_radio = gr.Radio(
                choices=["core", "community"],
                label="Select Benchmark Source",
                value="core"
            )

            # Must match the column labels of the frame returned by
            # build_table (its last column is "CC Dirty (%)").
            table_columns = ["Benchmark", "Category", "Pile Dirty (%)", "DCLM Dirty (%)", "CC Dirty (%)"]
            leaderboard_table = gr.Dataframe(
                value=build_table("core"),
                headers=table_columns,
                interactive=False,
                wrap=True,
                label="Dirty Rates"
            )

            def update_table(source):
                """Rebuild the leaderboard for the newly selected source."""
                return build_table(source)

            # Refresh the table whenever the source selection changes.
            source_radio.change(
                fn=update_table,
                inputs=source_radio,
                outputs=leaderboard_table
            )

        # Submission Tab
        with gr.Tab(label="Submission"):
            gr.Markdown("## Submit Your Dataset for Contamination Checking")

            with gr.Row():
                jsonl_input = gr.File(label="Upload .jsonl File", file_types=[".jsonl"])
                hf_path_input = gr.Textbox(label="HuggingFace Dataset Path")

            field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")

            submit_button = gr.Button("Submit for Contamination Check")
            result_output = gr.Textbox(label="Submission Status", interactive=False)

            # The input order must match record_submission's parameters:
            # (jsonl_file, hf_path, field_name).
            submit_button.click(
                fn=record_submission,
                inputs=[jsonl_input, hf_path_input, field_name_input],
                outputs=result_output
            )

interface.launch()