Spaces:

infini-gram-mini
/

Benchmark-Contamination-Monitoring-System

Running

App Files Community

Hao Xu committed 28 days ago

Commit

04cf55a

1 Parent(s): 73b542b

handling datasets with config

Browse files

Files changed (1) hide show

app.py +7 -3

app.py CHANGED Viewed

@@ -137,7 +137,7 @@ def build_table(source, refresh=False):
     return html
-def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split, field_name):
     if not benchmark_name or not benchmark_name.strip():
         return "❌ Please provide a benchmark name."
@@ -177,7 +177,10 @@ def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split
             return "❌ Please provide a dataset split for the HuggingFace dataset."
         try:
-            dataset_info = load_dataset(hf_path.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
             first_item = next(iter(dataset_info))
             if field_name.strip() not in first_item:
                 available_fields = list(first_item.keys())
@@ -303,6 +306,7 @@ with gr.Blocks() as interface:
                 with gr.Column():
                     hf_path_input = gr.Textbox(label="HuggingFace Dataset Path", placeholder="e.g., author/benchmark-name")
                     hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
             field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
             submit_button = gr.Button("Submit for Contamination Check")
@@ -310,7 +314,7 @@ with gr.Blocks() as interface:
             submit_button.click(
                 fn=record_submission,
-                inputs=[benchmark_name_input, contributor_input, jsonl_input, hf_path_input, hf_split_input, field_name_input],
                 outputs=result_output
             )

     return html
+def record_submission(benchmark_name, contributor, jsonl_file, hf_path, hf_split, field_name, hf_config):
     if not benchmark_name or not benchmark_name.strip():
         return "❌ Please provide a benchmark name."
             return "❌ Please provide a dataset split for the HuggingFace dataset."
         try:
+            if hf_config and hf_config.strip():
+                dataset_info = load_dataset(hf_path.strip(), hf_config.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
+            else:
+                dataset_info = load_dataset(hf_path.strip(), split=hf_split.strip(), streaming=True, trust_remote_code=True)
             first_item = next(iter(dataset_info))
             if field_name.strip() not in first_item:
                 available_fields = list(first_item.keys())
                 with gr.Column():
                     hf_path_input = gr.Textbox(label="HuggingFace Dataset Path", placeholder="e.g., author/benchmark-name")
                     hf_split_input = gr.Textbox(label="Dataset split (only if providing HuggingFace Dataset)", placeholder="e.g., validation, test")
+                    hf_config_input = gr.Textbox(label="Dataset Config (optional)", placeholder="name of dataset config")
             field_name_input = gr.Textbox(label="Context or Question Field Name", placeholder="e.g., context, question, ...")
             submit_button = gr.Button("Submit for Contamination Check")
             submit_button.click(
                 fn=record_submission,
+                inputs=[benchmark_name_input, contributor_input, jsonl_input, hf_path_input, hf_split_input, field_name_input, hf_config_input],
                 outputs=result_output
             )