Enderchef committed
Commit cda939c · verified · 1 parent: 05331fd

Update app.py

Files changed (1): app.py (+33 -66)
app.py CHANGED
```diff
@@ -321,10 +321,10 @@ def save_text(text_content):
         gr.Error(f"Error saving file: {e}")
         return None
 
-def load_leaderboard():
+def load_leaderboard(benchmark_filter):
     """
-    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for MMLU and MMLU-Pro,
-    and prepares data for two separate leaderboard tables.
+    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for the selected benchmark,
+    and prepares data for the leaderboard table.
     """
     try:
         df = pd.read_json("eval.jsonl", lines=True)
@@ -335,68 +335,36 @@ def load_leaderboard():
 
         if df.empty:
             gr.Warning("No valid evaluation data found to populate the leaderboard.")
-            # Return empty dataframes for both MMLU and MMLU-Pro
-            return (
-                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-            )
-
-        # Filter for MMLU data
-        df_mmlu = df[df['benchmark'] == 'MMLU']
-        if 'subject' in df_mmlu.columns:
-            # For MMLU, if "ALL" subjects are evaluated, consider the overall accuracy.
-            # Otherwise, average specific subject accuracies.
-            df_mmlu_grouped = df_mmlu[df_mmlu['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
-            # If a model only has specific subject evaluations, average those.
-            # This is a simplification; a more robust approach might be to calculate weighted average.
-            # For now, if "ALL" exists, we use that; otherwise, we average available subjects.
-
-            # If no 'ALL' subject records, average across available subjects for MMLU
-            if df_mmlu_grouped.empty:
-                df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
-        else:  # Handle older eval.jsonl without 'subject' column or if only MMLU was run
-            df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
-
-        df_mmlu_grouped.columns = ["Model ID", "Average Accuracy (%)"]
-        df_mmlu_sorted = df_mmlu_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
-
-        # Filter for MMLU-Pro data
-        df_mmlu_pro = df[df['benchmark'] == 'MMLU-Pro']
-        if 'subject' in df_mmlu_pro.columns:
-            df_mmlu_pro_grouped = df_mmlu_pro[df_mmlu_pro['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
-            if df_mmlu_pro_grouped.empty:
-                df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
-        else:  # Handle older eval.jsonl
-            df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
-
-        df_mmlu_pro_grouped.columns = ["Model ID", "Average Accuracy (%)"]
-        df_mmlu_pro_sorted = df_mmlu_pro_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
-
-        # Return two dataframes as lists of dictionaries
-        return df_mmlu_sorted.to_dict('records'), df_mmlu_pro_sorted.to_dict('records')
+            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+
+        # Filter data based on the selected benchmark
+        df_filtered = df[df['benchmark'] == benchmark_filter]
+
+        if df_filtered.empty:
+            gr.Warning(f"No evaluation data for {benchmark_filter} found yet.")
+            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+
+        # For the leaderboard, we typically want the average across all subjects within that benchmark.
+        # So we group by model_id and take the mean of accuracy.
+        df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
+        df_grouped.columns = ["Model ID", "Average Accuracy (%)"]
+        df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
+
+        return df_sorted.to_dict('records')
 
     except FileNotFoundError:
         gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
-        return (
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-        )
+        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
     except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc()  # Print full traceback for debugging
-        return (
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-        )
+        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
 
 
 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
     /* Import Google Font - Inter */
-    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+    @import url('https://fonts.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
 
     /* General body and container styling */
     body {
```
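Besides the signature change, the aggregation itself is simpler: the old special-casing of subject == 'ALL' rows is gone, and every eval.jsonl row for the chosen benchmark is averaged per model. Below is a minimal sketch of that aggregation on invented data; the column names (model_id, benchmark, subject, accuracy) come from the diff, while the models and scores are hypothetical.

```python
import pandas as pd

# Hypothetical eval.jsonl records; column names match the diff, values are invented.
records = [
    {"model_id": "org/model-a", "benchmark": "MMLU", "subject": "ALL", "accuracy": 61.2},
    {"model_id": "org/model-b", "benchmark": "MMLU", "subject": "physics", "accuracy": 55.0},
    {"model_id": "org/model-b", "benchmark": "MMLU", "subject": "law", "accuracy": 49.0},
    {"model_id": "org/model-a", "benchmark": "MMLU-Pro", "subject": "ALL", "accuracy": 38.9},
]
df = pd.DataFrame(records)

# Same steps as the new load_leaderboard("MMLU"): filter one benchmark,
# then average accuracy per model across whatever subject rows exist.
df_filtered = df[df["benchmark"] == "MMLU"]
df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
df_grouped.columns = ["Model ID", "Average Accuracy (%)"]
df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

print(df_sorted.to_dict("records"))
# [{'Model ID': 'org/model-a', 'Average Accuracy (%)': 61.2},
#  {'Model ID': 'org/model-b', 'Average Accuracy (%)': 52.0}]
```

Note that this changes scores for models that logged both an 'ALL' row and per-subject rows: the old code preferred the 'ALL' accuracy, while the new code averages all rows together.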
```diff
@@ -728,31 +696,30 @@ with gr.Blocks(css="""
         </div>
         """)
 
-        # MMLU Leaderboard Table
-        gr.Markdown("### MMLU Top Models")
-        mmlu_leaderboard_table = gr.Dataframe(
-            headers=["Model ID", "Average Accuracy (%)"],
-            interactive=False,
-            datatype=["str", "number"],
-            row_count=10,
-            col_count=2,
-            label="MMLU Leaderboard Data",
-            elem_classes="leaderboard-table"  # Apply custom class for styling
+        # Leaderboard Type Toggle
+        leaderboard_type_toggle = gr.Radio(
+            ["MMLU", "MMLU-Pro"],
+            label="Select Benchmark for Leaderboard",
+            value="MMLU",  # Default to MMLU
+            interactive=True,
+            container=False,  # Make it inline with content
+            elem_id="leaderboard-toggle"
         )
 
-        gr.Markdown("### MMLU-Pro Top Models")
-        mmlu_pro_leaderboard_table = gr.Dataframe(
+        # Leaderboard Table
+        leaderboard_table_output = gr.Dataframe(
             headers=["Model ID", "Average Accuracy (%)"],
             interactive=False,
             datatype=["str", "number"],
             row_count=10,
             col_count=2,
-            label="MMLU-Pro Leaderboard Data",
+            label="Benchmark Leaderboard Data",
             elem_classes="leaderboard-table"  # Apply custom class for styling
         )
 
-        # Load leaderboard when the tab is selected or when the app loads
-        demo.load(load_leaderboard, inputs=[], outputs=[mmlu_leaderboard_table, mmlu_pro_leaderboard_table])
+        # Initial load and dynamic update for the leaderboard
+        demo.load(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
+        leaderboard_type_toggle.change(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
 
 # Launch the Gradio app
 demo.launch()
```
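On the UI side, the two stacked tables are replaced by one gr.Radio toggle driving a single gr.Dataframe, bound through both demo.load (initial render) and the radio's change event. Here is that pattern reduced to a self-contained sketch; the stub loader and sample rows are placeholders, not code from app.py.

```python
import gradio as gr

def load_leaderboard(benchmark_filter):
    # Stand-in for the real loader: return leaderboard rows for one benchmark.
    sample = {
        "MMLU": [["org/model-a", 61.2], ["org/model-b", 52.0]],
        "MMLU-Pro": [["org/model-a", 38.9]],
    }
    return sample.get(benchmark_filter, [])

with gr.Blocks() as demo:
    toggle = gr.Radio(["MMLU", "MMLU-Pro"], value="MMLU",
                      label="Select Benchmark for Leaderboard")
    table = gr.Dataframe(headers=["Model ID", "Average Accuracy (%)"],
                         interactive=False)
    # Fill the table once at page load, then refresh it whenever the radio changes.
    demo.load(load_leaderboard, inputs=[toggle], outputs=[table])
    toggle.change(load_leaderboard, inputs=[toggle], outputs=[table])

demo.launch()
```

Binding the same callback to both events is what keeps the table consistent: load populates it with the default radio value on page load, and change re-queries whenever the user flips benchmarks.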
 