Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -321,10 +321,10 @@ def save_text(text_content):
         gr.Error(f"Error saving file: {e}")
         return None

-def load_leaderboard():
     """
-    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for
-    and prepares data for
     """
     try:
         df = pd.read_json("eval.jsonl", lines=True)
@@ -335,68 +335,36 @@ def load_leaderboard():

         if df.empty:
             gr.Warning("No valid evaluation data found to populate the leaderboard.")
-
-            return (
-                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-                pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-            )
-
-        # Filter for MMLU data
-        df_mmlu = df[df['benchmark'] == 'MMLU']
-        if 'subject' in df_mmlu.columns:
-            # For MMLU, if "ALL" subjects are evaluated, consider the overall accuracy.
-            # Otherwise, average specific subject accuracies.
-            df_mmlu_grouped = df_mmlu[df_mmlu['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
-            # If a model only has specific subject evaluations, average those.
-            # This is a simplification; a more robust approach might be to calculate weighted average.
-            # For now, if "ALL" exists, we use that; otherwise, we average available subjects.
-
-            # If no 'ALL' subject records, average across available subjects for MMLU
-            if df_mmlu_grouped.empty:
-                df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()

-
-


-
-

-
-        df_mmlu_pro = df[df['benchmark'] == 'MMLU-Pro']
-        if 'subject' in df_mmlu_pro.columns:
-            df_mmlu_pro_grouped = df_mmlu_pro[df_mmlu_pro['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
-            if df_mmlu_pro_grouped.empty:
-                df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
-        else: # Handle older eval.jsonl
-            df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
-
-
-        df_mmlu_pro_grouped.columns = ["Model ID", "Average Accuracy (%)"]
-        df_mmlu_pro_sorted = df_mmlu_pro_grouped.sort_values(by="Average Accuracy (%)", ascending=False)
-
-        # Return two dataframes as lists of dictionaries
-        return df_mmlu_sorted.to_dict('records'), df_mmlu_pro_sorted.to_dict('records')

     except FileNotFoundError:
         gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
-        return (
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-        )
     except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc() # Print full traceback for debugging
-        return (
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
-            pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
-        )


 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
     /* Import Google Font - Inter */
-    @import url('https://fonts.

     /* General body and container styling */
     body {
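As a side note on the removed aggregation above: below is a minimal sketch of the "prefer subject == 'ALL', otherwise average the individual subjects" preference it applied, using the column names that appear in the code (model_id, benchmark, subject, accuracy) and purely hypothetical records. It also makes visible the simplification the removed comments mention: once any model has an "ALL" row, models that only have per-subject rows drop out of the table.

import pandas as pd

df = pd.DataFrame([  # hypothetical eval.jsonl-style records, for illustration only
    {"model_id": "model-a", "benchmark": "MMLU", "subject": "ALL", "accuracy": 61.2},
    {"model_id": "model-b", "benchmark": "MMLU", "subject": "physics", "accuracy": 48.0},
])

df_mmlu = df[df["benchmark"] == "MMLU"]
# Prefer overall ("ALL") accuracies; fall back to per-subject averages only when none exist.
df_mmlu_grouped = df_mmlu[df_mmlu["subject"] == "ALL"].groupby("model_id")["accuracy"].mean().reset_index()
if df_mmlu_grouped.empty:
    df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
print(df_mmlu_grouped)  # only model-a is listed; model-b has no "ALL" row and is skipped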
@@ -728,31 +696,30 @@ with gr.Blocks(css="""
         </div>
     """)

-    #
-    gr.
-
-
-
-
-
-
-        label="MMLU Leaderboard Data",
-        elem_classes="leaderboard-table" # Apply custom class for styling
     )

-
-
         headers=["Model ID", "Average Accuracy (%)"],
         interactive=False,
         datatype=["str", "number"],
         row_count=10,
         col_count=2,
-        label="
         elem_classes="leaderboard-table" # Apply custom class for styling
     )

-    #
-    demo.load(load_leaderboard, inputs=[], outputs=[

 # Launch the Gradio app
-demo.launch()
@@ -321,10 +321,10 @@ def save_text(text_content):
         gr.Error(f"Error saving file: {e}")
         return None

+def load_leaderboard(benchmark_filter):
     """
+    Loads evaluation data from 'eval.jsonl', computes average accuracy per model for the selected benchmark,
+    and prepares data for the leaderboard table.
     """
     try:
         df = pd.read_json("eval.jsonl", lines=True)
@@ -335,68 +335,36 @@ def load_leaderboard():

         if df.empty:
             gr.Warning("No valid evaluation data found to populate the leaderboard.")
+            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')

+        # Filter data based on the selected benchmark
+        df_filtered = df[df['benchmark'] == benchmark_filter]

+        if df_filtered.empty:
+            gr.Warning(f"No evaluation data for {benchmark_filter} found yet.")
+            return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')

+        # For the leaderboard, we typically want the average across all subjects within that benchmark.
+        # So we group by model_id and take the mean of accuracy.
+        df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
+        df_grouped.columns = ["Model ID", "Average Accuracy (%)"]
+        df_sorted = df_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

+        return df_sorted.to_dict('records')

     except FileNotFoundError:
         gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
+        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
     except Exception as e:
         gr.Error(f"Error loading leaderboard: {e}")
         traceback.print_exc() # Print full traceback for debugging
+        return pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')


 # --- Gradio Interface Definition ---
 with gr.Blocks(css="""
     /* Import Google Font - Inter */
+    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');

     /* General body and container styling */
     body {
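To make the new data flow concrete, here is a small sketch of the per-benchmark aggregation added above, assuming eval.jsonl stores one JSON object per line with model_id, benchmark, subject and accuracy fields (the field names are taken from the code; the records and numbers below are made up):

import pandas as pd

records = [  # hypothetical eval.jsonl contents, for illustration only
    {"model_id": "model-a", "benchmark": "MMLU", "subject": "ALL", "accuracy": 61.2},
    {"model_id": "model-b", "benchmark": "MMLU", "subject": "physics", "accuracy": 48.0},
    {"model_id": "model-b", "benchmark": "MMLU", "subject": "history", "accuracy": 52.0},
    {"model_id": "model-a", "benchmark": "MMLU-Pro", "subject": "ALL", "accuracy": 44.8},
]
df = pd.DataFrame(records)

benchmark_filter = "MMLU"
df_filtered = df[df["benchmark"] == benchmark_filter]
# Every row for the selected benchmark counts equally, whether it is an "ALL" row or a single subject.
df_grouped = df_filtered.groupby("model_id")["accuracy"].mean().reset_index()
df_grouped.columns = ["Model ID", "Average Accuracy (%)"]
print(df_grouped.sort_values(by="Average Accuracy (%)", ascending=False))
# model-a -> 61.2, model-b -> (48.0 + 52.0) / 2 = 50.0

Unlike the removed version, there is no special casing of "ALL" rows anymore, so mixed "ALL" and per-subject records for the same model are simply averaged together.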
@@ -728,31 +696,30 @@ with gr.Blocks(css="""
         </div>
     """)

+    # Leaderboard Type Toggle
+    leaderboard_type_toggle = gr.Radio(
+        ["MMLU", "MMLU-Pro"],
+        label="Select Benchmark for Leaderboard",
+        value="MMLU", # Default to MMLU
+        interactive=True,
+        container=False, # Make it inline with content
+        elem_id="leaderboard-toggle"
     )

+    # Leaderboard Table
+    leaderboard_table_output = gr.Dataframe(
         headers=["Model ID", "Average Accuracy (%)"],
         interactive=False,
         datatype=["str", "number"],
         row_count=10,
         col_count=2,
+        label="Benchmark Leaderboard Data",
         elem_classes="leaderboard-table" # Apply custom class for styling
     )

+    # Initial load and dynamic update for the leaderboard
+    demo.load(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])
+    leaderboard_type_toggle.change(load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output])

 # Launch the Gradio app
+demo.launch()
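For readers who want to try the interaction pattern in isolation, here is a minimal, self-contained sketch of the Radio-driven refresh wired up in this commit: demo.load fills the table on startup and .change on the toggle reloads it. The loader is stubbed with an in-memory table instead of reading eval.jsonl, and the model names and accuracy values are placeholders.

import gradio as gr
import pandas as pd

def load_leaderboard(benchmark_filter):
    # Stand-in for reading eval.jsonl; returns a DataFrame for the selected benchmark.
    data = {
        "MMLU": [("model-a", 61.2), ("model-b", 55.4)],
        "MMLU-Pro": [("model-a", 44.8), ("model-b", 39.1)],
    }
    rows = data.get(benchmark_filter, [])
    return pd.DataFrame(rows, columns=["Model ID", "Average Accuracy (%)"])

with gr.Blocks() as demo:
    toggle = gr.Radio(["MMLU", "MMLU-Pro"], value="MMLU", label="Select Benchmark for Leaderboard")
    table = gr.Dataframe(headers=["Model ID", "Average Accuracy (%)"], interactive=False)
    # Populate on page load and refresh whenever the toggle changes.
    demo.load(load_leaderboard, inputs=[toggle], outputs=[table])
    toggle.change(load_leaderboard, inputs=[toggle], outputs=[table])

demo.launch()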