Enderchef committed (verified)
Commit 05331fd Β· 1 Parent(s): bddb36d

Update app.py

Files changed (1)
  1. app.py +242 -105
app.py CHANGED
@@ -18,9 +18,6 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
  # --- Constants for Benchmarks ---
  MMLU_DATASET = "cais/mmlu"
  MMLU_PRO_DATASET = "cais/mmlu_pro"
- # Humanity's Last Exam is a composite benchmark, not a single dataset readily available like MMLU/MMLU-Pro.
- # For this implementation, we will focus on MMLU and MMLU-Pro, which are direct datasets.
- # Integrating HLE would require evaluating across multiple specific datasets.

  def get_all_benchmark_options():
  """
@@ -68,12 +65,12 @@ def load_model(model_id):
  return model_cache[model_id]
  try:
  # Load tokenizer and model, using bfloat16 if CUDA is available for efficiency
- tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
+ tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
  model = AutoModelForCausalLM.from_pretrained(
  model_id,
  token=HF_TOKEN,
- trust_remote_code=True,
- torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
+ torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
+ trust_remote_code=True
  ).to("cuda" if torch.cuda.is_available() else "cpu")

  # Create a text-generation pipeline
@@ -107,7 +104,7 @@ def extract_choice_letter(output):
  It prioritizes an exact match after "Answer:", then looks for any single capital letter.
  """
  # Look for "Answer: X" pattern first (e.g., "Answer: A" or "Answer: B")
- match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE) # Added IGNORECASE for robustness
+ match = re.search(r"Answer:\s*([ABCD])", output, re.IGNORECASE)
  if match:
  return match.group(1).upper() # Ensure it's uppercase

@@ -270,8 +267,6 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
  score_string = f"Accuracy for {benchmark_name} - {subject_name}: {accuracy:.2f}% out of {num_evaluated_samples} samples."

  # Format detailed results for display in the text box
- # The key change here is to wrap the entire multi-line string construction for each item
- # within parentheses to ensure it's treated as a single element in the list comprehension.
  formatted_details = "\n\n".join([
  (
  f"### Question:\n{item['question']}\n\n"
@@ -300,7 +295,7 @@ def run_evaluation(model_id, selected_benchmark_subject, sample_count, progress=
  gr.Info("Evaluation completed successfully!")
  return score_string, \
  gr.update(value="", visible=False), gr.update(visible=False), \
- gr.update(visible=true), gr.update(visible=true), gr.update(value=formatted_details, visible=False)
+ gr.update(visible=True), gr.update(visible=True), gr.update(value=formatted_details, visible=False)

  except Exception as e:
  error_message = str(e)
@@ -328,158 +323,284 @@ def save_text(text_content):

  def load_leaderboard():
  """
- Loads evaluation data from 'eval.jsonl', computes average accuracy per model,
- and prepares data for the leaderboard plot and table.
+ Loads evaluation data from 'eval.jsonl', computes average accuracy per model for MMLU and MMLU-Pro,
+ and prepares data for two separate leaderboard tables.
  """
  try:
- # Read the JSONL file into a pandas DataFrame
  df = pd.read_json("eval.jsonl", lines=True)

- # Calculate average accuracy per model across all recorded evaluations
- df_avg = df.groupby("model_id")["accuracy"].mean().reset_index()
- df_avg.columns = ["Model ID", "Average Accuracy (%)"]
+ # Ensure 'accuracy' is numeric, coerce errors to NaN and drop them
+ df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
+ df = df.dropna(subset=['accuracy'])
+
+ if df.empty:
+ gr.Warning("No valid evaluation data found to populate the leaderboard.")
+ # Return empty dataframes for both MMLU and MMLU-Pro
+ return (
+ pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
+ pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+ )
+
+ # Filter for MMLU data
+ df_mmlu = df[df['benchmark'] == 'MMLU']
+ if 'subject' in df_mmlu.columns:
+ # For MMLU, if "ALL" subjects are evaluated, consider the overall accuracy.
+ # Otherwise, average specific subject accuracies.
+ df_mmlu_grouped = df_mmlu[df_mmlu['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
+ # If a model only has specific subject evaluations, average those.
+ # This is a simplification; a more robust approach might be to calculate weighted average.
+ # For now, if "ALL" exists, we use that; otherwise, we average available subjects.
+
+ # If no 'ALL' subject records, average across available subjects for MMLU
+ if df_mmlu_grouped.empty:
+ df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
+
+ else: # Handle older eval.jsonl without 'subject' column or if only MMLU was run
+ df_mmlu_grouped = df_mmlu.groupby("model_id")["accuracy"].mean().reset_index()
+
+
+ df_mmlu_grouped.columns = ["Model ID", "Average Accuracy (%)"]
+ df_mmlu_sorted = df_mmlu_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

- # Sort models by average accuracy in descending order
- df_sorted = df_avg.sort_values(by="Average Accuracy (%)", ascending=False)
+ # Filter for MMLU-Pro data
+ df_mmlu_pro = df[df['benchmark'] == 'MMLU-Pro']
+ if 'subject' in df_mmlu_pro.columns:
+ df_mmlu_pro_grouped = df_mmlu_pro[df_mmlu_pro['subject'] == 'ALL'].groupby("model_id")["accuracy"].mean().reset_index()
+ if df_mmlu_pro_grouped.empty:
+ df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
+ else: # Handle older eval.jsonl
+ df_mmlu_pro_grouped = df_mmlu_pro.groupby("model_id")["accuracy"].mean().reset_index()
+
+
+ df_mmlu_pro_grouped.columns = ["Model ID", "Average Accuracy (%)"]
+ df_mmlu_pro_sorted = df_mmlu_pro_grouped.sort_values(by="Average Accuracy (%)", ascending=False)

- # Select top 10 models for the bar chart
- top_models = df_sorted.head(10)
-
- # Create the matplotlib plot
- fig, ax = plt.subplots(figsize=(10, 6)) # Adjust figure size for better readability
- # For horizontal bars, it's often better to plot data sorted in ascending order
- # so the highest bar appears at the top of the chart.
- top_models_plot = top_models.sort_values(by="Average Accuracy (%)", ascending=True)
-
- ax.barh(top_models_plot['Model ID'], top_models_plot['Average Accuracy (%)'], color='#007bff') # Use a nice blue color
- ax.set_xlabel("Average Accuracy (%)", fontsize=12)
- ax.set_ylabel("Model ID", fontsize=12)
- ax.set_title("Top 10 Models by Average MMLU/MMLU-Pro Accuracy", fontsize=14)
- ax.set_xlim(0, 100) # Ensure accuracy scale is 0-100%
- ax.tick_params(axis='x', labelsize=10)
- ax.tick_params(axis='y', labelsize=10)
- ax.grid(axis='x', linestyle='--', alpha=0.7) # Add grid lines
- plt.tight_layout() # Adjust layout to prevent labels overlapping
-
- # Return the figure and the sorted dataframe as a list of dictionaries for Gradio Dataframe
- return fig, df_sorted.to_dict('records')
+ # Return two dataframes as lists of dictionaries
+ return df_mmlu_sorted.to_dict('records'), df_mmlu_pro_sorted.to_dict('records')
+
  except FileNotFoundError:
  gr.Warning("No evaluation data found yet. Run an evaluation to populate the leaderboard!")
- return plt.figure(), pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+ return (
+ pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
+ pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+ )
  except Exception as e:
  gr.Error(f"Error loading leaderboard: {e}")
- # Return an empty plot and dataframe in case of any other error
- return plt.figure(), pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+ traceback.print_exc() # Print full traceback for debugging
+ return (
+ pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records'),
+ pd.DataFrame(columns=["Model ID", "Average Accuracy (%)"]).to_dict('records')
+ )


  # --- Gradio Interface Definition ---
  with gr.Blocks(css="""
+ /* Import Google Font - Inter */
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
+
  /* General body and container styling */
- body { font-family: 'Inter', sans-serif; background-color: #f0f2f5; margin: 0; padding: 20px; }
+ body {
+ font-family: 'Inter', sans-serif;
+ background-color: #eef2f6; /* Lighter background */
+ margin: 0;
+ padding: 20px;
+ }
  .gradio-container {
  max-width: 1200px;
  margin: 20px auto;
- padding: 30px;
- box-shadow: 0 8px 16px rgba(0,0,0,0.15);
- border-radius: 12px;
+ padding: 40px; /* Increased padding */
+ box-shadow: 0 10px 25px rgba(0,0,0,0.1); /* Softer, larger shadow */
+ border-radius: 15px; /* More rounded corners */
  background-color: #ffffff;
- border: 1px solid #e0e0e0;
+ border: 1px solid #e0e6ed; /* Subtle border */
  }

  /* Headings */
  h1 {
- color: #2c3e50;
+ color: #1a202c; /* Darker, more professional heading color */
  text-align: center;
  margin-bottom: 30px;
- font-size: 2.5em;
+ font-size: 2.8em; /* Slightly larger H1 */
  font-weight: 700;
- letter-spacing: -0.02em;
+ letter-spacing: -0.03em;
+ text-shadow: 1px 1px 2px rgba(0,0,0,0.05); /* Subtle text shadow */
+ }
+ h3 {
+ color: #2d3748;
+ font-size: 1.3em; /* Slightly larger H3 */
+ margin-bottom: 15px;
+ font-weight: 600;
  }
- h3 { color: #34495e; font-size: 1.2em; margin-bottom: 10px; }

  /* Markdown text */
- .markdown-text { text-align: center; color: #555; line-height: 1.6; }
- .markdown-text div { font-size: 1.1em; }
+ .markdown-text {
+ text-align: center;
+ color: #4a5568;
+ line-height: 1.7;
+ font-size: 1.05em;
+ margin-bottom: 30px;
+ }
+ .markdown-text div {
+ font-size: 1.1em;
+ max-width: 800px; /* Constrain width for readability */
+ margin: 0 auto;
+ }

  /* Buttons */
  .gr-button {
- background-color: #007bff; /* Primary blue */
+ background-color: #2f80ed; /* A vibrant, professional blue */
  color: white;
  border: none;
- padding: 12px 25px;
- border-radius: 8px;
+ padding: 14px 30px; /* More padding */
+ border-radius: 10px; /* More rounded */
  cursor: pointer;
- transition: background-color 0.3s ease, transform 0.2s ease;
- font-size: 1.1em;
+ transition: background-color 0.3s ease, transform 0.2s ease, box-shadow 0.2s ease;
+ font-size: 1.15em; /* Slightly larger font */
  font-weight: 600;
- box-shadow: 0 4px 8px rgba(0,0,0,0.1);
+ box-shadow: 0 5px 15px rgba(0, 123, 255, 0.2); /* Enhanced shadow for primary button */
+ margin: 5px; /* Add some margin for spacing between buttons */
  }
  .gr-button:hover {
- background-color: #0056b3; /* Darker blue on hover */
- transform: translateY(-2px); /* Slight lift effect */
+ background-color: #1a6dcd; /* Darker blue on hover */
+ transform: translateY(-3px); /* More pronounced lift effect */
+ box-shadow: 0 8px 20px rgba(0, 123, 255, 0.3);
  }
  .gr-button:active {
  transform: translateY(0);
- box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+ box-shadow: 0 2px 5px rgba(0,0,0,0.1);
  }
  /* Specific button styling for debug/show details */
  #debug-button, #show-details-button {
- background-color: #6c757d; /* Grey for secondary actions */
+ background-color: #718096; /* Professional grey */
+ box-shadow: 0 3px 10px rgba(113, 128, 150, 0.2);
  }
  #debug-button:hover, #show-details-button:hover {
- background-color: #5a6268;
+ background-color: #5d6d81;
+ box-shadow: 0 5px 12px rgba(113, 128, 150, 0.3);
  }
  #download-button {
- background-color: #28a745; /* Green for download */
+ background-color: #38a169; /* Muted green for download */
+ box-shadow: 0 3px 10px rgba(56, 161, 105, 0.2);
  }
  #download-button:hover {
- background-color: #218838;
+ background-color: #277e50;
+ box-shadow: 0 5px 12px rgba(56, 161, 105, 0.3);
  }

-
- /* Input/Output Boxes */
+ /* Input/Output Boxes (Containers) */
  .gr-box {
- border: 1px solid #dee2e6;
- border-radius: 10px;
- padding: 20px;
- margin-bottom: 20px;
- background-color: #fdfdfd;
- box-shadow: inset 0 1px 3px rgba(0,0,0,0.05);
+ border: 1px solid #cbd5e0; /* Lighter, subtle border */
+ border-radius: 12px;
+ padding: 25px; /* Increased padding */
+ margin-bottom: 25px;
+ background-color: #f8fafc; /* Very light background */
+ box-shadow: inset 0 2px 5px rgba(0,0,0,0.03); /* Subtle inner shadow */
  }
+ /* Specific text output boxes (the content inside the containers) */
  .gr-output-text {
  white-space: pre-wrap;
  word-wrap: break-word;
- background-color: #f9f9fb;
- border: 1px solid #e9ecef;
+ background-color: #ffffff; /* White background for readability */
+ border: 1px solid #e2e8f0;
  border-radius: 8px;
- padding: 15px;
- min-height: 100px; /* Ensure a minimum height */
+ padding: 18px; /* More padding */
+ min-height: 120px; /* Ensure a minimum height */
+ box-shadow: 0 2px 8px rgba(0,0,0,0.05); /* Small shadow for depth */
+ color: #2d3748; /* Darker text for readability */
+ font-size: 0.95em;
+ line-height: 1.6;
  }
  /* Specific error output style */
  #error-message-output {
- background-color: #ffe0e0;
- border-color: #ff9999;
- color: #cc0000;
+ background-color: #ffe0e6; /* Light red */
+ border-color: #ff99aa; /* Slightly darker red border */
+ color: #c53030; /* Stronger red text */
+ font-weight: 500;
+ padding: 20px;
  }


  /* Labels for inputs */
  .gr-textbox label, .gr-dropdown label, .gr-slider label {
  font-weight: 600;
- color: #495057;
- margin-bottom: 8px;
+ color: #2d3748; /* Darker label text */
+ margin-bottom: 10px;
  display: block;
- font-size: 1em;
+ font-size: 1.05em; /* Slightly larger label font */
  }

- /* Tab styling */
- .gr-tab-item { padding: 25px; } /* More padding inside tabs */
+ /* Tabs styling */
  .gr-tabs-nav button {
  font-weight: 600;
  font-size: 1.1em;
- padding: 10px 20px;
- border-top-left-radius: 8px;
- border-top-right-radius: 8px;
+ padding: 12px 25px; /* More padding for tabs */
+ border-top-left-radius: 10px;
+ border-top-right-radius: 10px;
+ background-color: #ebf4f8; /* Light blueish tab background */
+ color: #4a5568;
+ border: 1px solid #cce0eb; /* Subtle border for tabs */
+ border-bottom: none;
+ transition: background-color 0.3s ease, color 0.3s ease;
+ }
+ .gr-tabs-nav button.selected {
+ background-color: #ffffff; /* White for selected tab */
+ color: #2f80ed; /* Blue for selected text */
+ border-color: #2f80ed;
+ border-bottom: 1px solid #ffffff; /* Hide bottom border to merge with content */
+ }
+
+ /* Leaderboard specific table styling (general for all leaderboard tables) */
+ .leaderboard-table {
+ border-radius: 12px;
+ box-shadow: 0 4px 15px rgba(0,0,0,0.08);
+ overflow: hidden;
+ margin-bottom: 25px; /* Space between tables */
+ }
+ .leaderboard-table table {
+ border-collapse: separate;
+ border-spacing: 0;
+ width: 100%;
+ background-color: #ffffff;
+ }
+ .leaderboard-table thead th {
+ background-color: #edf2f7; /* Light grey header */
+ color: #2d3748;
+ font-weight: 700;
+ padding: 15px 20px;
+ text-align: left;
+ border-bottom: 2px solid #e2e8f0;
+ }
+ .leaderboard-table tbody tr {
+ transition: background-color 0.2s ease;
+ }
+ .leaderboard-table tbody tr:nth-child(odd) {
+ background-color: #f7fafc; /* Zebra striping */
+ }
+ .leaderboard-table tbody tr:hover {
+ background-color: #e6fffa; /* Light teal on hover for rows */
+ }
+ .leaderboard-table tbody td {
+ padding: 12px 20px;
+ border-bottom: 1px solid #ebf4f8;
+ color: #4a5568;
+ }
+ .leaderboard-table tbody tr:last-child td {
+ border-bottom: none;
+ }
+ .leaderboard-table tbody tr:first-child td {
+ border-top-left-radius: 12px;
+ border-top-right-radius: 12px;
+ }
+ .leaderboard-table tbody tr:last-child td {
+ border-bottom-left-radius: 12px;
+ border-bottom-right-radius: 12px;
+ }
+
+ /* Horizontal line for separation */
+ hr {
+ border: none;
+ border-top: 1px solid #e2e8f0;
+ margin: 30px 0;
  }
  """) as demo:
  gr.Markdown("""
@@ -489,10 +610,11 @@ with gr.Blocks(css="""
  with gr.Tabs():
  with gr.TabItem("πŸš€ Run Evaluation"):
  gr.Markdown("""
- <div style="text-align: center; margin-bottom: 20px; color: #666; font-size: 1.1em;">
+ <div class="markdown-text">
  Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
  select a subject (or 'ALL' for a comprehensive evaluation),
  and specify the number of samples per subject.
+ Ensure your Hugging Face token is set as an environment variable for private models.
  </div>
  """)

@@ -521,6 +643,8 @@ with gr.Blocks(css="""
  )
  run_button = gr.Button("πŸš€ Run Evaluation", elem_classes="gr-button")

+ gr.Markdown("<hr>") # Visual separator
+
  with gr.Column(elem_classes="gr-box"):
  acc_output = gr.Textbox(
  label="Benchmark Accuracy Results",
@@ -598,24 +722,37 @@ with gr.Blocks(css="""

  with gr.TabItem("πŸ“Š Leaderboard"):
  gr.Markdown("""
- <div style="text-align: center; margin-bottom: 20px; color: #666; font-size: 1.1em;">
- See how different models perform on average across all evaluated benchmarks.
- This leaderboard updates with every new evaluation.
+ <div class="markdown-text">
+ Explore the performance of various LLMs on the MMLU and MMLU-Pro benchmarks.
+ This leaderboard is updated automatically with each new evaluation.
  </div>
  """)
- with gr.Row():
- leaderboard_plot_output = gr.Plot(label="Top 10 Models by Average Accuracy", scale=2) # Scale for better visibility
- leaderboard_table_output = gr.Dataframe(
- headers=["Model ID", "Average Accuracy (%)"],
- interactive=False,
- datatype=["str", "number"],
- row_count=10, # Display top 10 rows initially, but can scroll
- col_count=2,
- label="Full Leaderboard Data"
- )
+
+ # MMLU Leaderboard Table
+ gr.Markdown("### MMLU Top Models")
+ mmlu_leaderboard_table = gr.Dataframe(
+ headers=["Model ID", "Average Accuracy (%)"],
+ interactive=False,
+ datatype=["str", "number"],
+ row_count=10,
+ col_count=2,
+ label="MMLU Leaderboard Data",
+ elem_classes="leaderboard-table" # Apply custom class for styling
+ )
+
+ gr.Markdown("### MMLU-Pro Top Models")
+ mmlu_pro_leaderboard_table = gr.Dataframe(
+ headers=["Model ID", "Average Accuracy (%)"],
+ interactive=False,
+ datatype=["str", "number"],
+ row_count=10,
+ col_count=2,
+ label="MMLU-Pro Leaderboard Data",
+ elem_classes="leaderboard-table" # Apply custom class for styling
+ )

  # Load leaderboard when the tab is selected or when the app loads
- demo.load(load_leaderboard, inputs=[], outputs=[leaderboard_plot_output, leaderboard_table_output])
+ demo.load(load_leaderboard, inputs=[], outputs=[mmlu_leaderboard_table, mmlu_pro_leaderboard_table])

  # Launch the Gradio app
- demo.launch()
+ demo.launch()