Enderchef committed (verified)
Commit 1c17342 · Parent(s): 6cc6a40

Update app.py

Files changed (1)
  1. app.py +5 -12
app.py CHANGED
@@ -20,11 +20,7 @@ MMLU_DATASET = "cais/mmlu"
 MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
 
 def get_all_benchmark_options():
-    """
-    Dynamically fetches all available subjects for MMLU and MMLU-Pro.
-    Returns a dictionary mapping benchmark dataset IDs to their subjects,
-    and a flattened list suitable for a Gradio dropdown.
-    """
+
     all_options = {}
     gr_dropdown_options = []  # This is for initial display only, not used for dynamic updates directly
 
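The docstring removed above still describes this function's contract: fetch the available subjects for each benchmark and return both a per-dataset mapping and a flattened list for a Gradio dropdown. A minimal sketch of that behaviour, assuming the subjects come from datasets.get_dataset_config_names and that the "ALL" entry and the "dataset | subject" string format are illustrative choices, not app.py's actual code:

from datasets import get_dataset_config_names

MMLU_DATASET = "cais/mmlu"
MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"

def get_all_benchmark_options():
    # Map each benchmark dataset ID to its subject list (dataset config names).
    all_options = {}
    gr_dropdown_options = []
    for dataset_id in (MMLU_DATASET, MMLU_PRO_DATASET):
        subjects = ["ALL"] + get_dataset_config_names(dataset_id)
        all_options[dataset_id] = subjects
        # Flattened "dataset | subject" strings for a single Gradio dropdown.
        gr_dropdown_options.extend(f"{dataset_id} | {s}" for s in subjects)
    return all_options, gr_dropdown_options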
@@ -89,10 +85,7 @@ def load_model(model_id):
 
 
 def format_prompt(item):
-    """
-    Formats a single MMLU/MMLU-Pro question item into a clear prompt for the LLM.
-    The prompt is designed for the model to output a single letter answer (A, B, C, D).
-    """
+
     prompt = f"""{item['question']}
 A. {item['choices'][0]}
 B. {item['choices'][1]}
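The hunk cuts the prompt off after the first two choices. A sketch of a complete format_prompt in the spirit of the removed docstring; the fixed four choices and the closing instruction wording are assumptions (MMLU-Pro items can carry more options):

def format_prompt(item):
    # Multiple-choice prompt; the model is expected to answer with a single letter.
    prompt = f"""{item['question']}
A. {item['choices'][0]}
B. {item['choices'][1]}
C. {item['choices'][2]}
D. {item['choices'][3]}
Answer with a single letter (A, B, C, or D):"""
    return prompt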
@@ -647,7 +640,7 @@ with gr.Blocks(css="""
     with gr.TabItem("🚀 Run Evaluation"):
         gr.Markdown("""
         <div class="markdown-text">
-        Enter your Hugging Face Model ID, choose a benchmark (MMLU or MMLU-Pro),
+        Enter your Hugging Face Model ID, choose a benchmark (MMLU only for now),
         select a subject (or 'ALL' for a comprehensive evaluation),
         and specify the number of samples per subject.
         Ensure your Hugging Face token is set as an environment variable for private models.
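The Markdown above lists the inputs this tab expects. A hedged sketch of how they could be declared in Gradio; the component names, the slider range, and the HF_TOKEN variable name are illustrative, not taken from app.py:

import os
import gradio as gr

# Token for gated/private models; the exact env var name app.py reads is not shown in this diff.
hf_token = os.environ.get("HF_TOKEN")

with gr.Blocks() as demo:
    with gr.TabItem("🚀 Run Evaluation"):
        model_id_input = gr.Textbox(label="Hugging Face Model ID", placeholder="org/model-name")
        subject_input = gr.Dropdown(["ALL"], value="ALL", label="Subject")
        num_samples_input = gr.Slider(1, 100, value=10, step=1, label="Samples per subject")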
@@ -750,14 +743,14 @@ with gr.Blocks(css="""
     with gr.TabItem("📊 Leaderboard"):
         gr.Markdown("""
         <div class="markdown-text">
-        Explore the performance of various LLMs on the MMLU and MMLU-Pro benchmarks.
+        Explore the performance of various LLMs on a chunk of MMLU called MMLU Small.
         This leaderboard is updated automatically with each new evaluation.
         </div>
         """)
 
         # Leaderboard Type Toggle
         leaderboard_type_toggle = gr.Radio(
-            ["MMLU", "MMLU-Pro"],
+            ["MMLU Small"],
             label="Select Benchmark for Leaderboard",
             value="MMLU", # Default to MMLU
             interactive=True,
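A sketch of how the radio toggle above might be wired to refresh the leaderboard table; the results file, its columns, and the filtering function are assumptions rather than app.py's actual storage:

import gradio as gr
import pandas as pd

def filter_leaderboard(benchmark_name):
    # Hypothetical storage: one CSV of past evaluations with a "benchmark" column.
    df = pd.read_csv("leaderboard.csv")
    return df[df["benchmark"] == benchmark_name]

with gr.Blocks() as demo:
    leaderboard_type_toggle = gr.Radio(
        ["MMLU Small"],
        label="Select Benchmark for Leaderboard",
        value="MMLU Small",  # default drawn from the choices list
        interactive=True,
    )
    leaderboard_table = gr.Dataframe(label="Leaderboard")
    # Re-filter the table whenever the selected benchmark changes.
    leaderboard_type_toggle.change(
        filter_leaderboard,
        inputs=leaderboard_type_toggle,
        outputs=leaderboard_table,
    )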
 