Enderchef committed on
Commit aae1544 · verified · 1 Parent(s): 678a72a

Update app.py

Files changed (1)
  1. app.py +175 -308
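The added lines below rework run_evaluation from a function that returns a single dictionary of component updates into a generator that yields a dictionary of gr.update(...) values after each step, driving a custom HTML progress bar (make_progress_html, progress_box) in place of gr.Progress. As a rough illustration of that Gradio pattern, here is a minimal sketch; the component names and the long_task handler are invented for the example and do not appear in app.py.

import time
import gradio as gr

with gr.Blocks() as demo:
    start = gr.Button("Start")
    progress_md = gr.Markdown(visible=False)  # stands in for the app's progress panel
    result_md = gr.Markdown(visible=False)    # stands in for the app's result panel

    def long_task():
        # Each yield is pushed to the browser immediately; the dict keys must be
        # components listed in `outputs` below.
        yield {progress_md: gr.update(value="Working... 0%", visible=True),
               result_md: gr.update(visible=False)}
        for pct in (25, 50, 75, 100):
            time.sleep(0.5)  # stand-in for evaluating one batch of samples
            yield {progress_md: gr.update(value=f"Working... {pct}%")}
        yield {progress_md: gr.update(visible=False),
               result_md: gr.update(value="Done.", visible=True)}

    start.click(fn=long_task, outputs=[progress_md, result_md])

if __name__ == "__main__":
    demo.launch()

The new run_evaluation follows the same shape, with the evaluation loop in place of time.sleep and a gr.HTML component rendered by make_progress_html as the progress bar.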
app.py CHANGED
@@ -14,7 +14,7 @@ from datetime import datetime
14
  # It's good practice to ensure the cache directory exists.
15
  CACHE_DIR = "evaluation_cache"
16
  os.makedirs(CACHE_DIR, exist_ok=True)
17
- EVAL_FILE = os.path.join(CACHE_DIR, "eval.jsonl")
18
 
19
  # Cache to avoid reloading models and dataset configs
20
  model_cache = {}
@@ -25,14 +25,12 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
25
 
26
  # --- Constants for Benchmarks ---
27
  MMLU_DATASET = "cais/mmlu"
28
- # Temporarily remove MMLU-Pro references
29
- # MMLU_PRO_DATASET = "TIGER-Lab/MMLU-Pro"
30
  BENCHMARK_MAP = {
31
  "MMLU": MMLU_DATASET,
32
- # "MMLU-Pro": MMLU_PRO_DATASET # Temporarily removed
33
  }
34
 
35
  # --- Data Loading and Preparation ---
 
36
  def get_all_benchmark_options():
37
  """
38
  Fetches and caches the available subjects (configs) for each benchmark dataset.
@@ -41,16 +39,13 @@ def get_all_benchmark_options():
41
  if benchmark_subject_cache:
42
  return benchmark_subject_cache
43
  print("Fetching benchmark configurations for the first time...")
44
-
45
- # Only iterate over the allowed benchmarks (MMLU)
46
  for key, dataset_id in BENCHMARK_MAP.items():
47
  try:
48
- # Fetching dataset configurations requires authentication if the dataset is private
49
  subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
50
- benchmark_subject_cache[key] = ["ALL"] + sorted([s for s in subjects if s != 'all']) # Sort subjects
51
  except Exception as e:
52
  print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
53
- benchmark_subject_cache[key] = ["ALL"] # Provide a default
54
  print("Benchmark configurations cached.")
55
  return benchmark_subject_cache
56
 
@@ -65,39 +60,34 @@ def load_model(model_id):
65
  """
66
  if not model_id:
67
  raise ValueError("Model ID cannot be empty.")
68
- gr.Info(f"Attempting to load model: {model_id}...")
69
  if model_id in model_cache:
70
  gr.Info(f"Model '{model_id}' found in cache.")
71
  return model_cache[model_id]
72
  try:
73
- # Use bfloat16 for better performance on modern GPUs
74
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
75
-
76
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
77
  model = AutoModelForCausalLM.from_pretrained(
78
  model_id,
79
  token=HF_TOKEN,
80
  torch_dtype=dtype,
81
  trust_remote_code=True,
82
- low_cpu_mem_usage=True, # Optimization for large models
83
  ).to("cuda" if torch.cuda.is_available() else "cpu")
84
-
85
- # Create the pipeline for text generation
86
  generator = pipeline(
87
- "text-generation",
88
- model=model,
89
- tokenizer=tokenizer,
90
  device=0 if torch.cuda.is_available() else -1
91
  )
92
-
93
  model_cache[model_id] = generator
94
  gr.Info(f"Model '{model_id}' loaded successfully.")
95
  return generator
96
  except Exception as e:
97
- # Raise a more specific error to be caught by the main evaluation function
98
- raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token (if required). Error: {e}")
99
 
100
  # --- Evaluation Logic ---
 
101
  def format_prompt(item):
102
  """Formats the MMLU question and choices into a standardized prompt."""
103
  prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
@@ -108,125 +98,123 @@ def get_choice_letter(index):
108
  return chr(ord('A') + index) if 0 <= index <= 3 else None
109
 
110
  def extract_predicted_letter(output_text):
111
- """
112
- Extracts the predicted letter from the model's output.
113
- It looks for a letter (A, B, C, D) immediately following 'Answer:'.
114
- """
115
- # Look for "Answer: X" and capture X
116
  match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
117
  if match:
118
  return match.group(1).upper()
119
-
120
- # Fallback: if the model just outputs a letter
121
  match = re.search(r"^\s*([ABCD])\b", output_text.strip())
122
  if match:
123
  return match.group(1).upper()
124
  return None
125
 
126
- def evaluate_single_subject(generator, dataset_id, subject, sample_count, progress):
127
  """
128
- Evaluates a model on a specific subject from a dataset.
129
- """
130
- gr.Info(f"Loading dataset: {dataset_id} ({subject})...")
131
- try:
132
- # Load the 'test' split as it's standard for MMLU evaluation
133
- dataset = load_dataset(dataset_id, subject, token=HF_TOKEN, split="test")
134
- except Exception as e:
135
- raise RuntimeError(f"Failed to load dataset '{dataset_id}' for subject '{subject}'. Error: {e}")
136
-
137
- # Shuffle and select a subset of samples for evaluation
138
- num_samples = min(sample_count, len(dataset))
139
- dataset = dataset.shuffle(seed=42).select(range(num_samples))
140
-
141
- correct_predictions = 0
142
- results_details = []
143
-
144
- for item in progress.tqdm(dataset, desc=f"Evaluating {subject}"):
145
- prompt, correct_answer_idx = format_prompt(item)
146
- expected_letter = get_choice_letter(correct_answer_idx)
147
-
148
- # The generated text is often just after the prompt. We need to slice it.
149
- full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
150
-
151
- # Generate a short response, aiming for a single letter answer.
152
- # do_sample=False (greedy decoding) is crucial for reproducibility.
153
- raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
154
-
155
- # Isolate the newly generated part
156
- generated_text_only = raw_output[len(full_prompt_text):].strip()
157
- predicted_letter = extract_predicted_letter(generated_text_only)
158
- is_correct = (predicted_letter == expected_letter)
159
-
160
- if is_correct:
161
- correct_predictions += 1
162
-
163
- results_details.append({
164
- "Question": item['question'],
165
- "Correct": "βœ…" if is_correct else "❌",
166
- "Expected": expected_letter,
167
- "Predicted": predicted_letter or "N/A",
168
- "Model Output": generated_text_only
169
- })
170
- accuracy = (correct_predictions / num_samples) * 100 if num_samples > 0 else 0
171
- return accuracy, results_details
172
 
173
  @spaces.GPU()
174
- def run_evaluation(model_id, benchmark_category, subject_name, sample_count, progress=gr.Progress(track_tqdm=True)):
175
  """
176
- Main function to orchestrate the entire evaluation process.
177
- Handles single subject or 'ALL' subjects evaluation.
178
- Returns a dictionary of Gradio updates.
179
  """
180
  try:
181
- gr.Info("Starting evaluation...")
182
  generator = load_model(model_id)
183
-
184
  dataset_id = BENCHMARK_MAP.get(benchmark_category)
185
  if not dataset_id:
186
  raise ValueError(f"Invalid benchmark category: {benchmark_category}")
187
 
188
- all_results_details = []
189
- summary_lines = []
190
- total_correct = 0
191
- total_samples = 0
192
-
193
  subjects_to_run = []
194
  if subject_name == "ALL":
195
- # Exclude the "ALL" placeholder from the list of subjects to run
196
  subjects_to_run = [s for s in ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []) if s != "ALL"]
197
  else:
198
  subjects_to_run = [subject_name]
199
 
200
  if not subjects_to_run:
201
  gr.Warning(f"No subjects found for '{benchmark_category}'.")
202
- # Return an empty but valid structure
203
- return {
204
- result_summary_output: gr.update(value="No subjects found to evaluate.", visible=True),
205
- error_box: gr.update(visible=False),
206
- details_box: gr.update(visible=False),
207
- }
208
209
  for i, subject in enumerate(subjects_to_run):
210
- gr.Info(f"Evaluating {benchmark_category} - {subject} ({i+1}/{len(subjects_to_run)})...")
211
  try:
212
- accuracy, subject_details = evaluate_single_subject(generator, dataset_id, subject, sample_count, progress)
213
-
214
- all_results_details.extend(subject_details)
215
- num_correct = sum(1 for d in subject_details if d['Correct'] == "✅")
216
- num_evaluated = len(subject_details)
217
- total_correct += num_correct
218
- total_samples += num_evaluated
219
- summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({num_correct}/{num_evaluated})")
220
221
  except Exception as e:
222
  error_trace = traceback.format_exc()
223
  gr.Error(f"Skipping {subject} due to an error: {e}")
224
  summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
225
  continue
226
-
 
227
  overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
228
-
229
- # --- Prepare Outputs ---
230
  if subject_name == "ALL":
231
  result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
232
  result_summary += f"across {total_samples:,} total samples from {len(subjects_to_run)} subjects.\n\n---\n\n**Breakdown by Subject:**\n"
@@ -234,44 +222,49 @@ def run_evaluation(model_id, benchmark_category, subject_name, sample_count, pro
234
  else:
235
  result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
236
  result_summary += f"({total_correct:,}/{total_samples:,} correct)"
237
-
238
- # Save results for leaderboard
239
  record = {
240
  "model_id": model_id,
241
  "benchmark": benchmark_category,
242
  "accuracy": overall_accuracy,
243
- "subject": subject_name, # Record if it was an 'ALL' run
244
  "sample_count": total_samples,
245
  "timestamp": datetime.now().isoformat()
246
  }
247
  with open(EVAL_FILE, "a") as f:
248
  f.write(json.dumps(record) + "\n")
249
-
250
  gr.Info("Evaluation completed successfully!")
251
-
252
  df_details = pd.DataFrame(all_results_details)
253
-
254
- # Return a dictionary of component updates
255
- return {
256
- result_summary_output: gr.update(value=result_summary, visible=True),
257
- error_box: gr.update(visible=False),
 
258
  details_box: gr.update(visible=True),
259
- detailed_results_df: gr.update(value=df_details)
 
260
  }
 
261
  except Exception as e:
262
- error_message = f"An unexpected error occurred during setup: {e}"
263
  error_details = traceback.format_exc()
264
  gr.Error(error_message)
265
-
266
- return {
267
- result_summary_output: gr.update(visible=False),
 
 
 
268
  error_box: gr.update(visible=True),
269
  error_output: gr.update(value=error_message),
270
  error_details_output: gr.update(value=error_details),
271
- details_box: gr.update(visible=False)
272
  }
273
 
274
  # --- UI Helper Functions ---
 
275
  def update_subject_dropdown(benchmark_category):
276
  """Updates the subject dropdown choices based on the selected benchmark."""
277
  choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
@@ -281,46 +274,41 @@ def update_subject_dropdown(benchmark_category):
281
  def load_leaderboard(benchmark_filter, progress=gr.Progress()):
282
  """
283
  Loads and processes evaluation data to display on the leaderboard.
284
- It now correctly averages scores for models that were evaluated on 'ALL' subjects.
285
  """
286
  progress(0, desc="Loading Leaderboard...")
287
  try:
288
  if not os.path.exists(EVAL_FILE):
289
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
290
-
291
  df = pd.read_json(EVAL_FILE, lines=True)
292
  if df.empty:
293
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
294
-
295
- # Coerce accuracy to numeric and filter valid entries
296
  df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
297
  df.dropna(subset=['accuracy'], inplace=True)
298
-
299
- # Filter by the selected benchmark (e.g., MMLU or MMLU-Pro)
300
  df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
301
-
302
  if df_filtered.empty:
303
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
304
-
305
- # Find the latest evaluation for each model
306
  df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
307
  latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
308
-
309
  leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
310
-
311
- # Add Rank
312
  leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
313
- # Rename and format columns
314
  leaderboard_df.rename(columns={
315
  'model_id': 'Model ID',
316
  'accuracy': 'Avg. Accuracy (%)',
317
  'sample_count': 'Total Samples',
318
  'timestamp': 'Date'
319
  }, inplace=True)
320
-
321
  leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
322
  leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
323
-
324
  progress(1, desc="Done.")
325
  return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
326
  except Exception as e:
@@ -329,174 +317,60 @@ def load_leaderboard(benchmark_filter, progress=gr.Progress()):
329
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
330
 
331
  # --- Gradio Interface Definition ---
332
- # Black/Orange Theme and bigger to fit screen
333
  custom_css = """
334
  /* --- Global & Layout (Bigger to fit screen) --- */
335
  body { font-family: 'Inter', sans-serif; background-color: #1a1a1a; color: #f0f0f0; } /* Dark background, light text */
336
  .gradio-container { max-width: 95% !important; margin: auto; padding: 20px; } /* Wider container */
337
- .gr-group {
338
- border-radius: 12px !important;
339
- box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; /* Darker shadow */
340
- border: 1px solid #333 !important; /* Darker border */
341
- background-color: #2a2a2a; /* Darker group background */
342
- }
343
- .gr-panel {
344
- border-radius: 12px !important;
345
- box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important;
346
- border: 1px solid #333 !important;
347
- background-color: #2a2a2a;
348
- }
349
-
350
  /* --- Typography (Orange Hues) --- */
351
  h1 { text-align: center; font-size: 3rem !important; font-weight: 800; color: #ff8c00; margin-bottom: 0.5rem; letter-spacing: -1.5px; } /* Orange title */
352
  h3, h4 { color: #ffa500; } /* Orange headings */
353
  .subtitle { text-align: center; color: #cccccc; font-size: 1.2rem; margin-bottom: 2.5rem; max-width: 900px; margin-left: auto; margin-right: auto;}
354
  label { color: #f0f0f0 !important; } /* Label text color */
355
-
 
 
356
  /* --- Tabs --- */
357
  .gradio-tabs { background-color: #2a2a2a; border-radius: 12px; }
358
- .gradio-tab-item { color: #f0f0f0; }
359
- .gradio-tabs button {
360
- background-color: #3a3a3a !important;
361
- color: #f0f0f0 !important;
362
- border-radius: 8px 8px 0 0 !important;
363
- transition: all 0.3s ease;
364
- }
365
- .gradio-tabs button.selected {
366
- background-color: #ff8c00 !important; /* Orange selected tab */
367
- color: #1a1a1a !important; /* Dark text on orange */
368
- font-weight: 700;
369
- }
370
- .gradio-tabs button:hover { background-color: #555 !important; }
371
-
372
  /* --- Inputs --- */
373
- .gr-textbox, .gr-dropdown, .gr-slider {
374
- background-color: #3a3a3a !important;
375
- color: #f0f0f0 !important;
376
- border: 1px solid #555 !important;
377
- border-radius: 8px !important;
378
- }
379
- .gr-textbox textarea, .gr-textbox input, .gr-dropdown input {
380
- color: #f0f0f0 !important;
381
- }
382
- .gr-textbox.gr-text-input:focus-within {
383
- border-color: #ff8c00 !important; /* Orange focus border */
384
- box-shadow: 0 0 0 2px rgba(255, 140, 0, 0.5) !important;
385
- }
386
-
387
-
388
  /* --- Buttons --- */
389
- .gr-button { font-weight: 600 !important; transition: all 0.2s ease; border-radius: 8px !important; }
390
- .gr-button-primary {
391
- background-color: #ff8c00 !important; /* Orange primary button */
392
- color: #1a1a1a !important;
393
- box-shadow: 0 4px 10px rgba(255, 140, 0, 0.3);
394
- border: none;
395
- }
396
- .gr-button-primary:hover {
397
- transform: translateY(-2px);
398
- box-shadow: 0 6px 15px rgba(255, 140, 0, 0.5);
399
- background-color: #ffa500 !important; /* Slightly lighter orange on hover */
400
- }
401
- .gr-button-secondary {
402
- background-color: #444 !important;
403
- color: #f0f0f0 !important;
404
- border: 1px solid #555 !important;
405
- }
406
- .gr-button-secondary:hover {
407
- background-color: #555 !important;
408
- }
409
-
410
- /* --- Custom Radio Buttons (Segmented Control) --- */
411
- #leaderboard-toggle-group { display: flex; justify-content: center; align-items: center; gap: 1rem; margin-bottom: 1.5rem; }
412
- #leaderboard-toggle { background-color: #3a3a3a; padding: 5px; border-radius: 10px; display: inline-flex; border: 1px solid #555; }
413
- #leaderboard-toggle div.gr-form { display: flex; gap: 5px; }
414
- #leaderboard-toggle input[type='radio'] { display: none; }
415
- #leaderboard-toggle label {
416
- padding: 8px 16px;
417
- border-radius: 8px;
418
- cursor: pointer;
419
- transition: all 0.3s ease;
420
- font-weight: 500;
421
- color: #f0f0f0;
422
- background: transparent;
423
- border: none;
424
- box-shadow: none;
425
- }
426
- #leaderboard-toggle input[type='radio']:checked + label {
427
- background-color: #ff8c00; /* Orange selected */
428
- color: #1a1a1a;
429
- font-weight: 600;
430
- box-shadow: 0 2px 5px rgba(255, 140, 0, 0.3);
431
- }
432
- #leaderboard-toggle label:hover {
433
- background-color: #555;
434
- }
435
-
436
  /* --- Dataframe / Table Styling --- */
437
- .leaderboard-table .gr-dataframe table { border-collapse: collapse; width: 100%; }
438
- .leaderboard-table .gr-dataframe thead th {
439
- background-color: #3a3a3a !important;
440
- color: #ffa500 !important; /* Orange headers */
441
- font-weight: 600 !important;
442
- text-align: left;
443
- padding: 12px 15px;
444
- border-bottom: 2px solid #555;
445
- }
446
- .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #2f2f2f; } /* Alternating row color */
447
- .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #4a4a4a; } /* Hover effect */
448
- .leaderboard-table .gr-dataframe tbody td {
449
- padding: 12px 15px;
450
- border-bottom: 1px solid #3a3a3a;
451
- color: #f0f0f0;
452
- }
453
- .leaderboard-table .gr-dataframe tbody td:first-child { font-weight: 700; color: #ffcc99; } /* Lighter orange for rank */
454
-
455
  /* --- Error & Result Panes --- */
456
- #error-display-box {
457
- background-color: #4a1e1e !important; /* Dark red for error */
458
- border-color: #8c2f2f !important;
459
- color: #ffc9c9 !important; /* Lighter red text */
460
- }
461
- #result-summary-box {
462
- background-color: #1e3a2a !important; /* Dark green for success */
463
- border-color: #2f8c4a !important;
464
- color: #c9ffc9 !important; /* Lighter green text */
465
- }
466
- .gr-markdown p { color: #f0f0f0 !important; } /* Ensure markdown paragraph text is visible */
467
- .gr-markdown strong { color: #ffa500 !important; } /* Strong text in orange */
468
- .gradio-message { background-color: #ff8c00 !important; color: #1a1a1a !important; border: 1px solid #ff8c00 !important; } /* Gradio Info messages */
469
  """
470
 
471
  with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
472
  gr.Markdown("<h1>πŸ† SuperBench Eval: Evaluate models and view leaderboards πŸ†</h1>")
473
  gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
474
-
475
  with gr.Tabs() as tabs:
476
  # --- Leaderboard Tab ---
477
  with gr.TabItem("πŸ“Š Leaderboard", id=0):
478
  with gr.Column():
479
- with gr.Row(elem_id="leaderboard-toggle-group"):
480
- # Temporarily remove MMLU-Pro from radio options
481
  leaderboard_type_toggle = gr.Radio(
482
- ["MMLU"],
483
- label="Select Benchmark",
484
- value="MMLU",
485
- interactive=True,
486
- elem_id="leaderboard-toggle",
487
- container=False,
488
- show_label=False,
489
  )
490
  refresh_button = gr.Button("πŸ”„ Refresh", size="sm")
491
  leaderboard_table_output = gr.DataFrame(
492
  headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
493
- interactive=False,
494
- datatype=["number", "str", "str", "number", "str"],
495
- row_count=15, # Adjusted for more rows
496
- elem_classes="leaderboard-table",
497
- # Removed col_count to allow dynamic width
498
  )
499
-
500
  # --- Evaluation Tab ---
501
  with gr.TabItem("πŸš€ Run Evaluation", id=1):
502
  with gr.Row(variant='panel'):
@@ -504,77 +378,71 @@ with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
504
  with gr.Group():
505
  gr.Markdown("### 1. Configure Evaluation")
506
  model_id_input = gr.Textbox(
507
- label="Hugging Face Model ID",
508
- placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
509
- interactive=True,
510
- scale=2 # Increased scale for textbox
511
  )
512
- # Temporarily remove MMLU-Pro from radio options
513
  benchmark_selection_radio = gr.Radio(
514
- ["MMLU"],
515
- label="Benchmark",
516
- value="MMLU",
517
- interactive=True,
518
  )
519
  with gr.Row():
520
  benchmark_subject_dropdown = gr.Dropdown(
521
- label="Subject",
522
- # Ensure only MMLU subjects are fetched
523
- choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
524
- value="ALL",
525
- interactive=True
526
  )
527
  sample_count_slider = gr.Slider(
528
- label="Samples per Subject",
529
- minimum=5, maximum=100, value=25, step=5, interactive=True
530
  )
531
  run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
532
-
533
  with gr.Column(scale=3):
534
  gr.Markdown("### 2. View Results")
535
-
536
  # Panel for displaying the summary of results
537
  with gr.Group(visible=False) as result_summary_box:
538
  result_summary_output = gr.Markdown(elem_id="result-summary-box")
539
-
540
  # Panel for displaying errors
541
  with gr.Group(visible=False) as error_box:
542
  error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
543
  error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
544
-
545
  # Panel for detailed, row-by-row results
546
  with gr.Group(visible=False) as details_box:
547
  gr.Markdown("#### Detailed Evaluation Log")
548
  detailed_results_df = gr.DataFrame(
549
  headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
550
  datatype=["str", "str", "str", "str", "str"],
551
- interactive=False,
552
- row_count=10, # Adjusted for more rows
553
- # Removed col_count to allow dynamic width
554
- wrap=True,
555
  )
556
 
557
- # --- Event Handlers & Logic ---
558
- # Update subject dropdown when benchmark type changes
559
  benchmark_selection_radio.change(
560
  fn=update_subject_dropdown,
561
  inputs=[benchmark_selection_radio],
562
  outputs=[benchmark_subject_dropdown]
563
  )
564
-
565
- # Main evaluation trigger
566
  run_button.click(
567
  fn=run_evaluation,
568
  inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
569
- outputs=[result_summary_output, error_box, error_output, error_details_output, details_box, detailed_results_df]
570
- ).then(
571
- # After evaluation, switch to the leaderboard tab and refresh it
572
- lambda: gr.update(selected=0), outputs=[tabs]
 
 
573
  ).then(
 
574
  load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]
575
  )
576
-
577
- # Leaderboard loading logic
578
  demo.load(
579
  fn=load_leaderboard,
580
  inputs=[leaderboard_type_toggle],
@@ -593,6 +461,5 @@ with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
593
  show_progress='full'
594
  )
595
 
596
- # Launch the Gradio app
597
  if __name__ == "__main__":
598
- demo.launch(debug=True)
 
14
  # It's good practice to ensure the cache directory exists.
15
  CACHE_DIR = "evaluation_cache"
16
  os.makedirs(CACHE_DIR, exist_ok=True)
17
+ EVAL_FILE = os.path.join(CACHE_DIR, "evals.jsonl")
18
 
19
  # Cache to avoid reloading models and dataset configs
20
  model_cache = {}
 
25
 
26
  # --- Constants for Benchmarks ---
27
  MMLU_DATASET = "cais/mmlu"
 
 
28
  BENCHMARK_MAP = {
29
  "MMLU": MMLU_DATASET,
 
30
  }
31
 
32
  # --- Data Loading and Preparation ---
33
+
34
  def get_all_benchmark_options():
35
  """
36
  Fetches and caches the available subjects (configs) for each benchmark dataset.
 
39
  if benchmark_subject_cache:
40
  return benchmark_subject_cache
41
  print("Fetching benchmark configurations for the first time...")
 
 
42
  for key, dataset_id in BENCHMARK_MAP.items():
43
  try:
 
44
  subjects = get_dataset_config_names(dataset_id, token=HF_TOKEN)
45
+ benchmark_subject_cache[key] = ["ALL"] + sorted([s for s in subjects if s != 'all'])
46
  except Exception as e:
47
  print(f"Warning: Could not load configs for {key} ({dataset_id}). It might be private or unavailable. Error: {e}")
48
+ benchmark_subject_cache[key] = ["ALL"]
49
  print("Benchmark configurations cached.")
50
  return benchmark_subject_cache
51
 
 
60
  """
61
  if not model_id:
62
  raise ValueError("Model ID cannot be empty.")
63
+ gr.Info(f"Attempting to load model: {model_id}...")
64
  if model_id in model_cache:
65
  gr.Info(f"Model '{model_id}' found in cache.")
66
  return model_cache[model_id]
67
  try:
 
68
  dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
 
69
  tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN, trust_remote_code=True)
70
  model = AutoModelForCausalLM.from_pretrained(
71
  model_id,
72
  token=HF_TOKEN,
73
  torch_dtype=dtype,
74
  trust_remote_code=True,
75
+ low_cpu_mem_usage=True,
76
  ).to("cuda" if torch.cuda.is_available() else "cpu")
 
 
77
  generator = pipeline(
78
+ "text-generation",
79
+ model=model,
80
+ tokenizer=tokenizer,
81
  device=0 if torch.cuda.is_available() else -1
82
  )
 
83
  model_cache[model_id] = generator
84
  gr.Info(f"Model '{model_id}' loaded successfully.")
85
  return generator
86
  except Exception as e:
87
+ raise RuntimeError(f"Failed to load model '{model_id}'. Please verify the model ID and your Hugging Face token. Error: {e}")
 
88
 
89
  # --- Evaluation Logic ---
90
+
91
  def format_prompt(item):
92
  """Formats the MMLU question and choices into a standardized prompt."""
93
  prompt = f"Question: {item['question']}\n\nChoices:\nA. {item['choices'][0]}\nB. {item['choices'][1]}\nC. {item['choices'][2]}\nD. {item['choices'][3]}\n\nAnswer:"
 
98
  return chr(ord('A') + index) if 0 <= index <= 3 else None
99
 
100
  def extract_predicted_letter(output_text):
101
+ """Extracts the predicted letter from the model's output."""
 
 
 
 
102
  match = re.search(r"Answer:\s*([ABCD])", output_text.strip(), re.IGNORECASE)
103
  if match:
104
  return match.group(1).upper()
 
 
105
  match = re.search(r"^\s*([ABCD])\b", output_text.strip())
106
  if match:
107
  return match.group(1).upper()
108
  return None
109
 
110
+ def make_progress_html(text, percentage):
111
+ """Helper function to create the HTML for the progress bar."""
112
+ return f"""
113
+ <div class="progress-container">
114
+ <div class="progress-bar" style="width: {percentage}%;">
115
+ {text}
116
+ </div>
117
+ </div>
118
  """
119
 
120
  @spaces.GPU()
121
+ def run_evaluation(model_id, benchmark_category, subject_name, sample_count):
122
  """
123
+ Main generator function to orchestrate the evaluation, yielding progress updates.
 
 
124
  """
125
  try:
126
+ # 1. Initial yield to set up the UI for loading state
127
+ yield {
128
+ progress_box: gr.update(visible=True),
129
+ progress_text_output: gr.update(value=f"Preparing evaluation for **{model_id}**..."),
130
+ progress_bar_output: gr.update(value=make_progress_html("Loading Model...", 0)),
131
+ result_summary_box: gr.update(visible=False),
132
+ details_box: gr.update(visible=False),
133
+ error_box: gr.update(visible=False),
134
+ }
135
+
136
  generator = load_model(model_id)
 
137
  dataset_id = BENCHMARK_MAP.get(benchmark_category)
138
  if not dataset_id:
139
  raise ValueError(f"Invalid benchmark category: {benchmark_category}")
140
141
  subjects_to_run = []
142
  if subject_name == "ALL":
 
143
  subjects_to_run = [s for s in ALL_BENCHMARK_SUBJECTS.get(benchmark_category, []) if s != "ALL"]
144
  else:
145
  subjects_to_run = [subject_name]
146
 
147
  if not subjects_to_run:
148
  gr.Warning(f"No subjects found for '{benchmark_category}'.")
149
+ yield { progress_box: gr.update(visible=False) }
150
+ return
 
 
 
 
151
 
152
+ all_results_details = []
153
+ summary_lines = []
154
+ total_correct = 0
155
+ total_samples = 0
156
+
157
+ # 2. Main evaluation loop
158
  for i, subject in enumerate(subjects_to_run):
159
+ overall_progress_text = f"**Overall Progress ({i+1}/{len(subjects_to_run)} subjects)**"
160
+ yield {
161
+ progress_text_output: gr.update(value=f"{overall_progress_text}\n\nLoading dataset for **{subject}**...")
162
+ }
163
+
164
  try:
165
+ # Load dataset for the current subject
166
+ dataset = load_dataset(dataset_id, subject, token=HF_TOKEN, split="test")
167
+ num_samples = min(sample_count, len(dataset))
168
+ dataset = dataset.shuffle(seed=42).select(range(num_samples))
169
+
170
+ correct_predictions_subject = 0
171
+ subject_details = []
172
+
173
+ # Loop over samples within the subject
174
+ for j, item in enumerate(dataset):
175
+ prompt, correct_answer_idx = format_prompt(item)
176
+ expected_letter = get_choice_letter(correct_answer_idx)
177
+
178
+ full_prompt_text = generator.tokenizer.decode(generator.tokenizer.encode(prompt), skip_special_tokens=True)
179
+ raw_output = generator(prompt, max_new_tokens=5, do_sample=False, pad_token_id=generator.tokenizer.eos_token_id)[0]["generated_text"]
180
+ generated_text_only = raw_output[len(full_prompt_text):].strip()
181
+ predicted_letter = extract_predicted_letter(generated_text_only)
182
+
183
+ is_correct = (predicted_letter == expected_letter)
184
+ if is_correct:
185
+ correct_predictions_subject += 1
186
 
187
+ subject_details.append({
188
+ "Question": item['question'],
189
+ "Correct": "βœ…" if is_correct else "❌",
190
+ "Expected": expected_letter,
191
+ "Predicted": predicted_letter or "N/A",
192
+ "Model Output": generated_text_only
193
+ })
194
+
195
+ # Yield progress update for each sample
196
+ percentage = ((j + 1) / num_samples) * 100
197
+ progress_bar_text = f"Evaluating: {subject} ({j+1}/{num_samples})"
198
+ yield {
199
+ progress_bar_output: gr.update(value=make_progress_html(f"{percentage:.1f}%", percentage)),
200
+ progress_text_output: gr.update(value=f"{overall_progress_text}\n\n{progress_bar_text}")
201
+ }
202
+
203
+ accuracy = (correct_predictions_subject / num_samples) * 100 if num_samples > 0 else 0
204
+ all_results_details.extend(subject_details)
205
+ total_correct += correct_predictions_subject
206
+ total_samples += num_samples
207
+ summary_lines.append(f"- **{subject}**: {accuracy:.2f}% ({correct_predictions_subject}/{num_samples})")
208
+
209
  except Exception as e:
210
  error_trace = traceback.format_exc()
211
  gr.Error(f"Skipping {subject} due to an error: {e}")
212
  summary_lines.append(f"- **{subject}**: Evaluation failed. See logs for details:\n```\n{error_trace}\n```")
213
  continue
214
+
215
+ # 3. Final processing and result preparation
216
  overall_accuracy = (total_correct / total_samples) * 100 if total_samples > 0 else 0
217
+
 
218
  if subject_name == "ALL":
219
  result_summary = f"### Overall Average Accuracy: {overall_accuracy:.2f}%\n"
220
  result_summary += f"across {total_samples:,} total samples from {len(subjects_to_run)} subjects.\n\n---\n\n**Breakdown by Subject:**\n"
 
222
  else:
223
  result_summary = f"### Accuracy for {benchmark_category} - {subject_name}: {overall_accuracy:.2f}%\n"
224
  result_summary += f"({total_correct:,}/{total_samples:,} correct)"
225
+
226
+ # Write final result to the JSONL file
227
  record = {
228
  "model_id": model_id,
229
  "benchmark": benchmark_category,
230
  "accuracy": overall_accuracy,
231
+ "subject": subject_name,
232
  "sample_count": total_samples,
233
  "timestamp": datetime.now().isoformat()
234
  }
235
  with open(EVAL_FILE, "a") as f:
236
  f.write(json.dumps(record) + "\n")
237
+
238
  gr.Info("Evaluation completed successfully!")
 
239
  df_details = pd.DataFrame(all_results_details)
240
+
241
+ # 4. Final yield to show results and hide progress UI
242
+ yield {
243
+ progress_box: gr.update(visible=False),
244
+ result_summary_box: gr.update(visible=True),
245
+ result_summary_output: gr.update(value=result_summary),
246
  details_box: gr.update(visible=True),
247
+ detailed_results_df: gr.update(value=df_details),
248
+ error_box: gr.update(visible=False)
249
  }
250
+
251
  except Exception as e:
252
+ error_message = f"An unexpected error occurred: {e}"
253
  error_details = traceback.format_exc()
254
  gr.Error(error_message)
255
+
256
+ # Yield to show error message and hide progress UI
257
+ yield {
258
+ progress_box: gr.update(visible=False),
259
+ result_summary_box: gr.update(visible=False),
260
+ details_box: gr.update(visible=False),
261
  error_box: gr.update(visible=True),
262
  error_output: gr.update(value=error_message),
263
  error_details_output: gr.update(value=error_details),
 
264
  }
265
 
266
  # --- UI Helper Functions ---
267
+
268
  def update_subject_dropdown(benchmark_category):
269
  """Updates the subject dropdown choices based on the selected benchmark."""
270
  choices = ALL_BENCHMARK_SUBJECTS.get(benchmark_category, [])
 
274
  def load_leaderboard(benchmark_filter, progress=gr.Progress()):
275
  """
276
  Loads and processes evaluation data to display on the leaderboard.
 
277
  """
278
  progress(0, desc="Loading Leaderboard...")
279
  try:
280
  if not os.path.exists(EVAL_FILE):
281
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
282
+
283
  df = pd.read_json(EVAL_FILE, lines=True)
284
  if df.empty:
285
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
286
+
 
287
  df['accuracy'] = pd.to_numeric(df['accuracy'], errors='coerce')
288
  df.dropna(subset=['accuracy'], inplace=True)
289
+
290
+ # Filter for 'ALL' subject runs for the selected benchmark
291
  df_filtered = df[(df['benchmark'] == benchmark_filter) & (df['subject'] == 'ALL')].copy()
292
+
293
  if df_filtered.empty:
294
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
295
+
 
296
  df_filtered['timestamp'] = pd.to_datetime(df_filtered['timestamp'])
297
  latest_evals = df_filtered.loc[df_filtered.groupby('model_id')['timestamp'].idxmax()].copy()
298
+
299
  leaderboard_df = latest_evals.sort_values(by="accuracy", ascending=False).copy()
300
+
 
301
  leaderboard_df.insert(0, 'Rank', range(1, len(leaderboard_df) + 1))
 
302
  leaderboard_df.rename(columns={
303
  'model_id': 'Model ID',
304
  'accuracy': 'Avg. Accuracy (%)',
305
  'sample_count': 'Total Samples',
306
  'timestamp': 'Date'
307
  }, inplace=True)
308
+
309
  leaderboard_df['Avg. Accuracy (%)'] = leaderboard_df['Avg. Accuracy (%)'].map('{:.2f}'.format)
310
  leaderboard_df['Date'] = leaderboard_df['Date'].dt.strftime('%Y-%m-%d')
311
+
312
  progress(1, desc="Done.")
313
  return leaderboard_df[['Rank', 'Model ID', 'Avg. Accuracy (%)', 'Total Samples', 'Date']]
314
  except Exception as e:
 
317
  return pd.DataFrame(columns=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"])
318
 
319
  # --- Gradio Interface Definition ---
 
320
  custom_css = """
321
  /* --- Global & Layout (Bigger to fit screen) --- */
322
  body { font-family: 'Inter', sans-serif; background-color: #1a1a1a; color: #f0f0f0; } /* Dark background, light text */
323
  .gradio-container { max-width: 95% !important; margin: auto; padding: 20px; } /* Wider container */
324
+ .gr-group { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; border: 1px solid #333 !important; background-color: #2a2a2a; }
325
+ .gr-panel { border-radius: 12px !important; box-shadow: 0 4px 12px rgba(0,0,0,0.3) !important; border: 1px solid #333 !important; background-color: #2a2a2a; }
326
  /* --- Typography (Orange Hues) --- */
327
  h1 { text-align: center; font-size: 3rem !important; font-weight: 800; color: #ff8c00; margin-bottom: 0.5rem; letter-spacing: -1.5px; } /* Orange title */
328
  h3, h4 { color: #ffa500; } /* Orange headings */
329
  .subtitle { text-align: center; color: #cccccc; font-size: 1.2rem; margin-bottom: 2.5rem; max-width: 900px; margin-left: auto; margin-right: auto;}
330
  label { color: #f0f0f0 !important; } /* Label text color */
331
+ /* --- Progress Bar --- */
332
+ .progress-container { background-color: #3a3a3a; border-radius: 8px; overflow: hidden; border: 1px solid #555; height: 28px; padding: 4px; }
333
+ .progress-bar { background: linear-gradient(90deg, #ff8c00, #ffa500); height: 100%; border-radius: 5px; transition: width 0.3s ease-in-out; display: flex; align-items: center; justify-content: center; color: #1a1a1a; font-weight: 600; font-size: 0.9rem; }
334
  /* --- Tabs --- */
335
  .gradio-tabs { background-color: #2a2a2a; border-radius: 12px; }
336
+ .gradio-tabs button { background-color: #3a3a3a !important; color: #f0f0f0 !important; border-radius: 8px 8px 0 0 !important; transition: all 0.3s ease; }
337
+ .gradio-tabs button.selected { background-color: #ff8c00 !important; color: #1a1a1a !important; font-weight: 700; }
338
  /* --- Inputs --- */
339
+ .gr-textbox, .gr-dropdown, .gr-slider { background-color: #3a3a3a !important; color: #f0f0f0 !important; border: 1px solid #555 !important; border-radius: 8px !important; }
340
  /* --- Buttons --- */
341
+ .gr-button-primary { background-color: #ff8c00 !important; color: #1a1a1a !important; box-shadow: 0 4px 10px rgba(255, 140, 0, 0.3); border: none; }
342
+ .gr-button-primary:hover { transform: translateY(-2px); box-shadow: 0 6px 15px rgba(255, 140, 0, 0.5); background-color: #ffa500 !important; }
343
  /* --- Dataframe / Table Styling --- */
344
+ .leaderboard-table .gr-dataframe thead th { background-color: #3a3a3a !important; color: #ffa500 !important; font-weight: 600 !important; text-align: left; padding: 12px 15px; border-bottom: 2px solid #555; }
345
+ .leaderboard-table .gr-dataframe tbody tr:nth-of-type(even) { background-color: #2f2f2f; }
346
+ .leaderboard-table .gr-dataframe tbody tr:hover { background-color: #4a4a4a; }
347
+ .leaderboard-table .gr-dataframe tbody td { padding: 12px 15px; border-bottom: 1px solid #3a3a3a; color: #f0f0f0; }
348
  /* --- Error & Result Panes --- */
349
+ #error-display-box { background-color: #4a1e1e !important; border-color: #8c2f2f !important; color: #ffc9c9 !important; }
350
+ #result-summary-box { background-color: #1e3a2a !important; border-color: #2f8c4a !important; color: #c9ffc9 !important; }
351
+ .gr-markdown p { color: #f0f0f0 !important; } .gr-markdown strong { color: #ffa500 !important; }
352
+ .gradio-message { background-color: #ff8c00 !important; color: #1a1a1a !important; border: 1px solid #ff8c00 !important; }
353
  """
354
 
355
  with gr.Blocks(theme=gr.themes.Base(), css=custom_css) as demo:
356
  gr.Markdown("<h1>πŸ† SuperBench Eval: Evaluate models and view leaderboards πŸ†</h1>")
357
  gr.Markdown("<p class='subtitle'>Benchmark leading models on MMLU. Your results contribute to a live leaderboard. Select a benchmark and run an evaluation, or view the current standings.</p>")
358
+
359
  with gr.Tabs() as tabs:
360
  # --- Leaderboard Tab ---
361
  with gr.TabItem("πŸ“Š Leaderboard", id=0):
362
  with gr.Column():
363
+ with gr.Row():
 
364
  leaderboard_type_toggle = gr.Radio(
365
+ ["MMLU"], label="Select Benchmark", value="MMLU", interactive=True
366
  )
367
  refresh_button = gr.Button("πŸ”„ Refresh", size="sm")
368
  leaderboard_table_output = gr.DataFrame(
369
  headers=["Rank", "Model ID", "Avg. Accuracy (%)", "Total Samples", "Date"],
370
+ interactive=False, datatype=["number", "str", "str", "number", "str"],
371
+ row_count=15, elem_classes="leaderboard-table",
 
 
 
372
  )
373
+
374
  # --- Evaluation Tab ---
375
  with gr.TabItem("πŸš€ Run Evaluation", id=1):
376
  with gr.Row(variant='panel'):
 
378
  with gr.Group():
379
  gr.Markdown("### 1. Configure Evaluation")
380
  model_id_input = gr.Textbox(
381
+ label="Hugging Face Model ID", placeholder="e.g., meta-llama/Meta-Llama-3-8B-Instruct",
382
+ interactive=True, scale=2
 
 
383
  )
 
384
  benchmark_selection_radio = gr.Radio(
385
+ ["MMLU"], label="Benchmark", value="MMLU", interactive=True
 
 
 
386
  )
387
  with gr.Row():
388
  benchmark_subject_dropdown = gr.Dropdown(
389
+ label="Subject", choices=ALL_BENCHMARK_SUBJECTS.get("MMLU", []),
390
+ value="ALL", interactive=True
 
 
 
391
  )
392
  sample_count_slider = gr.Slider(
393
+ label="Samples per Subject", minimum=5, maximum=100, value=10, step=5, interactive=True
 
394
  )
395
  run_button = gr.Button("Start Evaluation", variant="primary", scale=1)
396
+
397
  with gr.Column(scale=3):
398
  gr.Markdown("### 2. View Results")
399
+
400
+ # NEW: Progress Bar UI
401
+ with gr.Group(visible=False) as progress_box:
402
+ progress_text_output = gr.Markdown("Starting...")
403
+ progress_bar_output = gr.HTML(make_progress_html("Waiting...", 0))
404
+
405
  # Panel for displaying the summary of results
406
  with gr.Group(visible=False) as result_summary_box:
407
  result_summary_output = gr.Markdown(elem_id="result-summary-box")
408
+
409
  # Panel for displaying errors
410
  with gr.Group(visible=False) as error_box:
411
  error_output = gr.Textbox(label="Error Message", interactive=False, elem_id="error-display-box")
412
  error_details_output = gr.Textbox(label="Error Details (Traceback)", interactive=False, lines=8)
413
+
414
  # Panel for detailed, row-by-row results
415
  with gr.Group(visible=False) as details_box:
416
  gr.Markdown("#### Detailed Evaluation Log")
417
  detailed_results_df = gr.DataFrame(
418
  headers=["Question", "Correct", "Expected", "Predicted", "Model Output"],
419
  datatype=["str", "str", "str", "str", "str"],
420
+ interactive=False, row_count=10, wrap=True,
 
 
 
421
  )
422
 
423
+ # --- Event Handlers & Logic ---
 
424
  benchmark_selection_radio.change(
425
  fn=update_subject_dropdown,
426
  inputs=[benchmark_selection_radio],
427
  outputs=[benchmark_subject_dropdown]
428
  )
429
+
430
+ # Main evaluation trigger, now handles a generator for progress updates
431
  run_button.click(
432
  fn=run_evaluation,
433
  inputs=[model_id_input, benchmark_selection_radio, benchmark_subject_dropdown, sample_count_slider],
434
+ outputs=[
435
+ progress_box, progress_text_output, progress_bar_output,
436
+ result_summary_box, result_summary_output,
437
+ error_box, error_output, error_details_output,
438
+ details_box, detailed_results_df
439
+ ]
440
  ).then(
441
+ # After evaluation, refresh the leaderboard
442
  load_leaderboard, inputs=[leaderboard_type_toggle], outputs=[leaderboard_table_output]
443
  )
444
+
445
+ # --- Leaderboard Loading Logic ---
446
  demo.load(
447
  fn=load_leaderboard,
448
  inputs=[leaderboard_type_toggle],
 
461
  show_progress='full'
462
  )
463
 
 
464
  if __name__ == "__main__":
465
+ demo.launch(debug=True)
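For reference, the scoring path that is unchanged across both versions formats each MMLU item into a Question / Choices / Answer: prompt and recovers the model's pick with two regular expressions: an explicit "Answer: X" first, then a bare leading letter as a fallback. Below is a self-contained sketch of that extraction step; the regexes are the ones used by extract_predicted_letter, while the wrapper script and the sample outputs are made up for illustration.

import re

def extract_predicted_letter(output_text: str):
    text = output_text.strip()
    # Preferred form: the model echoes "Answer: X".
    match = re.search(r"Answer:\s*([ABCD])", text, re.IGNORECASE)
    if match:
        return match.group(1).upper()
    # Fallback: the model replies with just a leading letter.
    match = re.search(r"^\s*([ABCD])\b", text)
    if match:
        return match.group(1).upper()
    return None

if __name__ == "__main__":
    for sample in ["Answer: C", " B) because ...", "The answer is D"]:
        print(repr(sample), "->", extract_predicted_letter(sample))
    # Prints C, B, then None: the last phrasing is caught by neither pattern.
    # app.py mitigates this by ending its prompt with "Answer:" and generating
    # only a few tokens (max_new_tokens=5, greedy decoding).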