tathagataraha committed on
Commit 930ed8c · 2 Parent(s): 197145f b2c9d9e
.gitignore CHANGED
@@ -15,4 +15,4 @@ eval-results-local/
15
  medic-harness-requests/
16
  medic-harness-results/
17
  logs/
18
- newharness/results/
 
15
  medic-harness-requests/
16
  medic-harness-results/
17
  logs/
18
+ newharness/results/
app.py CHANGED
@@ -53,8 +53,23 @@ from src.display.utils import (
53
  Precision,
54
  WeightType,
55
  fields,
56
- render_generation_templates
57
  )
 
58
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, PRIVATE_REPO
59
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
60
  from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG
@@ -106,6 +121,24 @@ _, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQ
106
  healthbench_hard_leaderboard_df = healthbench_hard_original_df.copy()
107
 
108

109
  # breakpoint()
110
  # # Token based results
111
  # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
@@ -121,7 +154,7 @@ healthbench_hard_leaderboard_df = healthbench_hard_original_df.copy()
121
  pending_eval_queue_df,
122
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
123
 
124
- # breakpoint()
125
  def update_df(shown_columns, subset="datasets"):
126
  # changes to be made here
127
  if subset == "datasets":
@@ -148,16 +181,27 @@ def update_df(shown_columns, subset="datasets"):
148
  elif subset == "healthbench_hard":
149
  leaderboard_table_df = healthbench_hard_leaderboard_df.copy()
150
  hidden_leader_board_df = healthbench_hard_original_df
151
- # else:
152
- # match evaluation_metric:
153
- # case "Span Based":
154
- # leaderboard_table_df = span_based_types_leaderboard_df.copy()
155
- # hidden_leader_board_df = span_based_types_original_df
156
- # case "Token Based":
157
- # leaderboard_table_df = token_based_types_leaderboard_df.copy()
158
- # hidden_leader_board_df = token_based_types_original_df
159
- # case _:
160
- # pass
161
 
162
 
163
  value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
@@ -267,128 +311,132 @@ def filter_models(
267
  demo = gr.Blocks(css=custom_css)
268
  with demo:
269
  print("hello")
270
- if PRIVATE_REPO:
271
- gr.HTML(TITLE)
272
  gr.HTML(LOGO)
273
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
274
 
275
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
276
  with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
277
- with gr.Row():
278
- with gr.Column():
279
- with gr.Row():
280
- search_bar = gr.Textbox(
281
- placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
282
- show_label=False,
283
- elem_id="search-bar",
284
  )
285
- with gr.Row():
286
- shown_columns = gr.CheckboxGroup(
287
- choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
288
- value=[
289
- c.name
290
- for c in fields(AutoEvalColumn)
291
- if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
292
- ],
293
- label="Select columns to show",
294
- elem_id="column-select",
295
- interactive=True,
296
  )
297
- # with gr.Row():
298
- # deleted_models_visibility = gr.Checkbox(
299
- # value=False, label="Show gated/private/deleted models", interactive=True
300
- # )
301
- with gr.Column(min_width=320):
302
- # with gr.Box(elem_id="box-filter"):
303
- filter_columns_type = gr.CheckboxGroup(
304
- label="Model Types",
305
- choices=[t.to_str() for t in ModelType],
306
- value=[t.to_str() for t in ModelType],
307
- interactive=True,
308
- elem_id="filter-columns-type",
309
- )
310
- # filter_columns_architecture = gr.CheckboxGroup(
311
- # label="Architecture Types",
312
- # choices=[i.value.name for i in ModelArch],
313
- # value=[i.value.name for i in ModelArch],
314
- # interactive=True,
315
- # elem_id="filter-columns-architecture",
316
- # )
317
- filter_domain_specific = gr.CheckboxGroup(
318
- label="Domain Specificity",
319
- choices=["🏥 Clinical models", "Generic models"],
320
- value=["🏥 Clinical models", "Generic models"],
321
- interactive=True,
322
- elem_id="filter-columns-type",
323
- )
324
- filter_columns_size = gr.CheckboxGroup(
325
- label="Model sizes (in billions of parameters)",
326
- choices=list(NUMERIC_INTERVALS.keys()),
327
- value=list(NUMERIC_INTERVALS.keys()),
328
- interactive=True,
329
- elem_id="filter-columns-size",
330
- )
331
 
332
- datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="open_ended")
333
 
334
- leaderboard_table = gr.components.Dataframe(
335
- value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
336
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
337
- datatype=TYPES,
338
- elem_id="leaderboard-table",
339
- interactive=False,
340
- visible=True,
341
- )
342
 
343
- # Dummy leaderboard for handling the case when the user uses backspace key
344
- hidden_leaderboard_table_for_search = gr.components.Dataframe(
345
- value=datasets_original_df[OPEN_ENDED_COLS],
346
- headers=OPEN_ENDED_COLS,
347
- datatype=TYPES,
348
- visible=False,
349
- )
350
 
351
-
352
- search_bar.submit(
353
- update_table,
354
- [
355
- hidden_leaderboard_table_for_search,
356
- shown_columns,
357
- search_bar,
358
- filter_columns_type,
359
- filter_domain_specific,
360
- filter_columns_size
361
- # filter_columns_architecture
362
- ],
363
- leaderboard_table,
364
- )
365
- for selector in [
366
- shown_columns,
367
- filter_columns_type,
368
- filter_domain_specific,
369
- # filter_columns_architecture,
370
- filter_columns_size,
371
- # deleted_models_visibility,
372
- ]:
373
- selector.change(
374
- update_table,
375
- [
376
- hidden_leaderboard_table_for_search,
377
- shown_columns,
378
- search_bar,
379
- filter_columns_type,
380
- filter_domain_specific,
381
- filter_columns_size
382
- # filter_columns_architecture,
383
- ],
384
- leaderboard_table,
385
- queue=True,
386
- )
387
- with gr.Accordion("💬 Generation templates", open=False):
388
- with gr.Accordion("Response generation", open=False):
389
- system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="response_generation")
390
- with gr.Accordion("Scoring Rubric", open=False):
391
- system_prompt, user_prompt = render_generation_templates(task="open_ended", generation_type="scoring_rubric")
392
  with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
393
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
394
  with gr.Row():
@@ -506,6 +554,7 @@ with demo:
506
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
507
  with gr.Accordion("Cross Examination", open=False):
508
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
 
509
  with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=3):
510
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
511
  with gr.Tabs(elem_classes="tab-buttons2") as tabs:
@@ -736,6 +785,7 @@ with demo:
736
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
737
  with gr.Accordion("Cross Examination", open=False):
738
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
 
739
  with gr.TabItem("🏅 HealthBench", elem_id="llm-benchmark-tab-table", id=4):
740
  gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
741
  with gr.Tabs(elem_classes="tab-buttons2") as tabs:
@@ -1073,7 +1123,206 @@ with demo:
1073
  with gr.Accordion("Scoring Rubric", open=False):
1074
  system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
1075
 
1076
- with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-table", id=6):
1077
  with gr.Row():
1078
  with gr.Column():
1079
  with gr.Row():
@@ -1308,4 +1557,4 @@ with demo:
1308
  scheduler = BackgroundScheduler()
1309
  scheduler.add_job(restart_space, "interval", seconds=1800)
1310
  scheduler.start()
1311
- demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
 
53
  Precision,
54
  WeightType,
55
  fields,
56
+ render_generation_templates,
57
+ OpenEndedArabic_COLS,
58
+ OpenEndedArabic_BENCHMARK_COLS,
59
+ OpenEndedFrench_COLS,
60
+ OpenEndedFrench_BENCHMARK_COLS,
61
+ OpenEndedPortuguese_COLS,
62
+ OpenEndedPortuguese_BENCHMARK_COLS,
63
+ OpenEndedRomanian_COLS,
64
+ OpenEndedRomanian_BENCHMARK_COLS,
65
+ OpenEndedGreek_COLS,
66
+ OpenEndedGreek_BENCHMARK_COLS,
67
+ OpenEndedSpanish_COLS,
68
+ OpenEndedSpanish_BENCHMARK_COLS,
69
+ ClosedEndedMultilingual_COLS,
70
+ ClosedEndedMultilingual_BENCHMARK_COLS,
71
  )
72
+
73
  from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, PRIVATE_REPO
74
  from src.populate import get_evaluation_queue_df, get_leaderboard_df
75
  from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG
 
121
  healthbench_hard_leaderboard_df = healthbench_hard_original_df.copy()
122
 
123
 
124
+ _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
125
+ _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
126
+ _, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
127
+ _, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
128
+ _, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
129
+ _, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
130
+ _, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
131
+
132
+
133
+ open_ended_arabic_leaderboard_df = open_ended_arabic_df.copy()
134
+ open_ended_french_leaderboard_df = open_ended_french_df.copy()
135
+ open_ended_portuguese_leaderboard_df = open_ended_portuguese_df.copy()
136
+ open_ended_romanian_leaderboard_df = open_ended_romanian_df.copy()
137
+ open_ended_greek_leaderboard_df = open_ended_greek_df.copy()
138
+ open_ended_spanish_leaderboard_df = open_ended_spanish_df.copy()
139
+ closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
140
+
141
+
142
  # breakpoint()
143
  # # Token based results
144
  # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
 
154
  pending_eval_queue_df,
155
  ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
156
 
157
+ breakpoint()
158
  def update_df(shown_columns, subset="datasets"):
159
  # changes to be made here
160
  if subset == "datasets":
 
181
  elif subset == "healthbench_hard":
182
  leaderboard_table_df = healthbench_hard_leaderboard_df.copy()
183
  hidden_leader_board_df = healthbench_hard_original_df
184
+ elif subset == "open_ended_arabic":
185
+ leaderboard_table_df = open_ended_arabic_df.copy()
186
+ hidden_leader_board_df = open_ended_arabic_df
187
+ elif subset == "open_ended_french":
188
+ leaderboard_table_df = open_ended_french_df.copy()
189
+ hidden_leader_board_df = open_ended_french_df
190
+ elif subset == "open_ended_portuguese":
191
+ leaderboard_table_df = open_ended_portuguese_df.copy()
192
+ hidden_leader_board_df = open_ended_portuguese_df
193
+ elif subset == "open_ended_romanian":
194
+ leaderboard_table_df = open_ended_romanian_df.copy()
195
+ hidden_leader_board_df = open_ended_romanian_df
196
+ elif subset == "open_ended_greek":
197
+ leaderboard_table_df = open_ended_greek_df.copy()
198
+ hidden_leader_board_df = open_ended_greek_df
199
+ elif subset == "open_ended_spanish":
200
+ leaderboard_table_df = open_ended_spanish_df.copy()
201
+ hidden_leader_board_df = open_ended_spanish_df
202
+ elif subset == "closed_ended_multilingual":
203
+ leaderboard_table_df = closed_ended_multilingual_df.copy()
204
+ hidden_leader_board_df = closed_ended_multilingual_df
205
 
206
 
207
  value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
 
311
  demo = gr.Blocks(css=custom_css)
312
  with demo:
313
  print("hello")
314
  gr.HTML(LOGO)
315
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
316
 
317
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
318
  with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
319
+ with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
320
+ LANGUAGES = {
321
+ "🇺🇸 English": "open_ended",
322
+ "🇦🇪 Arabic": "open_ended_arabic",
323
+ "🇫🇷 French": "open_ended_french",
324
+ "🇪🇸 Spanish": "open_ended_spanish",
325
+ "🇵🇹 Portuguese": "open_ended_portuguese",
326
+ "🇷🇴 Romanian": "open_ended_romanian",
327
+ "🇬🇷 Greek": "open_ended_greek",
328
+ }
329
+
330
+ for idx, (label, subset) in enumerate(LANGUAGES.items()):
331
+ with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
332
+ # Custom judge information for each language
333
+ if label == "🇺🇸 English":
334
+ judge_text = "**Note:** Llama 3.1 70B Instruct has been used as judge for English."
335
+ else:
336
+ judge_text = "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
337
+
338
+ gr.Markdown(judge_text, elem_classes="markdown-text")
339
+
340
+ with gr.Row():
341
+ with gr.Column():
342
+ with gr.Row():
343
+ search_bar = gr.Textbox(
344
+ placeholder=f"🔍 Search for your model in {label}...",
345
+ show_label=False,
346
+ elem_id=f"search-bar-{subset}",
347
+ )
348
+ with gr.Row():
349
+ shown_columns = gr.CheckboxGroup(
350
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
351
+ value=[
352
+ c.name
353
+ for c in fields(AutoEvalColumn)
354
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
355
+ ],
356
+ label="Select columns to show",
357
+ elem_id=f"column-select-{subset}",
358
+ interactive=True,
359
+ )
360
+ with gr.Column(min_width=320):
361
+ filter_columns_type = gr.CheckboxGroup(
362
+ label="Model Types",
363
+ choices=[t.to_str() for t in ModelType],
364
+ value=[t.to_str() for t in ModelType],
365
+ interactive=True,
366
+ elem_id=f"filter-columns-type-{subset}",
367
+ )
368
+ filter_domain_specific = gr.CheckboxGroup(
369
+ label="Domain Specificity",
370
+ choices=["🏥 Clinical models", "Generic models"],
371
+ value=["🏥 Clinical models", "Generic models"],
372
+ interactive=True,
373
+ elem_id=f"filter-columns-domain-{subset}",
374
+ )
375
+ filter_columns_size = gr.CheckboxGroup(
376
+ label="Model sizes (in billions of parameters)",
377
+ choices=list(NUMERIC_INTERVALS.keys()),
378
+ value=list(NUMERIC_INTERVALS.keys()),
379
+ interactive=True,
380
+ elem_id=f"filter-columns-size-{subset}",
381
+ )
382
+
383
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset=subset)
384
+
385
+ leaderboard_table = gr.Dataframe(
386
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
387
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
388
+ datatype=TYPES,
389
+ elem_id=f"leaderboard-table-{subset}",
390
+ interactive=False,
391
+ visible=True,
392
  )
393
+
394
+ hidden_leaderboard_table_for_search = gr.Dataframe(
395
+ value=datasets_original_df[OPEN_ENDED_COLS],
396
+ headers=OPEN_ENDED_COLS,
397
+ datatype=TYPES,
398
+ visible=False,
399
  )
400
 
401
+ search_bar.submit(
402
+ update_table,
403
+ [
404
+ hidden_leaderboard_table_for_search,
405
+ shown_columns,
406
+ search_bar,
407
+ filter_columns_type,
408
+ filter_domain_specific,
409
+ filter_columns_size
410
+ ],
411
+ leaderboard_table,
412
+ )
413
 
414
+ for selector in [
415
+ shown_columns,
416
+ filter_columns_type,
417
+ filter_domain_specific,
418
+ filter_columns_size,
419
+ ]:
420
+ selector.change(
421
+ update_table,
422
+ [
423
+ hidden_leaderboard_table_for_search,
424
+ shown_columns,
425
+ search_bar,
426
+ filter_columns_type,
427
+ filter_domain_specific,
428
+ filter_columns_size
429
+ ],
430
+ leaderboard_table,
431
+ queue=True,
432
+ )
433
 
434
+ with gr.Accordion("💬 Generation templates", open=False):
435
+ with gr.Accordion("Response generation", open=False):
436
+ render_generation_templates(task="open_ended", generation_type="response_generation")
437
+ with gr.Accordion("Scoring Rubric", open=False):
438
+ render_generation_templates(task="open_ended", generation_type="scoring_rubric")
439

440
  with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
441
  gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
442
  with gr.Row():
 
554
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
555
  with gr.Accordion("Cross Examination", open=False):
556
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
557
+
558
  with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=3):
559
  gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
560
  with gr.Tabs(elem_classes="tab-buttons2") as tabs:
 
785
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
786
  with gr.Accordion("Cross Examination", open=False):
787
  system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
788
+
789
  with gr.TabItem("🏅 HealthBench", elem_id="llm-benchmark-tab-table", id=4):
790
  gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
791
  with gr.Tabs(elem_classes="tab-buttons2") as tabs:
 
1123
  with gr.Accordion("Scoring Rubric", open=False):
1124
  system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
1125
 
1126
+ with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
1127
+ with gr.Tabs(elem_classes="tab-buttons2") as closed_tabs:
1128
+ # ENGLISH TAB
1129
+ with gr.TabItem("English", elem_id="llm-benchmark-tab-closed-english", id=0):
1130
+ with gr.Row():
1131
+ with gr.Column():
1132
+ with gr.Row():
1133
+ search_bar = gr.Textbox(
1134
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
1135
+ show_label=False,
1136
+ elem_id="search-bar-closed-english",
1137
+ )
1138
+ with gr.Row():
1139
+ shown_columns = gr.CheckboxGroup(
1140
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
1141
+ value=[
1142
+ c.name
1143
+ for c in fields(AutoEvalColumn)
1144
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
1145
+ ],
1146
+ label="Select columns to show",
1147
+ elem_id="column-select-closed-english",
1148
+ interactive=True,
1149
+ )
1150
+ with gr.Column(min_width=320):
1151
+ filter_columns_type = gr.CheckboxGroup(
1152
+ label="Model Types",
1153
+ choices=[t.to_str() for t in ModelType],
1154
+ value=[t.to_str() for t in ModelType],
1155
+ interactive=True,
1156
+ elem_id="filter-columns-type-closed-english",
1157
+ )
1158
+ filter_domain_specific = gr.CheckboxGroup(
1159
+ label="Domain Specificity",
1160
+ choices=["🏥 Clinical models", "Generic models"],
1161
+ value=["🏥 Clinical models", "Generic models"],
1162
+ interactive=True,
1163
+ elem_id="filter-domain-specific-closed-english",
1164
+ )
1165
+ filter_columns_size = gr.CheckboxGroup(
1166
+ label="Model sizes (in billions of parameters)",
1167
+ choices=list(NUMERIC_INTERVALS.keys()),
1168
+ value=list(NUMERIC_INTERVALS.keys()),
1169
+ interactive=True,
1170
+ elem_id="filter-columns-size-closed-english",
1171
+ )
1172
+
1173
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
1174
+ leaderboard_table = gr.components.Dataframe(
1175
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1176
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1177
+ datatype=TYPES,
1178
+ elem_id="leaderboard-table-english",
1179
+ interactive=False,
1180
+ visible=True,
1181
+ )
1182
+
1183
+ # Dummy leaderboard for handling the case when the user uses backspace key
1184
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
1185
+ value=datasets_original_df[DATASET_COLS],
1186
+ headers=DATASET_COLS,
1187
+ datatype=TYPES,
1188
+ visible=False,
1189
+ )
1190
+
1191
+ search_bar.submit(
1192
+ update_table,
1193
+ [
1194
+ hidden_leaderboard_table_for_search,
1195
+ shown_columns,
1196
+ search_bar,
1197
+ filter_columns_type,
1198
+ filter_domain_specific,
1199
+ filter_columns_size
1200
+ ],
1201
+ leaderboard_table,
1202
+ )
1203
+
1204
+ for selector in [
1205
+ shown_columns,
1206
+ filter_columns_type,
1207
+ filter_domain_specific,
1208
+ filter_columns_size,
1209
+ ]:
1210
+ selector.change(
1211
+ update_table,
1212
+ [
1213
+ hidden_leaderboard_table_for_search,
1214
+ shown_columns,
1215
+ search_bar,
1216
+ filter_columns_type,
1217
+ filter_domain_specific,
1218
+ filter_columns_size
1219
+ ],
1220
+ leaderboard_table,
1221
+ queue=True,
1222
+ )
1223
+
1224
+ #MULTILINGUAL TAB - Same level as English tab
1225
+ with gr.TabItem("🌍 Multilingual", elem_id="llm-benchmark-tab-table9", id=1):
1226
+ with gr.Row():
1227
+ gr.Markdown("📊 **Dataset Information:** This tab uses the Global MMLU dataset filtering only the subcategory: medical (10.7%)")
1228
+
1229
+ with gr.Row():
1230
+ with gr.Column():
1231
+ with gr.Row():
1232
+ search_bar = gr.Textbox(
1233
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
1234
+ show_label=False,
1235
+ elem_id="search-bar",
1236
+ )
1237
+
1238
+ with gr.Row():
1239
+ shown_columns = gr.CheckboxGroup(
1240
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
1241
+ value=[
1242
+ c.name
1243
+ for c in fields(AutoEvalColumn)
1244
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)
1245
+ ],
1246
+ label="Select columns to show",
1247
+ elem_id="column-select",
1248
+ interactive=True,
1249
+ )
1250
+ with gr.Column(min_width=320):
1251
+ # with gr.Box(elem_id="box-filter"):
1252
+ filter_columns_type = gr.CheckboxGroup(
1253
+ label="Model Types",
1254
+ choices=[t.to_str() for t in ModelType],
1255
+ value=[t.to_str() for t in ModelType],
1256
+ interactive=True,
1257
+ elem_id="filter-columns-type",
1258
+ )
1259
+ filter_domain_specific = gr.CheckboxGroup(
1260
+ label="Domain Specificity",
1261
+ choices=["🏥 Clinical models", "Generic models"],
1262
+ value=["🏥 Clinical models", "Generic models"],
1263
+ interactive=True,
1264
+ elem_id="filter-columns-type",
1265
+ )
1266
+ filter_columns_size = gr.CheckboxGroup(
1267
+ label="Model sizes (in billions of parameters)",
1268
+ choices=list(NUMERIC_INTERVALS.keys()),
1269
+ value=list(NUMERIC_INTERVALS.keys()),
1270
+ interactive=True,
1271
+ elem_id="filter-columns-size",
1272
+ )
1273
+
1274
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="closed_ended_multilingual")
1275
+ leaderboard_table = gr.components.Dataframe(
1276
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1277
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1278
+ datatype=TYPES,
1279
+ elem_id="leaderboard-table",
1280
+ interactive=False,
1281
+ visible=True,
1282
+ )
1283
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
1284
+ value=datasets_original_df[ClosedEndedMultilingual_COLS],
1285
+ headers=ClosedEndedMultilingual_COLS,
1286
+ datatype=TYPES,
1287
+ visible=False,
1288
+ )
1289
+
1290
+ search_bar.submit(
1291
+ update_table,
1292
+ [
1293
+ hidden_leaderboard_table_for_search,
1294
+ shown_columns,
1295
+ search_bar,
1296
+ filter_columns_type,
1297
+ filter_domain_specific,
1298
+ filter_columns_size
1299
+ # filter_columns_architecture
1300
+ ],
1301
+ leaderboard_table,
1302
+ )
1303
+ for selector in [
1304
+ shown_columns,
1305
+ filter_columns_type,
1306
+ filter_domain_specific,
1307
+ # filter_columns_architecture,
1308
+ filter_columns_size,
1309
+ # deleted_models_visibility,
1310
+ ]:
1311
+ selector.change(
1312
+ update_table,
1313
+ [
1314
+ hidden_leaderboard_table_for_search,
1315
+ shown_columns,
1316
+ search_bar,
1317
+ filter_columns_type,
1318
+ filter_domain_specific,
1319
+ filter_columns_size
1320
+ # filter_columns_architecture,
1321
+ ],
1322
+ leaderboard_table,
1323
+ queue=True,
1324
+ )
1325
+
1326
  with gr.Row():
1327
  with gr.Column():
1328
  with gr.Row():
 
1557
  scheduler = BackgroundScheduler()
1558
  scheduler.add_job(restart_space, "interval", seconds=1800)
1559
  scheduler.start()
1560
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
app_original.py ADDED
@@ -0,0 +1,1069 @@
1
+ import subprocess
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from apscheduler.schedulers.background import BackgroundScheduler
6
+ from huggingface_hub import snapshot_download
7
+
8
+ from src.about import (
9
+ CITATION_BUTTON_LABEL,
10
+ CITATION_BUTTON_TEXT,
11
+ EVALUATION_QUEUE_TEXT,
12
+ INTRODUCTION_TEXT,
13
+ LLM_BENCHMARKS_TEXT_1,
14
+ LLM_BENCHMARKS_TEXT_2,
15
+ CROSS_EVALUATION_METRICS,
16
+ NOTE_GENERATION_METRICS,
17
+ # EVALUATION_EXAMPLE_IMG,
18
+ # LLM_BENCHMARKS_TEXT_2,
19
+ # ENTITY_DISTRIBUTION_IMG,
20
+ # LLM_BENCHMARKS_TEXT_3,
21
+ TITLE,
22
+ LOGO,
23
+ FIVE_PILLAR_DIAGRAM
24
+ )
25
+ from src.display.css_html_js import custom_css
26
+ # changes to be made here
27
+ from src.display.utils import (
28
+ DATASET_BENCHMARK_COLS,
29
+ OPEN_ENDED_BENCHMARK_COLS,
30
+ MED_SAFETY_BENCHMARK_COLS,
31
+ MEDICAL_SUMMARIZATION_BENCHMARK_COLS,
32
+ ACI_BENCHMARK_COLS,
33
+ SOAP_BENCHMARK_COLS,
34
+ #CLOSED_ENDED_ARABIC_BENCHMARK_COLS,
35
+ DATASET_COLS,
36
+ OPEN_ENDED_COLS,
37
+ MED_SAFETY_COLS,
38
+ MEDICAL_SUMMARIZATION_COLS,
39
+ ACI_COLS,
40
+ SOAP_COLS,
41
+ #CLOSED_ENDED_ARABIC_COLS,
42
+ EVAL_COLS,
43
+ EVAL_TYPES,
44
+ NUMERIC_INTERVALS,
45
+ TYPES,
46
+ AutoEvalColumn,
47
+ ModelType,
48
+ ModelArch,
49
+ PromptTemplateName,
50
+ Precision,
51
+ WeightType,
52
+ fields,
53
+ render_generation_templates,
54
+ OpenEndedArabic_COLS,
55
+ OpenEndedArabic_BENCHMARK_COLS,
56
+ OpenEndedFrench_COLS,
57
+ OpenEndedFrench_BENCHMARK_COLS,
58
+ OpenEndedPortuguese_COLS,
59
+ OpenEndedPortuguese_BENCHMARK_COLS,
60
+ OpenEndedRomanian_COLS,
61
+ OpenEndedRomanian_BENCHMARK_COLS,
62
+ OpenEndedGreek_COLS,
63
+ OpenEndedGreek_BENCHMARK_COLS,
64
+ OpenEndedSpanish_COLS,
65
+ OpenEndedSpanish_BENCHMARK_COLS,
66
+ ClosedEndedMultilingual_COLS,
67
+ ClosedEndedMultilingual_BENCHMARK_COLS,
68
+
69
+
70
+ )
71
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN, PRIVATE_REPO
72
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
73
+ from src.submission.submit import add_new_eval, PLACEHOLDER_DATASET_WISE_NORMALIZATION_CONFIG
74
+
75
+ def restart_space():
76
+ API.restart_space(repo_id=REPO_ID)
77
+
78
+
79
+ try:
80
+ print(EVAL_REQUESTS_PATH)
81
+ snapshot_download(
82
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
83
+ )
84
+ except Exception:
85
+ restart_space()
86
+ try:
87
+ print(EVAL_RESULTS_PATH)
88
+ snapshot_download(
89
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
90
+ )
91
+ except Exception:
92
+ restart_space()
93
+
94
+ # Span based results
95
+ # changes to be made here
96
+
97
+ _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
98
+ harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
99
+ print("Closed ended English results loaded")
100
+
101
+ _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
102
+ open_ended_leaderboard_df = open_ended_original_df.copy()
103
+ print("Open ended English results loaded")
104
+
105
+ _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
106
+ med_safety_leaderboard_df = med_safety_original_df.copy()
107
+ print("Med safety results loaded")
108
+
109
+ _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
110
+ medical_summarization_leaderboard_df = medical_summarization_original_df.copy()
111
+ print("Medical summarization results loaded")
112
+
113
+ _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
114
+ aci_leaderboard_df = aci_original_df.copy()
115
+ print("ACI results loaded")
116
+
117
+ _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
118
+ soap_leaderboard_df = soap_original_df.copy()
119
+ print("SOAP results loaded")
120
+
121
+ _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
122
+ _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
123
+ _, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
124
+ _, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
125
+ _, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
126
+ _, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
127
+ print("Open ended multilingual results loaded")
128
+
129
+ _, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")
130
+ print("Closed ended multilingual results loaded")
131
+
132
+
133
+ open_ended_arabic_leaderboard_df = open_ended_arabic_df.copy()
134
+ open_ended_french_leaderboard_df = open_ended_french_df.copy()
135
+ open_ended_portuguese_leaderboard_df = open_ended_portuguese_df.copy()
136
+ open_ended_romanian_leaderboard_df = open_ended_romanian_df.copy()
137
+ open_ended_greek_leaderboard_df = open_ended_greek_df.copy()
138
+ open_ended_spanish_leaderboard_df = open_ended_spanish_df.copy()
139
+ closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
140
+
141
+
142
+ # if PRIVATE_REPO:
143
+ # _, closed_ended_arabic_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, CLOSED_ENDED_ARABIC_COLS, CLOSED_ENDED_ARABIC_BENCHMARK_COLS, "score", "closed_ended_arabic")
144
+ # closed_ended_arabic_leaderboard_df = closed_ended_arabic_original_df.copy()
145
+
146
+ # breakpoint()
147
+ # # Token based results
148
+ # _, token_based_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "datasets")
149
+ # token_based_datasets_leaderboard_df = token_based_datasets_original_df.copy()
150
+
151
+ # _, token_based_types_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, Clinical_TYPES_COLS, TYPES_BENCHMARK_COLS, "TokenBasedWithMacroAverage", "clinical_types")
152
+ # token_based_types_leaderboard_df = token_based_types_original_df.copy()
153
+
154
+
155
+ (
156
+ finished_eval_queue_df,
157
+ running_eval_queue_df,
158
+ pending_eval_queue_df,
159
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
160
+
161
+ breakpoint()
162
+ def update_df(shown_columns, subset="datasets"):
163
+ # changes to be made here
164
+ if subset == "datasets":
165
+ leaderboard_table_df = harness_datasets_leaderboard_df.copy()
166
+ hidden_leader_board_df = harness_datasets_original_df
167
+ elif subset == "open_ended":
168
+ leaderboard_table_df = open_ended_leaderboard_df.copy()
169
+ hidden_leader_board_df = open_ended_original_df
170
+ elif subset == "med_safety":
171
+ leaderboard_table_df = med_safety_leaderboard_df.copy()
172
+ hidden_leader_board_df = med_safety_original_df
173
+ elif subset == "medical_summarization":
174
+ leaderboard_table_df = medical_summarization_leaderboard_df.copy()
175
+ hidden_leader_board_df = medical_summarization_original_df
176
+ elif subset == "aci":
177
+ leaderboard_table_df = aci_leaderboard_df.copy()
178
+ hidden_leader_board_df = aci_original_df
179
+ elif subset == "soap":
180
+ leaderboard_table_df = soap_leaderboard_df.copy()
181
+ hidden_leader_board_df = soap_original_df
182
+ elif subset == "open_ended_arabic":
183
+ leaderboard_table_df = open_ended_arabic_df.copy()
184
+ hidden_leader_board_df = open_ended_arabic_df
185
+ elif subset == "open_ended_french":
186
+ leaderboard_table_df = open_ended_french_df.copy()
187
+ hidden_leader_board_df = open_ended_french_df
188
+ elif subset == "open_ended_portuguese":
189
+ leaderboard_table_df = open_ended_portuguese_df.copy()
190
+ hidden_leader_board_df = open_ended_portuguese_df
191
+ elif subset == "open_ended_romanian":
192
+ leaderboard_table_df = open_ended_romanian_df.copy()
193
+ hidden_leader_board_df = open_ended_romanian_df
194
+ elif subset == "open_ended_greek":
195
+ leaderboard_table_df = open_ended_greek_df.copy()
196
+ hidden_leader_board_df = open_ended_greek_df
197
+ elif subset == "open_ended_spanish":
198
+ leaderboard_table_df = open_ended_spanish_df.copy()
199
+ hidden_leader_board_df = open_ended_spanish_df
200
+ elif subset == "closed_ended_multilingual":
201
+ leaderboard_table_df = closed_ended_multilingual_df.copy()
202
+ hidden_leader_board_df = closed_ended_multilingual_df
203
+
204
+ # else:
205
+ # match evaluation_metric:
206
+ # case "Span Based":
207
+ # leaderboard_table_df = span_based_types_leaderboard_df.copy()
208
+ # hidden_leader_board_df = span_based_types_original_df
209
+ # case "Token Based":
210
+ # leaderboard_table_df = token_based_types_leaderboard_df.copy()
211
+ # hidden_leader_board_df = token_based_types_original_df
212
+ # case _:
213
+ # pass
214
+
215
+
216
+ value_cols = [c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns
217
+ # breakpoint()
218
+ return leaderboard_table_df[value_cols], hidden_leader_board_df
219
+
220
+
221
+ # Searching and filtering
222
+ def update_table(
223
+ hidden_df: pd.DataFrame,
224
+ columns: list,
225
+ query: str = "",
226
+ type_query: list = None,
227
+ domain_specific_query: list = None,
228
+ size_query: list = None,
229
+ precision_query: str = None,
230
+ show_deleted: bool = False,
231
+ ):
232
+ # breakpoint()
233
+ filtered_df = filter_models(hidden_df, type_query, domain_specific_query, size_query, precision_query, show_deleted)
234
+ # breakpoint()
235
+ filtered_df = filter_queries(query, filtered_df)
236
+ # breakpoint()
237
+ df = select_columns(filtered_df, columns, list(hidden_df.columns))
238
+ # breakpoint()
239
+ return df
240
+
241
+
242
+ def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
243
+ return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
244
+
245
+
246
+ def select_columns(df: pd.DataFrame, columns: list, cols:list) -> pd.DataFrame:
247
+ always_here_cols = [
248
+ AutoEvalColumn.model_type_symbol.name,
249
+ AutoEvalColumn.model.name,
250
+ ]
251
+ # We use COLS to maintain sorting
252
+ filtered_df = df[always_here_cols + [c for c in cols if c in df.columns and c in columns]]
253
+ return filtered_df
254
+
255
+
256
+ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
257
+ final_df = []
258
+ if query != "":
259
+ queries = [q.strip() for q in query.split(";")]
260
+ for _q in queries:
261
+ _q = _q.strip()
262
+ if _q != "":
263
+ temp_filtered_df = search_table(filtered_df, _q)
264
+ if len(temp_filtered_df) > 0:
265
+ final_df.append(temp_filtered_df)
266
+ if len(final_df) > 0:
267
+ filtered_df = pd.concat(final_df)
268
+ filtered_df = filtered_df.drop_duplicates(
269
+ subset=[
270
+ AutoEvalColumn.model.name,
271
+ # AutoEvalColumn.precision.name,
272
+ # AutoEvalColumn.revision.name,
273
+ ]
274
+ )
275
+
276
+ return filtered_df
277
+
278
+
279
+ def filter_models(
280
+ df: pd.DataFrame, type_query: list, domain_specific_query: list, size_query: list, precision_query: list, show_deleted: bool
281
+ ) -> pd.DataFrame:
282
+ # Show all models
283
+ # if show_deleted:
284
+ # filtered_df = df
285
+ # else: # Show only still on the hub models
286
+ # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
287
+
288
+ filtered_df = df
289
+
290
+ if type_query is not None:
291
+ type_name = [t.split(" ")[1] for t in type_query]
292
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type.name].isin(type_name)]
293
+
294
+ if domain_specific_query is not None:
295
+ domain_specifics = []
296
+ if "🏥 Clinical models" in domain_specific_query:
297
+ domain_specifics.append(True)
298
+ if "Generic models" in domain_specific_query:
299
+ domain_specifics.append(False)
300
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]
301
+
302
+ # if architecture_query is not None:
303
+ # arch_types = [t for t in architecture_query]
304
+ # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(arch_types)]
305
+ # # filtered_df = filtered_df.loc[df[AutoEvalColumn.architecture.name].isin(architecture_query + ["None"])]
306
+
307
+ if precision_query is not None:
308
+ if AutoEvalColumn.precision.name in df.columns:
309
+ filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
310
+
311
+ if size_query is not None:
312
+ numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
313
+ params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
314
+ mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
315
+ filtered_df = filtered_df.loc[mask]
316
+
317
+ return filtered_df
318
+
319
+
320
+ demo = gr.Blocks(css=custom_css)
321
+ with demo:
322
+ print("hello")
323
+ gr.HTML(LOGO)
324
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
325
+ with gr.Tabs(elem_classes="tab-buttons") as outer_tabs:
326
+ with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=11):
327
+ with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
328
+ LANGUAGES = {
329
+ "🇺🇸 English": "open_ended",
330
+ "🇦🇪 Arabic": "open_ended_arabic",
331
+ "🇫🇷 French": "open_ended_french",
332
+ "🇪🇸 Spanish": "open_ended_spanish",
333
+ "🇵🇹 Portuguese": "open_ended_portuguese",
334
+ "🇷🇴 Romanian": "open_ended_romanian",
335
+ "🇬🇷 Greek": "open_ended_greek",
336
+ }
337
+
338
+ for idx, (label, subset) in enumerate(LANGUAGES.items()):
339
+ with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
340
+ # Custom judge information for each language
341
+ if label == "🇺🇸 English":
342
+ judge_text = "**Note:** Llama 3.1 70B Instruct has been used as judge for English."
343
+ else:
344
+ judge_text = "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
345
+
346
+ gr.Markdown(judge_text, elem_classes="markdown-text")
347
+
348
+ with gr.Row():
349
+ with gr.Column():
350
+ with gr.Row():
351
+ search_bar = gr.Textbox(
352
+ placeholder=f"🔍 Search for your model in {label}...",
353
+ show_label=False,
354
+ elem_id=f"search-bar-{subset}",
355
+ )
356
+ with gr.Row():
357
+ shown_columns = gr.CheckboxGroup(
358
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
359
+ value=[
360
+ c.name
361
+ for c in fields(AutoEvalColumn)
362
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)
363
+ ],
364
+ label="Select columns to show",
365
+ elem_id=f"column-select-{subset}",
366
+ interactive=True,
367
+ )
368
+ with gr.Column(min_width=320):
369
+ filter_columns_type = gr.CheckboxGroup(
370
+ label="Model Types",
371
+ choices=[t.to_str() for t in ModelType],
372
+ value=[t.to_str() for t in ModelType],
373
+ interactive=True,
374
+ elem_id=f"filter-columns-type-{subset}",
375
+ )
376
+ filter_domain_specific = gr.CheckboxGroup(
377
+ label="Domain Specificity",
378
+ choices=["🏥 Clinical models", "Generic models"],
379
+ value=["🏥 Clinical models", "Generic models"],
380
+ interactive=True,
381
+ elem_id=f"filter-columns-domain-{subset}",
382
+ )
383
+ filter_columns_size = gr.CheckboxGroup(
384
+ label="Model sizes (in billions of parameters)",
385
+ choices=list(NUMERIC_INTERVALS.keys()),
386
+ value=list(NUMERIC_INTERVALS.keys()),
387
+ interactive=True,
388
+ elem_id=f"filter-columns-size-{subset}",
389
+ )
390
+
391
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset=subset)
392
+
393
+ leaderboard_table = gr.Dataframe(
394
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
395
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
396
+ datatype=TYPES,
397
+ elem_id=f"leaderboard-table-{subset}",
398
+ interactive=False,
399
+ visible=True,
400
+ )
401
+
402
+ hidden_leaderboard_table_for_search = gr.Dataframe(
403
+ value=datasets_original_df[OPEN_ENDED_COLS],
404
+ headers=OPEN_ENDED_COLS,
405
+ datatype=TYPES,
406
+ visible=False,
407
+ )
408
+
409
+ search_bar.submit(
410
+ update_table,
411
+ [
412
+ hidden_leaderboard_table_for_search,
413
+ shown_columns,
414
+ search_bar,
415
+ filter_columns_type,
416
+ filter_domain_specific,
417
+ filter_columns_size
418
+ ],
419
+ leaderboard_table,
420
+ )
421
+
422
+ for selector in [
423
+ shown_columns,
424
+ filter_columns_type,
425
+ filter_domain_specific,
426
+ filter_columns_size,
427
+ ]:
428
+ selector.change(
429
+ update_table,
430
+ [
431
+ hidden_leaderboard_table_for_search,
432
+ shown_columns,
433
+ search_bar,
434
+ filter_columns_type,
435
+ filter_domain_specific,
436
+ filter_columns_size
437
+ ],
438
+ leaderboard_table,
439
+ queue=True,
440
+ )
441
+
442
+ with gr.Accordion("💬 Generation templates", open=False):
443
+ with gr.Accordion("Response generation", open=False):
444
+ render_generation_templates(task="open_ended", generation_type="response_generation")
445
+ with gr.Accordion("Scoring Rubric", open=False):
446
+ render_generation_templates(task="open_ended", generation_type="scoring_rubric")
447
+
448
+ with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=2):
449
+ with gr.Row():
450
+ with gr.Column():
451
+ with gr.Row():
452
+ search_bar = gr.Textbox(
453
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
454
+ show_label=False,
455
+ elem_id="search-bar-med-safety",
456
+ )
457
+ with gr.Row():
458
+ shown_columns = gr.CheckboxGroup(
459
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
460
+ value=[
461
+ c.name
462
+ for c in fields(AutoEvalColumn)
463
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)
464
+ ],
465
+ label="Select columns to show",
466
+ elem_id="column-select-med-safety",
467
+ interactive=True,
468
+ )
469
+ with gr.Column(min_width=320):
470
+ filter_columns_type = gr.CheckboxGroup(
471
+ label="Model Types",
472
+ choices=[t.to_str() for t in ModelType],
473
+ value=[t.to_str() for t in ModelType],
474
+ interactive=True,
475
+ elem_id="filter-columns-type-med-safety",
476
+ )
477
+ filter_domain_specific = gr.CheckboxGroup(
478
+ label="Domain Specificity",
479
+ choices=["🏥 Clinical models", "Generic models"],
480
+ value=["🏥 Clinical models", "Generic models"],
481
+ interactive=True,
482
+ elem_id="filter-domain-specific-med-safety",
483
+ )
484
+ filter_columns_size = gr.CheckboxGroup(
485
+ label="Model sizes (in billions of parameters)",
486
+ choices=list(NUMERIC_INTERVALS.keys()),
487
+ value=list(NUMERIC_INTERVALS.keys()),
488
+ interactive=True,
489
+ elem_id="filter-columns-size-med-safety",
490
+ )
491
+
492
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="med_safety")
493
+
494
+ leaderboard_table = gr.Dataframe(
495
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
496
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
497
+ datatype=TYPES,
498
+ elem_id="leaderboard-table-med-safety",
499
+ interactive=False,
500
+ visible=True,
501
+ )
502
+
503
+ hidden_leaderboard_table_for_search = gr.Dataframe(
504
+ value=datasets_original_df[MED_SAFETY_COLS],
505
+ headers=MED_SAFETY_COLS,
506
+ datatype=TYPES,
507
+ visible=False,
508
+ )
509
+
510
+ search_bar.submit(
511
+ update_table,
512
+ [
513
+ hidden_leaderboard_table_for_search,
514
+ shown_columns,
515
+ search_bar,
516
+ filter_columns_type,
517
+ filter_domain_specific,
518
+ filter_columns_size
519
+ ],
520
+ leaderboard_table,
521
+ )
522
+
523
+ for selector in [
524
+ shown_columns,
525
+ filter_columns_type,
526
+ filter_domain_specific,
527
+ filter_columns_size,
528
+ ]:
529
+ selector.change(
530
+ update_table,
531
+ [
532
+ hidden_leaderboard_table_for_search,
533
+ shown_columns,
534
+ search_bar,
535
+ filter_columns_type,
536
+ filter_domain_specific,
537
+ filter_columns_size
538
+ ],
539
+ leaderboard_table,
540
+ queue=True,
541
+ )
542
+
543
+ with gr.Accordion("💬 Generation templates", open=False):
544
+ with gr.Accordion("Response generation", open=False):
545
+ system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="response_generation")
546
+ with gr.Accordion("Scoring Rubric", open=False):
547
+ system_prompt, user_prompt = render_generation_templates(task="med_safety", generation_type="scoring_rubric")
548
+
549
+ with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=3):
550
+ gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
551
+ with gr.Row():
552
+ with gr.Column():
553
+ with gr.Row():
554
+ search_bar = gr.Textbox(
555
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
556
+ show_label=False,
557
+ elem_id="search-bar-med-summarization",
558
+ )
559
+ with gr.Row():
560
+ shown_columns = gr.CheckboxGroup(
561
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
562
+ value=[
563
+ c.name
564
+ for c in fields(AutoEvalColumn)
565
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)
566
+ ],
567
+ label="Select columns to show",
568
+ elem_id="column-select-med-summarization",
569
+ interactive=True,
570
+ )
571
+ with gr.Column(min_width=320):
572
+ filter_columns_type = gr.CheckboxGroup(
573
+ label="Model Types",
574
+ choices=[t.to_str() for t in ModelType],
575
+ value=[t.to_str() for t in ModelType],
576
+ interactive=True,
577
+ elem_id="filter-columns-type-med-summarization",
578
+ )
579
+ filter_domain_specific = gr.CheckboxGroup(
580
+ label="Domain Specificity",
581
+ choices=["🏥 Clinical models", "Generic models"],
582
+ value=["🏥 Clinical models", "Generic models"],
583
+ interactive=True,
584
+ elem_id="filter-domain-specific-med-summarization",
585
+ )
586
+ filter_columns_size = gr.CheckboxGroup(
587
+ label="Model sizes (in billions of parameters)",
588
+ choices=list(NUMERIC_INTERVALS.keys()),
589
+ value=list(NUMERIC_INTERVALS.keys()),
590
+ interactive=True,
591
+ elem_id="filter-columns-size-med-summarization",
592
+ )
593
+
594
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="medical_summarization")
595
+
596
+ leaderboard_table = gr.Dataframe(
597
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
598
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
599
+ datatype=TYPES,
600
+ elem_id="leaderboard-table-med-summarization",
601
+ interactive=False,
602
+ visible=True,
603
+ )
604
+
605
+ hidden_leaderboard_table_for_search = gr.Dataframe(
606
+ value=datasets_original_df[MEDICAL_SUMMARIZATION_COLS],
607
+ headers=MEDICAL_SUMMARIZATION_COLS,
608
+ datatype=TYPES,
609
+ visible=False,
610
+ )
611
+
612
+ search_bar.submit(
613
+ update_table,
614
+ [
615
+ hidden_leaderboard_table_for_search,
616
+ shown_columns,
617
+ search_bar,
618
+ filter_columns_type,
619
+ filter_domain_specific,
620
+ filter_columns_size
621
+ ],
622
+ leaderboard_table,
623
+ )
624
+
625
+ for selector in [
626
+ shown_columns,
627
+ filter_columns_type,
628
+ filter_domain_specific,
629
+ filter_columns_size,
630
+ ]:
631
+ selector.change(
632
+ update_table,
633
+ [
634
+ hidden_leaderboard_table_for_search,
635
+ shown_columns,
636
+ search_bar,
637
+ filter_columns_type,
638
+ filter_domain_specific,
639
+ filter_columns_size
640
+ ],
641
+ leaderboard_table,
642
+ queue=True,
643
+ )
644
+
645
+ with gr.Accordion("💬 Generation templates", open=False):
646
+ with gr.Accordion("Response generation", open=False):
647
+ system_prompt, user_prompt = render_generation_templates(task="medical_summarization", generation_type="response_generation")
648
+ with gr.Accordion("Question generation", open=False):
649
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
650
+ with gr.Accordion("Cross Examination", open=False):
651
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
652
+
653
+ with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=4):
654
+ gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
655
+ with gr.Tabs(elem_classes="tab-buttons2") as note_tabs:
656
+ with gr.TabItem("ACI Bench", elem_id="llm-benchmark-tab-aci", id=0):
657
+ with gr.Row():
658
+ with gr.Column():
659
+ with gr.Row():
660
+ search_bar = gr.Textbox(
661
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
662
+ show_label=False,
663
+ elem_id="search-bar-aci",
664
+ )
665
+ with gr.Row():
666
+ shown_columns = gr.CheckboxGroup(
667
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
668
+ value=[
669
+ c.name
670
+ for c in fields(AutoEvalColumn)
671
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)
672
+ ],
673
+ label="Select columns to show",
674
+ elem_id="column-select-aci",
675
+ interactive=True,
676
+ )
677
+ with gr.Column(min_width=320):
678
+ filter_columns_type = gr.CheckboxGroup(
679
+ label="Model Types",
680
+ choices=[t.to_str() for t in ModelType],
681
+ value=[t.to_str() for t in ModelType],
682
+ interactive=True,
683
+ elem_id="filter-columns-type-aci",
684
+ )
685
+ filter_domain_specific = gr.CheckboxGroup(
686
+ label="Domain Specificity",
687
+ choices=["🏥 Clinical models", "Generic models"],
688
+ value=["🏥 Clinical models", "Generic models"],
689
+ interactive=True,
690
+ elem_id="filter-domain-specific-aci",
691
+ )
692
+ filter_columns_size = gr.CheckboxGroup(
693
+ label="Model sizes (in billions of parameters)",
694
+ choices=list(NUMERIC_INTERVALS.keys()),
695
+ value=list(NUMERIC_INTERVALS.keys()),
696
+ interactive=True,
697
+ elem_id="filter-columns-size-aci",
698
+ )
699
+
700
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="aci")
701
+
702
+ leaderboard_table = gr.Dataframe(
703
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
704
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
705
+ datatype=TYPES,
706
+ elem_id="leaderboard-table-aci",
707
+ interactive=False,
708
+ visible=True,
709
+ )
710
+
711
+ hidden_leaderboard_table_for_search = gr.Dataframe(
712
+ value=datasets_original_df[ACI_COLS],
713
+ headers=ACI_COLS,
714
+ datatype=TYPES,
715
+ visible=False,
716
+ )
717
+
718
+ search_bar.submit(
719
+ update_table,
720
+ [
721
+ hidden_leaderboard_table_for_search,
722
+ shown_columns,
723
+ search_bar,
724
+ filter_columns_type,
725
+ filter_domain_specific,
726
+ filter_columns_size
727
+ ],
728
+ leaderboard_table,
729
+ )
730
+
731
+ for selector in [
732
+ shown_columns,
733
+ filter_columns_type,
734
+ filter_domain_specific,
735
+ filter_columns_size,
736
+ ]:
737
+ selector.change(
738
+ update_table,
739
+ [
740
+ hidden_leaderboard_table_for_search,
741
+ shown_columns,
742
+ search_bar,
743
+ filter_columns_type,
744
+ filter_domain_specific,
745
+ filter_columns_size
746
+ ],
747
+ leaderboard_table,
748
+ queue=True,
749
+ )
750
+
751
+ with gr.TabItem("SOAP Notes", elem_id="llm-benchmark-tab-soap", id=1):
752
+ with gr.Row():
753
+ with gr.Column():
754
+ with gr.Row():
755
+ search_bar = gr.Textbox(
756
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
757
+ show_label=False,
758
+ elem_id="search-bar-soap",
759
+ )
760
+ with gr.Row():
761
+ shown_columns = gr.CheckboxGroup(
762
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
763
+ value=[
764
+ c.name
765
+ for c in fields(AutoEvalColumn)
766
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)
767
+ ],
768
+ label="Select columns to show",
769
+ elem_id="column-select-soap",
770
+ interactive=True,
771
+ )
772
+ with gr.Column(min_width=320):
773
+ filter_columns_type = gr.CheckboxGroup(
774
+ label="Model Types",
775
+ choices=[t.to_str() for t in ModelType],
776
+ value=[t.to_str() for t in ModelType],
777
+ interactive=True,
778
+ elem_id="filter-columns-type-soap",
779
+ )
780
+ filter_domain_specific = gr.CheckboxGroup(
781
+ label="Domain Specificity",
782
+ choices=["🏥 Clinical models", "Generic models"],
783
+ value=["🏥 Clinical models", "Generic models"],
784
+ interactive=True,
785
+ elem_id="filter-domain-specific-soap",
786
+ )
787
+ filter_columns_size = gr.CheckboxGroup(
788
+ label="Model sizes (in billions of parameters)",
789
+ choices=list(NUMERIC_INTERVALS.keys()),
790
+ value=list(NUMERIC_INTERVALS.keys()),
791
+ interactive=True,
792
+ elem_id="filter-columns-size-soap",
793
+ )
794
+
795
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="soap")
796
+
797
+ leaderboard_table = gr.Dataframe(
798
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
799
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
800
+ datatype=TYPES,
801
+ elem_id="leaderboard-table-soap",
802
+ interactive=False,
803
+ visible=True,
804
+ )
805
+
806
+ hidden_leaderboard_table_for_search = gr.Dataframe(
807
+ value=datasets_original_df[SOAP_COLS],
808
+ headers=SOAP_COLS,
809
+ datatype=TYPES,
810
+ visible=False,
811
+ )
812
+
813
+ search_bar.submit(
814
+ update_table,
815
+ [
816
+ hidden_leaderboard_table_for_search,
817
+ shown_columns,
818
+ search_bar,
819
+ filter_columns_type,
820
+ filter_domain_specific,
821
+ filter_columns_size
822
+ ],
823
+ leaderboard_table,
824
+ )
825
+
826
+ for selector in [
827
+ shown_columns,
828
+ filter_columns_type,
829
+ filter_domain_specific,
830
+ filter_columns_size,
831
+ ]:
832
+ selector.change(
833
+ update_table,
834
+ [
835
+ hidden_leaderboard_table_for_search,
836
+ shown_columns,
837
+ search_bar,
838
+ filter_columns_type,
839
+ filter_domain_specific,
840
+ filter_columns_size
841
+ ],
842
+ leaderboard_table,
843
+ queue=True,
844
+ )
845
+
846
+ with gr.Accordion("💬 Generation templates", open=False):
847
+ with gr.Accordion("ACI-Bench Response generation", open=False):
848
+ system_prompt, user_prompt = render_generation_templates(task="aci", generation_type="response_generation")
849
+ with gr.Accordion("SOAP Notes Response generation", open=False):
850
+ system_prompt, user_prompt = render_generation_templates(task="soap", generation_type="response_generation")
851
+ with gr.Accordion("Question generation", open=False):
852
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="question_generation")
853
+ with gr.Accordion("Cross Examination", open=False):
854
+ system_prompt, user_prompt = render_generation_templates(task="ce", generation_type="cross_examination")
855
+
856
+ with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
857
+ with gr.Tabs(elem_classes="tab-buttons2") as closed_tabs:
858
+ # ENGLISH TAB
859
+ with gr.TabItem("English", elem_id="llm-benchmark-tab-closed-english", id=0):
860
+ with gr.Row():
861
+ with gr.Column():
862
+ with gr.Row():
863
+ search_bar = gr.Textbox(
864
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
865
+ show_label=False,
866
+ elem_id="search-bar-closed-english",
867
+ )
868
+ with gr.Row():
869
+ shown_columns = gr.CheckboxGroup(
870
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
871
+ value=[
872
+ c.name
873
+ for c in fields(AutoEvalColumn)
874
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
875
+ ],
876
+ label="Select columns to show",
877
+ elem_id="column-select-closed-english",
878
+ interactive=True,
879
+ )
880
+ with gr.Column(min_width=320):
881
+ filter_columns_type = gr.CheckboxGroup(
882
+ label="Model Types",
883
+ choices=[t.to_str() for t in ModelType],
884
+ value=[t.to_str() for t in ModelType],
885
+ interactive=True,
886
+ elem_id="filter-columns-type-closed-english",
887
+ )
888
+ filter_domain_specific = gr.CheckboxGroup(
889
+ label="Domain Specificity",
890
+ choices=["🏥 Clinical models", "Generic models"],
891
+ value=["🏥 Clinical models", "Generic models"],
892
+ interactive=True,
893
+ elem_id="filter-domain-specific-closed-english",
894
+ )
895
+ filter_columns_size = gr.CheckboxGroup(
896
+ label="Model sizes (in billions of parameters)",
897
+ choices=list(NUMERIC_INTERVALS.keys()),
898
+ value=list(NUMERIC_INTERVALS.keys()),
899
+ interactive=True,
900
+ elem_id="filter-columns-size-closed-english",
901
+ )
902
+
903
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
904
+ leaderboard_table = gr.components.Dataframe(
905
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
906
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
907
+ datatype=TYPES,
908
+ elem_id="leaderboard-table-english",
909
+ interactive=False,
910
+ visible=True,
911
+ )
912
+
913
+ # Dummy leaderboard for handling the case when the user presses the backspace key
914
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
915
+ value=datasets_original_df[DATASET_COLS],
916
+ headers=DATASET_COLS,
917
+ datatype=TYPES,
918
+ visible=False,
919
+ )
920
+
921
+ search_bar.submit(
922
+ update_table,
923
+ [
924
+ hidden_leaderboard_table_for_search,
925
+ shown_columns,
926
+ search_bar,
927
+ filter_columns_type,
928
+ filter_domain_specific,
929
+ filter_columns_size
930
+ ],
931
+ leaderboard_table,
932
+ )
933
+
934
+ for selector in [
935
+ shown_columns,
936
+ filter_columns_type,
937
+ filter_domain_specific,
938
+ filter_columns_size,
939
+ ]:
940
+ selector.change(
941
+ update_table,
942
+ [
943
+ hidden_leaderboard_table_for_search,
944
+ shown_columns,
945
+ search_bar,
946
+ filter_columns_type,
947
+ filter_domain_specific,
948
+ filter_columns_size
949
+ ],
950
+ leaderboard_table,
951
+ queue=True,
952
+ )
953
+
954
+ #MULTILINGUAL TAB - Same level as English tab
955
+ with gr.TabItem("🌍 Multilingual", elem_id="llm-benchmark-tab-table9", id=1):
956
+ with gr.Row():
957
+ gr.Markdown("📊 **Dataset Information:** This tab uses the Global MMLU dataset filtering only the subcategory: medical (10.7%)")
958
+
959
+ with gr.Row():
960
+ with gr.Column():
961
+ with gr.Row():
962
+ search_bar = gr.Textbox(
963
+ placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
964
+ show_label=False,
965
+ elem_id="search-bar",
966
+ )
967
+
968
+ with gr.Row():
969
+ shown_columns = gr.CheckboxGroup(
970
+ choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
971
+ value=[
972
+ c.name
973
+ for c in fields(AutoEvalColumn)
974
+ if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)
975
+ ],
976
+ label="Select columns to show",
977
+ elem_id="column-select",
978
+ interactive=True,
979
+ )
980
+ with gr.Column(min_width=320):
981
+ # with gr.Box(elem_id="box-filter"):
982
+ filter_columns_type = gr.CheckboxGroup(
983
+ label="Model Types",
984
+ choices=[t.to_str() for t in ModelType],
985
+ value=[t.to_str() for t in ModelType],
986
+ interactive=True,
987
+ elem_id="filter-columns-type",
988
+ )
989
+ filter_domain_specific = gr.CheckboxGroup(
990
+ label="Domain Specificity",
991
+ choices=["🏥 Clinical models", "Generic models"],
992
+ value=["🏥 Clinical models", "Generic models"],
993
+ interactive=True,
994
+ elem_id="filter-columns-type",
995
+ )
996
+ filter_columns_size = gr.CheckboxGroup(
997
+ label="Model sizes (in billions of parameters)",
998
+ choices=list(NUMERIC_INTERVALS.keys()),
999
+ value=list(NUMERIC_INTERVALS.keys()),
1000
+ interactive=True,
1001
+ elem_id="filter-columns-size",
1002
+ )
1003
+
1004
+ datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="closed_ended_multilingual")
1005
+ leaderboard_table = gr.components.Dataframe(
1006
+ value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
1007
+ headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
1008
+ datatype=TYPES,
1009
+ elem_id="leaderboard-table",
1010
+ interactive=False,
1011
+ visible=True,
1012
+ )
1013
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
1014
+ value=datasets_original_df[ClosedEndedMultilingual_COLS],
1015
+ headers=ClosedEndedMultilingual_COLS,
1016
+ datatype=TYPES,
1017
+ visible=False,
1018
+ )
1019
+
1020
+ search_bar.submit(
1021
+ update_table,
1022
+ [
1023
+ hidden_leaderboard_table_for_search,
1024
+ shown_columns,
1025
+ search_bar,
1026
+ filter_columns_type,
1027
+ filter_domain_specific,
1028
+ filter_columns_size
1029
+ # filter_columns_architecture
1030
+ ],
1031
+ leaderboard_table,
1032
+ )
1033
+ for selector in [
1034
+ shown_columns,
1035
+ filter_columns_type,
1036
+ filter_domain_specific,
1037
+ # filter_columns_architecture,
1038
+ filter_columns_size,
1039
+ # deleted_models_visibility,
1040
+ ]:
1041
+ selector.change(
1042
+ update_table,
1043
+ [
1044
+ hidden_leaderboard_table_for_search,
1045
+ shown_columns,
1046
+ search_bar,
1047
+ filter_columns_type,
1048
+ filter_domain_specific,
1049
+ filter_columns_size
1050
+ # filter_columns_architecture,
1051
+ ],
1052
+ leaderboard_table,
1053
+ queue=True,
1054
+ )
1055
+
1056
+ with gr.Row():
1057
+ with gr.Accordion("📙 Citation", open=False):
1058
+ citation_button = gr.Textbox(
1059
+ value=CITATION_BUTTON_TEXT,
1060
+ label=CITATION_BUTTON_LABEL,
1061
+ lines=20,
1062
+ elem_id="citation-button",
1063
+ show_copy_button=True,
1064
+ )
1065
+
1066
+ scheduler = BackgroundScheduler()
1067
+ scheduler.add_job(restart_space, "interval", seconds=1800)
1068
+ scheduler.start()
1069
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'])
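Every tab wired above repeats the same pattern: a search bar, a column selector, three filter checkbox groups, a visible leaderboard Dataframe, a hidden Dataframe used to restore rows when the search is cleared, and update_table registered on search_bar.submit and on every filter's change event. A minimal sketch of that pattern as one reusable helper; build_leaderboard_tab and its signature are hypothetical (not part of this commit), while gr, fields, AutoEvalColumn, ModelType, NUMERIC_INTERVALS, TYPES, update_df and update_table are names app.py already imports or defines:

def build_leaderboard_tab(subset, hidden_cols, col_predicate, elem_suffix):
    # Hypothetical helper; must be called inside a gr.Blocks / gr.TabItem context.
    with gr.Row():
        with gr.Column():
            search_bar = gr.Textbox(
                placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
                show_label=False,
                elem_id=f"search-bar-{elem_suffix}",
            )
            shown_columns = gr.CheckboxGroup(
                choices=[c.name for c in fields(AutoEvalColumn)
                         if not c.hidden and not c.never_hidden and (c.invariant or col_predicate(c))],
                value=[c.name for c in fields(AutoEvalColumn)
                       if c.displayed_by_default and not c.hidden and not c.never_hidden
                       and (c.invariant or col_predicate(c))],
                label="Select columns to show",
                elem_id=f"column-select-{elem_suffix}",
                interactive=True,
            )
        with gr.Column(min_width=320):
            filter_columns_type = gr.CheckboxGroup(
                label="Model Types",
                choices=[t.to_str() for t in ModelType],
                value=[t.to_str() for t in ModelType],
                interactive=True,
                elem_id=f"filter-columns-type-{elem_suffix}",
            )
            filter_domain_specific = gr.CheckboxGroup(
                label="Domain Specificity",
                choices=["🏥 Clinical models", "Generic models"],
                value=["🏥 Clinical models", "Generic models"],
                interactive=True,
                elem_id=f"filter-domain-specific-{elem_suffix}",
            )
            filter_columns_size = gr.CheckboxGroup(
                label="Model sizes (in billions of parameters)",
                choices=list(NUMERIC_INTERVALS.keys()),
                value=list(NUMERIC_INTERVALS.keys()),
                interactive=True,
                elem_id=f"filter-columns-size-{elem_suffix}",
            )

    shown_df, original_df = update_df(shown_columns.value, subset=subset)
    never_hidden = [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
    leaderboard_table = gr.Dataframe(
        value=shown_df[never_hidden + shown_columns.value],
        headers=never_hidden + shown_columns.value,
        datatype=TYPES,
        elem_id=f"leaderboard-table-{elem_suffix}",
        interactive=False,
        visible=True,
    )
    # Hidden copy used to restore the full table when the search box is emptied.
    hidden_table = gr.Dataframe(value=original_df[hidden_cols], headers=hidden_cols, datatype=TYPES, visible=False)

    inputs = [hidden_table, shown_columns, search_bar, filter_columns_type, filter_domain_specific, filter_columns_size]
    search_bar.submit(update_table, inputs, leaderboard_table)
    for selector in (shown_columns, filter_columns_type, filter_domain_specific, filter_columns_size):
        selector.change(update_table, inputs, leaderboard_table, queue=True)
    return leaderboard_table

With such a helper, the ACI Bench tab above would reduce to build_leaderboard_tab("aci", ACI_COLS, lambda c: c.aci_col, "aci"); the commit keeps the explicit inline wiring instead.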
src/about.py CHANGED
@@ -40,6 +40,77 @@ class OpenEndedColumns(Enum):
40
  column3 = OpenEndedColumn("Score_intervals", "score", "Score 95% CI")
41
  # changes to be made here
42
 
 
 
43
  @dataclass
44
  class MedSafetyColumn:
45
  benchmark: str
 
40
  column3 = OpenEndedColumn("Score_intervals", "score", "Score 95% CI")
41
  # changes to be made here
42
 
43
+
44
+ @dataclass
45
+ class OpenEndedMultilingualColumn:
46
+ benchmark: str
47
+ metric: str
48
+ col_name: str
49
+
50
+ class OpenEndedArabicColumn(Enum):
51
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
52
+ arabic_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
53
+ arabic_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
54
+ arabic_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
55
+ arabic_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
56
+
57
+
58
+ class OpenEndedFrenchColumn(Enum):
59
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
60
+ french_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
61
+ french_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
62
+ french_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
63
+ french_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
64
+
65
+
66
+ class OpenEndedSpanishColumn(Enum):
67
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
68
+ spanish_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
69
+ spanish_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
70
+ spanish_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
71
+ spanish_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
72
+
73
+
74
+ class OpenEndedPortugueseColumn(Enum):
75
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
76
+ porto_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
77
+ porto_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
78
+ porto_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
79
+ porto_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
80
+
81
+
82
+ class OpenEndedRomanianColumn(Enum):
83
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
84
+ rom_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
85
+ rom_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
86
+ rom_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
87
+ rom_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
88
+
89
+
90
+ class OpenEndedGreekColumn(Enum):
91
+ # task_key in the json file, metric_key in the json file, name to display in the leaderboard
92
+ greek_column0 = OpenEndedMultilingualColumn("ELO", "score", "ELO")
93
+ greek_column1 = OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI")
94
+ greek_column2 = OpenEndedMultilingualColumn("Score", "score", "Score")
95
+ greek_column3 = OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI")
96
+
97
+
98
+
99
+ @dataclass
100
+ class ClosedEndedMultilingualColumn:
101
+ benchmark: str
102
+ metric: str
103
+ col_name: str
104
+
105
+
106
+ class ClosedEndedMultilingualColumns(Enum):
107
+ mtask0 = ClosedEndedMultilingualColumn("Global-MMLU-Arabic", "accuracy", "🇦🇪Arabic")
108
+ mtask1 = ClosedEndedMultilingualColumn("Global-MMLU-French", "accuracy", "🇫🇷French")
109
+ mtask2 = ClosedEndedMultilingualColumn("Global-MMLU-Spanish", "accuracy", "🇪🇸Spanish")
110
+ mtask3 = ClosedEndedMultilingualColumn("Global-MMLU-Portuguese", "accuracy", "🇵🇹Portuguese")
111
+ mtask4 = ClosedEndedMultilingualColumn("Global-MMLU-Romanian", "accuracy", "🇷🇴Romanian")
112
+ mtask5 = ClosedEndedMultilingualColumn("Global-MMLU-Greek", "accuracy", "🇬🇷Greek")
113
+
114
  @dataclass
115
  class MedSafetyColumn:
116
  benchmark: str
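The six OpenEnded*Column enums above differ only in their member names; every language carries the same four columns (ELO, ELO 95% CI, Score, Score 95% CI). A hedged sketch of how equivalent enums could be produced by a small factory; make_open_ended_columns is hypothetical and the commit defines each enum explicitly, which keeps the members greppable:

from dataclasses import dataclass
from enum import Enum


@dataclass
class OpenEndedMultilingualColumn:
    benchmark: str
    metric: str
    col_name: str


def make_open_ended_columns(prefix):
    # Hypothetical factory: one Enum per language, each with the same four columns.
    members = {
        f"{prefix}_column0": OpenEndedMultilingualColumn("ELO", "score", "ELO"),
        f"{prefix}_column1": OpenEndedMultilingualColumn("ELO_intervals", "score", "ELO 95% CI"),
        f"{prefix}_column2": OpenEndedMultilingualColumn("Score", "score", "Score"),
        f"{prefix}_column3": OpenEndedMultilingualColumn("Score_intervals", "score", "Score 95% CI"),
    }
    return Enum(f"OpenEnded{prefix.capitalize()}Column", members)


OpenEndedArabicColumn = make_open_ended_columns("arabic")
OpenEndedFrenchColumn = make_open_ended_columns("french")
# ...and likewise for Spanish, Portuguese, Romanian and Greek.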
src/display/utils.py CHANGED
@@ -4,7 +4,7 @@ from enum import Enum
4
  import pandas as pd
5
 
6
  # changes to be made here
7
- from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns
8
  from src.envs import PRIVATE_REPO
9
  import json
10
  import gradio as gr
@@ -34,16 +34,22 @@ class ColumnContent:
34
  closed_ended_arabic_col: bool = False
35
  healthbench_col: bool = False
36
  healthbench_hard_col: bool = False
 
 
37
 
38
 
39
  ## Leaderboard columns
40
- auto_eval_column_dict = []
41
  # Init
42
  auto_eval_column_dict = []
43
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
44
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
45
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
46
- auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, closed_ended_arabic_col=True, invariant=False)])
47
  auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
48
  for task in HarnessTasks:
49
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
@@ -72,6 +78,20 @@ for column in HealthbenchHardColumns:
72
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, healthbench_hard_col=True, invariant=False)])
73
  else:
74
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, healthbench_hard_col=True, invariant=False)])
 
75
 
76
  auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
77
  auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
@@ -234,6 +254,27 @@ HEALTHBENCH_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (
234
  HEALTHBENCH_HARD_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.healthbench_hard_col or c.invariant)]
235
 
236
 
 
 
237
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
238
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
239
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
@@ -251,6 +292,18 @@ SOAP_BENCHMARK_COLS = [t.value.col_name for t in SOAPColumns]
251
  HEALTHBENCH_BENCHMARK_COLS = [t.value.col_name for t in HealthbenchColumns]
252
  HEALTHBENCH_HARD_BENCHMARK_COLS = [t.value.col_name for t in HealthbenchHardColumns]
253
 
 
 
254
  NUMERIC_INTERVALS = {
255
  "?": pd.Interval(-100, 0, closed="right"),
256
  "~1.5": pd.Interval(0, 2, closed="right"),
 
4
  import pandas as pd
5
 
6
  # changes to be made here
7
+ from src.about import HarnessTasks, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn
8
  from src.envs import PRIVATE_REPO
9
  import json
10
  import gradio as gr
 
34
  closed_ended_arabic_col: bool = False
35
  healthbench_col: bool = False
36
  healthbench_hard_col: bool = False
37
+ open_ended_arabic_col: bool = False
38
+ open_ended_french_col: bool = False
39
+ open_ended_spanish_col: bool = False
40
+ open_ended_portuguese_col: bool = False
41
+ open_ended_romanian_col: bool = False
42
+ open_ended_greek_col: bool = False
43
+ closed_ended_multilingual_col: bool = False
44
 
45
 
46
  ## Leaderboard columns
 
47
  # Init
48
  auto_eval_column_dict = []
49
  auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
50
  auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
51
  auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, True)])
52
+ auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average", "number", True, False, dataset_task_col=True, closed_ended_multilingual_col=True, invariant=False)])
53
  auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall Score", "number", True, False, medical_summarization_col=True, aci_col=True, soap_col=True, invariant=False)])
54
  for task in HarnessTasks:
55
  auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True, False, dataset_task_col=True, invariant=False)])
 
78
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", False, False, healthbench_hard_col=True, invariant=False)])
79
  else:
80
  auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, healthbench_hard_col=True, invariant=False)])
81
+ for column in OpenEndedArabicColumn:
82
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_arabic_col=True, invariant=False)])
83
+ for column in OpenEndedFrenchColumn:
84
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_french_col=True, invariant=False)])
85
+ for column in OpenEndedSpanishColumn:
86
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_spanish_col=True, invariant=False)])
87
+ for column in OpenEndedPortugueseColumn:
88
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_portuguese_col=True, invariant=False)])
89
+ for column in OpenEndedRomanianColumn:
90
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_romanian_col=True, invariant=False)])
91
+ for column in OpenEndedGreekColumn:
92
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, open_ended_greek_col=True, invariant=False)])
93
+ for column in ClosedEndedMultilingualColumns:
94
+ auto_eval_column_dict.append([column.name, ColumnContent, ColumnContent(column.value.col_name, "number", True, False, closed_ended_multilingual_col=True, invariant=False)])
95
 
96
  auto_eval_column_dict.append(["is_domain_specific", ColumnContent, ColumnContent("Is Domain Specific", "bool", False)])
97
  auto_eval_column_dict.append(["use_chat_template", ColumnContent, ColumnContent("Uses Chat Template", "bool", False)])
 
254
  HEALTHBENCH_HARD_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.healthbench_hard_col or c.invariant)]
255
 
256
 
257
+ OpenEndedArabic_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_arabic_col or c.invariant)]
258
+ OpenEndedFrench_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_french_col or c.invariant)]
259
+ OpenEndedSpanish_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_spanish_col or c.invariant)]
260
+ OpenEndedPortuguese_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_portuguese_col or c.invariant)]
261
+ OpenEndedRomanian_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_romanian_col or c.invariant)]
262
+ OpenEndedGreek_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.open_ended_greek_col or c.invariant)]
263
+
264
+
265
+
266
+ ClosedEndedMultilingual_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_multilingual_col or c.invariant)]
267
+
268
+
269
+
270
+ # if PRIVATE_REPO:
271
+ #CLOSED_ENDED_ARABIC_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.closed_ended_arabic_col or c.invariant)]
272
+ # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and (c.cross_examination_col or c.invariant)]
273
+ # DATASET_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.cross_examination_col]
274
+ # OPEN_ENDED_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.dataset_task_col and not c.med_safety_col and not c.cross_examination_col]
275
+ # MED_SAFETY_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.dataset_task_col and not c.cross_examination_col]
276
+ # CROSS_EXAMINATION_COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.open_ended_col and not c.med_safety_col and not c.dataset_task_col]
277
+
278
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
279
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
280
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
292
  HEALTHBENCH_BENCHMARK_COLS = [t.value.col_name for t in HealthbenchColumns]
293
  HEALTHBENCH_HARD_BENCHMARK_COLS = [t.value.col_name for t in HealthbenchHardColumns]
294
 
295
+
296
+ #changed this
297
+ OpenEndedArabic_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedArabicColumn]
298
+ OpenEndedFrench_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedFrenchColumn]
299
+ OpenEndedPortuguese_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedPortugueseColumn]
300
+ OpenEndedSpanish_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedSpanishColumn]
301
+ OpenEndedRomanian_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedRomanianColumn]
302
+ OpenEndedGreek_BENCHMARK_COLS = [t.value.col_name for t in OpenEndedGreekColumn]
303
+
304
+
305
+ ClosedEndedMultilingual_BENCHMARK_COLS = [t.value.col_name for t in ClosedEndedMultilingualColumns]
306
+
307
  NUMERIC_INTERVALS = {
308
  "?": pd.Interval(-100, 0, closed="right"),
309
  "~1.5": pd.Interval(0, 2, closed="right"),
src/leaderboard/instr.txt ADDED
@@ -0,0 +1,16 @@
 
 
1
+ In about.py:
2
+ The flow goes from app.py to read_evals, to utils, to about (which defines the tasks and the columns: for closed-ended, define the languages; for open-ended, reuse the same code with 95% CI, ELO rating, ...).
3
+ Define a class for open-ended multilingual (six times, once per language) and one for closed-ended multilingual (Global-MMLU).
4
+ Six column sets for open-ended, and a separate one for multilingual.
5
+
6
+ In utils.py:
7
+
8
+ Define the columns for the languages again (the hidden flags do not matter here, but they must be declared at the start).
9
+
10
+ In read_evals.py:
11
+
12
+ Definition of the results for the data frames, and the definition of the int
13
+
14
+ For the front end:
15
+
16
+ In app.py, add a gr.TabItem for each open-ended language, following the HealthBench tabs, and add the languages with the same logic as "ALL".
src/leaderboard/read_evals.py CHANGED
@@ -9,7 +9,7 @@ import numpy as np
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
- from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns
13
  from src.submission.check_validity import is_model_on_hub
14
  from src.envs import PRIVATE_REPO
15
 
@@ -31,6 +31,13 @@ class EvalResult:
31
  soap_results: dict
32
  healthbench_results: dict
33
  healthbench_hard_results: dict
 
 
34
  is_domain_specific: bool
35
  use_chat_template: bool
36
  # clinical_type_results:dict
@@ -108,7 +115,7 @@ class EvalResult:
108
  open_ended_results = {}
109
  if "open-ended" in data["results"]:
110
  for task in OpenEndedColumns:
111
- task = task.value
112
  # We average all scores of a given metric (not all metrics are present in all files)
113
  accs = data["results"]["open-ended"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended"]["overall"] else None
114
  open_ended_results[task.benchmark] = accs
@@ -196,6 +203,109 @@ class EvalResult:
196
  accs = data["results"]["healthbench-hard"]["Theme Scores"][task.benchmark]
197
  healthbench_hard_results[task.benchmark] = accs
198
 
 
 
199
  return self(
200
  eval_name=result_key,
201
  full_model=full_model,
@@ -210,6 +320,13 @@ class EvalResult:
210
  soap_results=soap_results,
211
  healthbench_results=healthbench_results,
212
  healthbench_hard_results=healthbench_hard_results,
 
 
213
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
214
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
215
  precision=precision,
@@ -322,6 +439,43 @@ class EvalResult:
322
  for task in HealthbenchHardColumns:
323
  data_dict[task.value.col_name] = self.healthbench_hard_results[task.value.benchmark]
324
  return data_dict
 
 
325
 
326
  def get_request_file_for_model(requests_path, model_name, precision):
327
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
 
9
 
10
  from src.display.formatting import make_clickable_model
11
  # changes to be made here
12
+ from src.display.utils import AutoEvalColumn, ModelType, ModelArch, Precision, HarnessTasks, WeightType, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, ClosedEndedMultilingualColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn
13
  from src.submission.check_validity import is_model_on_hub
14
  from src.envs import PRIVATE_REPO
15
 
 
31
  soap_results: dict
32
  healthbench_results: dict
33
  healthbench_hard_results: dict
34
+ open_ended_arabic_results: dict
35
+ open_ended_french_results: dict
36
+ open_ended_spanish_results: dict
37
+ open_ended_portuguese_results: dict
38
+ open_ended_romanian_results: dict
39
+ open_ended_greek_results: dict
40
+ closed_ended_multilingual_results: dict
41
  is_domain_specific: bool
42
  use_chat_template: bool
43
  # clinical_type_results:dict
 
115
  open_ended_results = {}
116
  if "open-ended" in data["results"]:
117
  for task in OpenEndedColumns:
118
+ task = task.value
119
  # We average all scores of a given metric (not all metrics are present in all files)
120
  accs = data["results"]["open-ended"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended"]["overall"] else None
121
  open_ended_results[task.benchmark] = accs
 
203
  accs = data["results"]["healthbench-hard"]["Theme Scores"][task.benchmark]
204
  healthbench_hard_results[task.benchmark] = accs
205
 
206
+ open_ended_arabic_results = {}
207
+ if "open-ended-arabic" in data["results"]:
208
+ for task in OpenEndedArabicColumn:
209
+ task = task.value
210
+ # We average all scores of a given metric (not all metrics are present in all files)
211
+ accs = data["results"]["open-ended-arabic"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-arabic"]["overall"] else None
212
+ open_ended_arabic_results[task.benchmark] = accs
213
+ if open_ended_arabic_results["ELO_intervals"] is not None and open_ended_arabic_results["Score_intervals"] is not None:
214
+ open_ended_arabic_results["ELO_intervals"] = "+" + str(open_ended_arabic_results["ELO_intervals"][1]) + "/-" + str(abs(float(open_ended_arabic_results["ELO_intervals"][0])))
215
+ open_ended_arabic_results["Score_intervals"] = "+" + str(open_ended_arabic_results["Score_intervals"][1]) + "/-" + str(abs(float(open_ended_arabic_results["Score_intervals"][0])))
216
+ open_ended_french_results = {}
217
+ if "open-ended-french" in data["results"]:
218
+ for task in OpenEndedFrenchColumn:
219
+ task = task.value
220
+ # We average all scores of a given metric (not all metrics are present in all files)
221
+ accs = data["results"]["open-ended-french"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-french"]["overall"] else None
222
+ open_ended_french_results[task.benchmark] = accs
223
+ if open_ended_french_results["ELO_intervals"] is not None and open_ended_french_results["Score_intervals"] is not None:
224
+ open_ended_french_results["ELO_intervals"] = "+" + str(open_ended_french_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_french_results["ELO_intervals"][0]))
225
+ open_ended_french_results["Score_intervals"] = "+" + str(open_ended_french_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_french_results["Score_intervals"][0]))
226
+ open_ended_spanish_results = {}
227
+ if "open-ended-spanish" in data["results"]:
228
+ for task in OpenEndedSpanishColumn:
229
+ task = task.value
230
+ # We average all scores of a given metric (not all metrics are present in all files)
231
+ accs = data["results"]["open-ended-spanish"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-spanish"]["overall"] else None
232
+ open_ended_spanish_results[task.benchmark] = accs
233
+ if open_ended_spanish_results["ELO_intervals"] is not None and open_ended_spanish_results["Score_intervals"] is not None:
234
+ open_ended_spanish_results["ELO_intervals"] = "+" + str(open_ended_spanish_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_spanish_results["ELO_intervals"][0]))
235
+ open_ended_spanish_results["Score_intervals"] = "+" + str(open_ended_spanish_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_spanish_results["Score_intervals"][0]))
236
+ open_ended_portuguese_results = {}
237
+ if "open-ended-portuguese" in data["results"]:
238
+ for task in OpenEndedPortugueseColumn:
239
+ task = task.value
240
+ # We average all scores of a given metric (not all metrics are present in all files)
241
+ accs = data["results"]["open-ended-portuguese"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-portuguese"]["overall"] else None
242
+ open_ended_portuguese_results[task.benchmark] = accs
243
+ if open_ended_portuguese_results["ELO_intervals"] is not None and open_ended_portuguese_results["Score_intervals"] is not None:
244
+ open_ended_portuguese_results["ELO_intervals"] = "+" + str(open_ended_portuguese_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_portuguese_results["ELO_intervals"][0]))
245
+ open_ended_portuguese_results["Score_intervals"] = "+" + str(open_ended_portuguese_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_portuguese_results["Score_intervals"][0]))
246
+ open_ended_romanian_results = {}
247
+ if "open-ended-romanian" in data["results"]:
248
+ for task in OpenEndedRomanianColumn:
249
+ task = task.value
250
+ # We average all scores of a given metric (not all metrics are present in all files)
251
+ accs = data["results"]["open-ended-romanian"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-romanian"]["overall"] else None
252
+ open_ended_romanian_results[task.benchmark] = accs
253
+ if open_ended_romanian_results["ELO_intervals"] is not None and open_ended_romanian_results["Score_intervals"] is not None:
254
+ open_ended_romanian_results["ELO_intervals"] = "+" + str(open_ended_romanian_results["ELO_intervals"][1]) + "/-" + str(abs(open_ended_romanian_results["ELO_intervals"][0]))
255
+ open_ended_romanian_results["Score_intervals"] = "+" + str(open_ended_romanian_results["Score_intervals"][1]) + "/-" + str(abs(open_ended_romanian_results["Score_intervals"][0]))
256
+ open_ended_greek_results = {}
257
+ if "open-ended-greek" in data["results"]:
258
+ for task in OpenEndedGreekColumn:
259
+ task = task.value
260
+ # We average all scores of a given metric (not all metrics are present in all files)
261
+ accs = data["results"]["open-ended-greek"]["overall"][task.benchmark] if task.benchmark in data["results"]["open-ended-greek"]["overall"] else None
262
+ open_ended_greek_results[task.benchmark] = accs
263
+ if open_ended_greek_results["ELO_intervals"] is not None and open_ended_greek_results["Score_intervals"] is not None:
264
+ open_ended_greek_results["ELO_intervals"] = "+" + str(open_ended_greek_results["ELO_intervals"][1]) + "/-" + str(abs(float(open_ended_greek_results["ELO_intervals"][0])))
265
+ open_ended_greek_results["Score_intervals"] = "+" + str(open_ended_greek_results["Score_intervals"][1]) + "/-" + str(abs(float(open_ended_greek_results["Score_intervals"][0])))
266
+ closed_ended_multilingual_results = {}
267
+ if "closed-ended-multilingual" in data["results"]:
268
+ for task in ClosedEndedMultilingualColumns:
269
+ task = task.value
270
+ accs = data["results"]["closed-ended-multilingual"][task.benchmark]["accuracy"] if task.benchmark in data["results"]["closed-ended-multilingual"] else None
271
+ closed_ended_multilingual_results[task.benchmark] = accs
272
+
273
+ # #add the
274
+ # closed_ended_arabic_results = {}
275
+ # if PRIVATE_REPO and "closed-ended-arabic" in data["results"]:
276
+ # for task in ClosedEndedArabicColumns:
277
+ # task = task.value
278
+ # # We average all scores of a given metric (not all metrics are present in all files)
279
+ # try:
280
+ # accs = np.array([v.get(task.metric, None) for k, v in data["results"]["closed-ended-arabic"].items() if task.benchmark == k])
281
+ # except:
282
+ # # breakpoint()
283
+ # accs = np.array([])
284
+ # if accs.size == 0 or any([acc is None for acc in accs]):
285
+ # continue
286
+ # mean_acc = np.mean(accs) # * 100.0
287
+ # closed_ended_arabic_results[task.benchmark] = mean_acc
288
+
289
+
290
+ # if open_ended_results == {} or med_safety_results == {} or medical_summarization_results == {} or aci_results == {} or soap_results == {}:
291
+ # open_ended_results = {}
292
+ # med_safety_results = {}
293
+ # medical_summarization_results = {}
294
+ # aci_results = {}
295
+ # soap_results = {}
296
+ # types_results = {}
297
+ # for clinical_type in ClinicalTypes:
298
+ # clinical_type = clinical_type.value
299
+
300
+ # # We average all scores of a given metric (not all metrics are present in all files)
301
+ # accs = np.array([v.get(clinical_type.metric, None) for k, v in data[evaluation_metric]["clinical_type_results"].items() if clinical_type.benchmark == k])
302
+ # if accs.size == 0 or any([acc is None for acc in accs]):
303
+ # continue
304
+
305
+ # mean_acc = np.mean(accs) # * 100.0
306
+ # types_results[clinical_type.benchmark] = mean_acc
307
+ # if "deepseek-ai/DeepSeek-R1-Distill-Llama-70B" in json_filepath:
308
+ # breakpoint()
309
  return self(
310
  eval_name=result_key,
311
  full_model=full_model,
 
320
  soap_results=soap_results,
321
  healthbench_results=healthbench_results,
322
  healthbench_hard_results=healthbench_hard_results,
323
+ open_ended_arabic_results=open_ended_arabic_results,
324
+ open_ended_french_results=open_ended_french_results,
325
+ open_ended_spanish_results=open_ended_spanish_results,
326
+ open_ended_portuguese_results=open_ended_portuguese_results,
327
+ open_ended_romanian_results=open_ended_romanian_results,
328
+ open_ended_greek_results=open_ended_greek_results,
329
+ closed_ended_multilingual_results=closed_ended_multilingual_results,
330
  is_domain_specific=config.get("is_domain_specific", False), # Assuming a default value
331
  use_chat_template=config.get("use_chat_template", False), # Assuming a default value
332
  precision=precision,
 
439
  for task in HealthbenchHardColumns:
440
  data_dict[task.value.col_name] = self.healthbench_hard_results[task.value.benchmark]
441
  return data_dict
442
+ if subset == "open_ended_arabic":
443
+ if len(self.open_ended_arabic_results) > 0:
444
+ for task in OpenEndedArabicColumn:
445
+ data_dict[task.value.col_name] = self.open_ended_arabic_results[task.value.benchmark]
446
+ return data_dict
447
+ if subset == "open_ended_french":
448
+ if len(self.open_ended_french_results) > 0:
449
+ for task in OpenEndedFrenchColumn:
450
+ data_dict[task.value.col_name] = self.open_ended_french_results[task.value.benchmark]
451
+ return data_dict
452
+ if subset == "open_ended_spanish":
453
+ if len(self.open_ended_spanish_results) > 0:
454
+ for task in OpenEndedSpanishColumn:
455
+ data_dict[task.value.col_name] = self.open_ended_spanish_results[task.value.benchmark]
456
+ return data_dict
457
+ if subset == "open_ended_portuguese":
458
+ if len(self.open_ended_portuguese_results) > 0:
459
+ for task in OpenEndedPortugueseColumn:
460
+ data_dict[task.value.col_name] = self.open_ended_portuguese_results[task.value.benchmark]
461
+ return data_dict
462
+ if subset == "open_ended_romanian":
463
+ if len(self.open_ended_romanian_results) > 0:
464
+ for task in OpenEndedRomanianColumn:
465
+ data_dict[task.value.col_name] = self.open_ended_romanian_results[task.value.benchmark]
466
+ return data_dict
467
+ if subset == "open_ended_greek":
468
+ if len(self.open_ended_greek_results) > 0:
469
+ for task in OpenEndedGreekColumn:
470
+ data_dict[task.value.col_name] = self.open_ended_greek_results[task.value.benchmark]
471
+ return data_dict
472
+ if subset == "closed_ended_multilingual":
473
+ average = sum([v for v in self.closed_ended_multilingual_results.values() if v is not None]) / len(ClosedEndedMultilingualColumns)
474
+ data_dict[AutoEvalColumn.average.name] = average
475
+ if len(self.closed_ended_multilingual_results) > 0:
476
+ for task in ClosedEndedMultilingualColumns:
477
+ data_dict[task.value.col_name] = self.closed_ended_multilingual_results[task.value.benchmark]
478
+ return data_dict
479
 
480
  def get_request_file_for_model(requests_path, model_name, precision):
481
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
src/populate.py CHANGED
@@ -5,7 +5,7 @@ import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
  from src.envs import PRIVATE_REPO
11
 
@@ -14,15 +14,16 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
14
  raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
15
  # print(raw_data)
16
  # raise Exception("stop")
17
  all_data_json = [v.to_dict(subset=subset) for v in raw_data]
18
-
19
  df = pd.DataFrame.from_records(all_data_json)
20
  # changes to be made here
21
  if subset == "datasets":
22
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
23
  elif subset == "med_safety":
24
  df = df.sort_values(by=["Harmfulness Score"], ascending=True)
25
- elif subset == "open_ended":
26
  df = df.sort_values(by=["ELO"], ascending=False)
27
  elif subset == "medical_summarization":
28
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
@@ -36,6 +37,8 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
36
  df = df.sort_values(by=["Overall Score"], ascending=False)
37
  elif subset == "healthbench_hard":
38
  df = df.sort_values(by=["Overall Score"], ascending=False)
39
  cols = list(set(df.columns).intersection(set(cols)))
40
  df = df[cols].round(decimals=2)
41
  # filter out if any of the benchmarks have not been produced
 
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
  # changes to be made here
8
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn, ClosedEndedMultilingualColumns
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
  from src.envs import PRIVATE_REPO
11
 
 
14
  raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)
15
  # print(raw_data)
16
  # raise Exception("stop")
17
+ # if subset.startswith("healthbench"):
18
+ # breakpoint()
19
  all_data_json = [v.to_dict(subset=subset) for v in raw_data]
 
20
  df = pd.DataFrame.from_records(all_data_json)
21
  # changes to be made here
22
  if subset == "datasets":
23
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
24
  elif subset == "med_safety":
25
  df = df.sort_values(by=["Harmfulness Score"], ascending=True)
26
+ elif subset.startswith("open_ended"):
27
  df = df.sort_values(by=["ELO"], ascending=False)
28
  elif subset == "medical_summarization":
29
  df = df.sort_values(by=[AutoEvalColumn.overall.name], ascending=False)
 
37
  df = df.sort_values(by=["Overall Score"], ascending=False)
38
  elif subset == "healthbench_hard":
39
  df = df.sort_values(by=["Overall Score"], ascending=False)
40
+ elif subset == "closed_ended_multilingual":
41
+ df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
42
  cols = list(set(df.columns).intersection(set(cols)))
43
  df = df[cols].round(decimals=2)
44
  # filter out if any of the benchmarks have not been produced
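With the new sorting branch in place, app.py can load the multilingual table the same way it loads the other subsets. A hedged usage sketch: the two-value unpacking and the evaluation_metric string mirror how the existing subsets are requested and are assumptions, not part of this diff; the column lists are the ones added in src/display/utils.py:

_, closed_multilingual_original_df = get_leaderboard_df(
    EVAL_RESULTS_PATH,                      # from src.envs
    EVAL_REQUESTS_PATH,                     # from src.envs
    ClosedEndedMultilingual_COLS,
    ClosedEndedMultilingual_BENCHMARK_COLS,
    "ClosedEndedMultilingual",              # placeholder evaluation_metric
    "closed_ended_multilingual",            # subset: sorted by the Average column
)
closed_multilingual_leaderboard_df = closed_multilingual_original_df.copy()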
src/submission/check_validity.py CHANGED
@@ -49,7 +49,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
49
  return True, None, config
50
 
51
  except ValueError as e:
52
- print(e)
53
  return (
54
  False,
55
  "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
@@ -57,7 +57,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
57
  )
58
 
59
  except Exception as e:
60
- print(e)
61
  return False, "was not found on hub!", None
62
 
63
  def get_model_size(model_info: ModelInfo, precision: str=None):
 
49
  return True, None, config
50
 
51
  except ValueError as e:
52
+ # print(e)
53
  return (
54
  False,
55
  "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
 
57
  )
58
 
59
  except Exception as e:
60
+ # print(e)
61
  return False, "was not found on hub!", None
62
 
63
  def get_model_size(model_info: ModelInfo, precision: str=None):
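Silencing the prints does not change the function's contract: it still returns a (found, error_message, config) triple, with error_message set to one of the fragments shown above when the lookup fails. A hedged usage sketch (the model id is illustrative):

ok, error, config = is_model_on_hub("some-org/some-model", revision="main")
if not ok:
    print("Submission rejected: the model " + error)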