Spaces:
Restarting
Restarting
File size: 6,200 Bytes
9ae8d89 0a14325 930ed8c 9ae8d89 20dad4a 9ae8d89 09b313f 9ae8d89 d8147b8 09b313f 930ed8c ca44f9b 9ae8d89 0a14325 0da5ee3 0a14325 ba515db d83f3a1 0da5ee3 553b217 2a7ac72 553b217 2a7ac72 553b217 2a7ac72 20dad4a fb84311 d83f3a1 4b6eb81 09b313f 9ae8d89 09b313f 9ae8d89 b50c184 9ae8d89 0a14325 d86ca68 0a14325 553b217 20dad4a 9ae8d89 09b313f 9ae8d89 09b313f b50c184 9ae8d89 d86ca68 0a14325 553b217 20dad4a 9ae8d89 d86ca68 0a14325 553b217 20dad4a 553b217 d86ca68 9ae8d89 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import json
import os
import pandas as pd
from src.display.formatting import has_no_nan_values, make_clickable_model
# changes to be made here
from src.display.utils import AutoEvalColumn, EvalQueueColumn, OpenEndedColumns, MedSafetyColumns, MedicalSummarizationColumns, ACIColumns, SOAPColumns, HealthbenchColumns, HealthbenchHardColumns, OpenEndedArabicColumn, OpenEndedFrenchColumn, OpenEndedSpanishColumn, OpenEndedPortugueseColumn, OpenEndedRomanianColumn, OpenEndedGreekColumn, ClosedEndedMultilingualColumns
from src.leaderboard.read_evals import get_raw_eval_results
from src.envs import PRIVATE_REPO
def get_leaderboard_df(
    results_path: str,
    requests_path: str,
    cols: list,
    benchmark_cols: list,
    evaluation_metric: str,
    subset: str,
) -> tuple[list, pd.DataFrame]:
    """Create the leaderboard dataframe for one subset from the individual experiment results.

    Args:
        results_path: Location of the result files, forwarded to ``get_raw_eval_results``.
        requests_path: Location of the request files, forwarded to ``get_raw_eval_results``.
        cols: Columns to keep in the output. Columns absent from the data are
            dropped; the remaining ones keep the order given here.
        benchmark_cols: Columns passed to ``has_no_nan_values`` to filter out
            rows for which a benchmark has not been produced.
        evaluation_metric: Metric identifier, forwarded to ``get_raw_eval_results``.
        subset: Leaderboard subset name; selects both the ``to_dict`` view of
            each result and the column the rows are sorted by.

    Returns:
        A ``(raw_data, df)`` tuple: the unfiltered raw results as returned by
        ``get_raw_eval_results`` and the sorted, rounded, filtered dataframe.
    """
    raw_data = get_raw_eval_results(results_path, requests_path, evaluation_metric)

    # Results whose model path starts with "/models_llm" are excluded from the board.
    records = [
        result.to_dict(subset=subset)
        for result in raw_data
        if not result.full_model.startswith("/models_llm")
    ]
    df = pd.DataFrame.from_records(records)

    if df.empty:
        # Nothing to rank. Sorting a column-less frame would raise KeyError, so
        # hand back an empty frame that still exposes the requested columns.
        return raw_data, pd.DataFrame(columns=list(dict.fromkeys(cols)))

    # Pick the ranking column for this subset.
    # NOTE: add a branch here when a new subset is introduced.
    ascending = False
    if subset in ("datasets", "closed_ended_arabic", "closed_ended_multilingual"):
        sort_by = AutoEvalColumn.average.name
    elif subset == "med_safety":
        # The only subset ranked in ascending order (presumably a lower
        # harmfulness score is better — confirm with the metric definition).
        sort_by = "Harmfulness Score"
        ascending = True
    elif subset.startswith("open_ended"):
        sort_by = "ELO"
    elif subset in ("medical_summarization", "aci", "soap"):
        sort_by = AutoEvalColumn.overall.name
    elif subset in ("healthbench", "healthbench_hard"):
        sort_by = "Overall Score"
    else:
        # Unknown subsets are left in their original order.
        sort_by = None

    if sort_by is not None:
        df = df.sort_values(by=[sort_by], ascending=ascending)

    # Keep only the requested columns that exist, in the caller's order.
    # (A set intersection here would make the column order arbitrary.)
    available = set(df.columns)
    selected_cols = [col for col in dict.fromkeys(cols) if col in available]
    df = df[selected_cols].round(decimals=2)

    # Filter out rows for which any of the benchmarks has not been produced.
    df = df[has_no_nan_values(df, benchmark_cols)]
    return raw_data, df
# Keys of the per-task entries read from each request's "status" mapping.
_TASK_STATUS_KEYS = (
    "closed-ended",
    "open-ended",
    "med-safety",
    "medical-summarization",
    "note-generation",
)


def _load_eval_request(file_path: str) -> dict:
    """Read one request JSON file and add the queue display columns to it."""
    # JSON interchange is UTF-8; do not depend on the platform default encoding.
    with open(file_path, encoding="utf-8") as fp:
        data = json.load(fp)

    # Private models are shown as plain text; public ones get a clickable link.
    data[EvalQueueColumn.model.name] = (
        make_clickable_model(data["model_name"]) if not data["private"] else data["model_name"]
    )
    data[EvalQueueColumn.revision.name] = data.get("revision", "main")

    # NOTE: add a line here when a new task status column is introduced.
    status = data["status"]
    data[EvalQueueColumn.closed_ended_status.name] = status["closed-ended"]
    data[EvalQueueColumn.open_ended_status.name] = status["open-ended"]
    data[EvalQueueColumn.med_safety_status.name] = status["med-safety"]
    data[EvalQueueColumn.medical_summarization_status.name] = status["medical-summarization"]
    data[EvalQueueColumn.note_generation_status.name] = status["note-generation"]
    if PRIVATE_REPO:
        data[EvalQueueColumn.closed_ended_arabic_status.name] = status["closed-ended-arabic"]
    return data


def get_evaluation_queue_df(
    save_path: str, cols: list
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create the dataframes for the evaluation request queues.

    Request files are read from ``save_path`` itself (entries whose name
    contains ".json") and from its immediate sub-folders. Hidden entries
    (leading ".") and entries whose name contains ".md" are ignored, as are
    stray non-directory entries and nested sub-directories.

    Args:
        save_path: Directory holding the evaluation request files.
        cols: Columns of the returned dataframes, in order.

    Returns:
        A ``(finished, running, pending)`` tuple of dataframes. A request is
        "running" if any task status is "RUNNING"; otherwise "pending" if any
        is "PENDING" or "RERUN"; otherwise "finished".

    Raises:
        KeyError: If a request file lacks an expected key ("model_name",
            "private", "status", or one of the per-task status keys).
    """
    all_evals = []
    for entry in os.listdir(save_path):
        if entry.startswith("."):
            continue
        entry_path = os.path.join(save_path, entry)
        if ".json" in entry:
            all_evals.append(_load_eval_request(entry_path))
        elif ".md" not in entry and os.path.isdir(entry_path):
            # A folder of requests. The isdir check keeps a stray regular file
            # from being listed as a directory.
            for sub_entry in os.listdir(entry_path):
                if sub_entry.startswith("."):
                    continue
                sub_path = os.path.join(entry_path, sub_entry)
                if os.path.isfile(sub_path):
                    all_evals.append(_load_eval_request(sub_path))

    pending_list = []
    running_list = []
    finished_list = []
    for run in all_evals:
        run_status = run["status"]
        status_list = [run_status[key] for key in _TASK_STATUS_KEYS]
        if PRIVATE_REPO:
            status_list.append(run_status["closed-ended-arabic"])

        # One running task marks the whole request as running; a pending or
        # re-run task marks it as pending; everything else counts as finished.
        if "RUNNING" in status_list:
            running_list.append(run)
        elif "PENDING" in status_list or "RERUN" in status_list:
            pending_list.append(run)
        else:
            finished_list.append(run)

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
|