In [2]:
import json
from pathlib import Path

import gradio as gr
import pandas as pd

In [3]:
def _extract_metric(task, results):
    """Return the single headline metric for ``task`` from a ``results`` mapping.

    Args:
        task: Task name (case-insensitive), e.g. ``"Ifeval"`` or ``"mmlu"``.
        results: The ``"results"`` mapping loaded from one evaluation JSON file.

    Returns:
        The value stored under the metric chosen for this task.

    Raises:
        StopIteration: If ``results`` (or its first entry) is empty.
        KeyError: If the metric expected for the task is missing.
    """
    first_result = results[next(iter(results))]
    task_key = task.lower()

    # TruthfulQA has two metrics; the `mc2` one is reported on the leaderboard.
    if task_key == "truthfulqa":
        return first_result["truthfulqa_mc2"]
    # IFEval has several metrics; only the prompt-level loose accuracy is reported.
    if task_key == "ifeval":
        return first_result["prompt_level_loose_acc"]
    # MMLU has one entry per subject; only the pre-computed average is reported.
    if task_key == "mmlu":
        return results["lighteval|mmlu:_average|5"]["acc"]
    # HellaSwag and ARC report the normalised accuracy.
    if task_key in ("hellaswag", "arc"):
        return first_result["acc_norm"]
    # Any other task: fall back to the first metric of the first result entry.
    return first_result[next(iter(first_result))]


def get_leaderboard_df():
    """Build the leaderboard table from the JSON files under ``eval_results``.

    Every ``*.json`` file found recursively under ``eval_results`` is expected to
    sit at ``eval_results/<p1>/<p2>/<p3>/<task>/<name>.json`` (presumably
    org / model / revision - confirm against the producer of these files). The
    row label is ``<p1>_<p2>_<p3>_<date>`` where ``<date>`` is parsed from the
    file name; the trailing ``_<date>`` is stripped from the returned ``Model``
    column, so one model can appear on several rows (one per date).

    Returns:
        pandas.DataFrame with columns ``Model``, ``Date``, ``Average``, then
        ``Ifeval`` (when at least one IFEval result exists), then the remaining
        tasks in the order they were first encountered. Numeric scores are
        multiplied by 100 and rounded to 2 decimals; rows are sorted by
        ``Average`` in descending order. When no result file exists, an empty
        frame with columns ``Model``, ``Date``, ``Average`` is returned.

    Raises:
        IndexError: If a JSON file is not nested deeply enough to have a task
            directory.
        KeyError: If a file lacks ``"results"`` or the metric expected for its task.
    """
    filepaths = list(Path("eval_results").rglob("*.json"))

    # Collect one dict per row label, then build the frame in a single call
    # instead of enlarging it cell by cell.
    rows = {}
    task_columns = {}  # dict used as an ordered set: first-seen order of tasks
    for filepath in filepaths:
        path_parts = filepath.parts
        # Last "_"-separated chunk of the stem, minus its final three characters,
        # truncated at the "T" that separates date from time.
        date = filepath.stem.split("_")[-1][:-3].split("T")[0]
        model_revision = "_".join(path_parts[1:4]) + "_" + date
        task = path_parts[4].capitalize()

        with open(filepath, "r", encoding="utf-8") as file:
            data = json.load(file)

        row = rows.setdefault(model_revision, {})
        row["Date"] = date
        row[task] = _extract_metric(task, data["results"])
        task_columns.setdefault(task, None)

    if not rows:
        # Nothing evaluated yet: return an empty leaderboard rather than failing.
        return pd.DataFrame(columns=["Model", "Date", "Average"])

    # Date first, IFEval (if present) right after it, then every other task.
    ordered_columns = ["Date"]
    if "Ifeval" in task_columns:
        ordered_columns.append("Ifeval")
    ordered_columns.extend(name for name in task_columns if name != "Ifeval")

    df = pd.DataFrame(list(rows.values()), index=list(rows.keys()), columns=ordered_columns)

    # Drop rows where every score is NaN
    df = df.dropna(how="all", axis=0, subset=[c for c in df.columns if c != "Date"])
    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    # Convert all values to percentage
    df[df.select_dtypes(include=["number"]).columns] *= 100.0
    df = df.sort_values(by=["Average"], ascending=False)
    df = df.reset_index().rename(columns={"index": "Model"}).round(2)
    # Strip off date from model name
    df["Model"] = df["Model"].str.rsplit("_", n=1).str[0]
    return df

In [4]:
# Build the leaderboard table from the JSON result files under eval_results/.
df = get_leaderboard_df()

In [5]:
# Show the leaderboard: one row per (model, date) label, sorted by Average descending.
df

Unnamed: 0,Model,Date,Average,Ifeval,Truthfulqa,Winogrande,Gsm8k,Mmlu,Hellaswag,Arc
0,NousResearch_Nous-Hermes-2-Yi-34B_main,2024-03-04,74.01,,61.44,80.58,,76.24,83.79,68.00
1,deepseek-ai_deepseek-llm-67b-chat_main,2024-03-05,71.62,55.27,,,76.12,71.18,83.94,
2,NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main,2024-03-02,70.43,59.33,64.76,78.53,62.17,71.96,85.42,70.82
3,mistralai_Mixtral-8x7B-Instruct-v0.1_main,2024-03-02,69.80,55.08,70.79,73.56,59.89,70.60,86.68,72.01
4,deepseek-ai_deepseek-llm-67b-chat_main,2024-03-04,67.03,,57.78,79.16,,,,64.16
...,...,...,...,...,...,...,...,...,...,...
269,HuggingFaceH4_starcoder2-15b-ift_v18.0,2024-03-10,11.23,21.63,,,0.83,,,
270,HuggingFaceH4_mistral-7b-ift_v49.0,2024-03-07,10.07,20.15,,,0.00,,,
271,HuggingFaceH4_starchat-beta_main,2024-03-12,8.13,8.13,,,,,,
272,HuggingFaceH4_starcoder2-15b-ift_v7.0,2024-03-10,7.88,12.57,,,3.18,,,


In [14]:
# Collapse the per-date rows into one row per model, keeping the best (max)
# score seen for each task. Date and Average are dropped before aggregating.
new_df = (
    df.drop(columns=["Date", "Average"])
    .groupby("Model")
    .max()
    .reset_index()
)
new_df

Unnamed: 0,Model,Ifeval,Truthfulqa,Winogrande,Gsm8k,Mmlu,Hellaswag,Arc
0,HuggingFaceH4_mistral-7b-ift_v41.0,44.36,49.35,72.93,37.30,60.82,79.70,58.36
1,HuggingFaceH4_mistral-7b-ift_v41.1,47.32,47.89,72.69,36.32,60.34,79.57,57.51
2,HuggingFaceH4_mistral-7b-ift_v41.10,32.72,51.05,72.45,25.93,59.75,81.92,59.22
3,HuggingFaceH4_mistral-7b-ift_v41.11,37.89,51.05,64.56,17.59,57.60,77.65,55.89
4,HuggingFaceH4_mistral-7b-ift_v41.12,37.89,45.94,63.30,21.15,58.50,74.94,52.73
...,...,...,...,...,...,...,...,...
258,mistralai_Mistral-7B-Instruct-v0.2_main,53.97,70.68,68.82,38.13,59.43,83.45,65.70
259,mistralai_Mixtral-8x7B-Instruct-v0.1_main,55.08,70.79,73.56,59.89,70.60,86.68,72.01
260,openchat_openchat-3.5-0106_main,54.71,57.55,72.53,66.19,63.72,80.10,61.01
261,stabilityai_stablelm-zephyr-3b_main,34.75,46.19,58.41,40.18,45.18,71.57,45.82


In [16]:
# Attach each model's best per-task scores back onto every (Model, Date) row;
# the left join keeps all rows of the original leaderboard.
pd.merge(df[["Model", "Date"]], new_df, how="left", on="Model")

Unnamed: 0,Model,Date,Ifeval,Truthfulqa,Winogrande,Gsm8k,Mmlu,Hellaswag,Arc
0,NousResearch_Nous-Hermes-2-Yi-34B_main,2024-03-04,39.00,61.44,80.58,67.93,76.24,83.79,68.00
1,deepseek-ai_deepseek-llm-67b-chat_main,2024-03-05,55.27,57.78,79.16,76.12,71.18,83.94,64.16
2,NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO_main,2024-03-02,59.33,64.76,78.53,62.17,71.96,85.42,70.82
3,mistralai_Mixtral-8x7B-Instruct-v0.1_main,2024-03-02,55.08,70.79,73.56,59.89,70.60,86.68,72.01
4,deepseek-ai_deepseek-llm-67b-chat_main,2024-03-04,55.27,57.78,79.16,76.12,71.18,83.94,64.16
...,...,...,...,...,...,...,...,...,...
269,HuggingFaceH4_starcoder2-15b-ift_v18.0,2024-03-10,21.63,,,0.83,,,
270,HuggingFaceH4_mistral-7b-ift_v49.0,2024-03-07,20.15,,,0.00,,,
271,HuggingFaceH4_starchat-beta_main,2024-03-12,8.13,,,,,,
272,HuggingFaceH4_starcoder2-15b-ift_v7.0,2024-03-10,12.57,,,3.18,,,
