import logging
import os
import re
import sys
import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from datasets.data_files import EmptyDatasetError
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from huggingface_hub import HfApi
from src import about
from src.display.css_html_js import custom_css
from src.plots import plot_cost_efficiency, plot_parameter_efficiency
from src.schema import AutoEvalColumn, EvalResult, fields
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)
# 1. Initialization
_hf_token = os.environ.get("HF_TOKEN")
if not _hf_token:
    raise ValueError("HF_TOKEN not set!")
api = HfApi(token=_hf_token)
REPO_ID = "UD-Filipino/filbench-leaderboard"
REPO_RESULTS = "UD-Filipino/filbench-results"
SUBMISSION_RESULTS = "UD-Filipino/filbench-results-submission"
def restart_space():
    api.restart_space(repo_id=REPO_ID)
# 2. Load and populate leaderboard data
def get_results(
    source: str, aggregate: bool = False, submissions: str | None = None
) -> tuple[pd.DataFrame, list]:
    """Load results from a given source and return a DataFrame with the relevant columns.

    If `aggregate` is True, it returns the aggregated results.

    source (str): The source dataset to load results from.
    aggregate (bool): Whether to return aggregated results or not.
    submissions (str, optional): The submissions dataset to load results from.
    RETURNS (tuple[pd.DataFrame, list]): A tuple containing the DataFrame with results
        and a list of master columns.
    """
    results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
    raw_data = [EvalResult.init_from_dict(result) for result in results]

    if submissions:
        try:
            submission_results = (
                load_dataset(
                    submissions, split="train", download_mode="force_redownload"
                )
                .to_pandas()
                .to_dict(orient="records")
            )
        except EmptyDatasetError:
            logging.info("Empty dataset for submissions, skipping...")
            submission_results = []

        if len(submission_results) == 0:
            logging.info("No external submissions found!")
        else:
            logging.info(f"Found {len(submission_results)} submission/s!")
            raw_data += [
                EvalResult.init_from_dict(result, is_submission=True)
                for result in submission_results
            ]

    all_data_json = [v.to_dict() for v in raw_data]
    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    # True when the row has no missing scores; the "Hide incomplete evaluations"
    # filter in the leaderboard keeps only these rows.
    df["Incomplete"] = ~df.isna().any(axis=1)

    master_columns = []
    for col in fields(AutoEvalColumn):
        if col.meta:
            master_columns.append(col.name)
        if aggregate:
            if col.aggregate:
                master_columns.append(col.name)
        else:
            if not col.aggregate:
                master_columns.append(col.name)

    cols = [
        c.name
        for c in fields(AutoEvalColumn)
        if not c.hidden and c.name in master_columns
    ]
    cols.append("Incomplete")
    df = df[cols].round(decimals=2)
    return df, master_columns
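

# Usage sketch (illustrative, not executed here): get_results can also be called
# directly, e.g. in a notebook, to inspect the aggregated scores with external
# submissions merged in. The repo IDs are the module-level constants defined above.
#
#     df, master_columns = get_results(
#         source=REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
#     )
#     print(df.head())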
def init_leaderboard(
    source: str, aggregate: bool = False, submissions: str | None = None
) -> Leaderboard:
    """Build the Leaderboard component for a given results dataset."""
    df, master_columns = get_results(
        source=source, aggregate=aggregate, submissions=submissions
    )
    return Leaderboard(
        value=df,
        datatype=[c.type for c in fields(AutoEvalColumn) if c.name in master_columns],
        select_columns=SelectColumns(
            default_selection=[
                c.name
                for c in fields(AutoEvalColumn)
                if c.displayed_by_default and c.name in master_columns
            ],
            cant_deselect=[
                c.name
                for c in fields(AutoEvalColumn)
                if c.never_hidden and c.name in master_columns
            ],
            label="Select Columns to Display:",
        ),
        filter_columns=[
            # fmt: off
            ColumnFilter("Incomplete", type="boolean", label="Hide incomplete evaluations", default=True),
            ColumnFilter("Submission", type="boolean", label="Show only submitted results", default=False),
            # ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model type"),
            ColumnFilter(AutoEvalColumn.multilingual.name, type="checkboxgroup", label="Multilinguality"),
            ColumnFilter(AutoEvalColumn.param_size.name, type="slider", min=0.01, max=150, label="Select the number of parameters (B)", default=[-1, 83]),
            # fmt: on
        ],
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=[
            c.name
            for c in fields(AutoEvalColumn)
            if c.hidden and c.name in master_columns
        ],
        interactive=False,
    )
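

# Sketch (assumed local-debugging pattern, not part of the app): the component can be
# rendered on its own inside a bare Blocks context, outside the full UI in section 3.
#
#     with gr.Blocks() as debug_demo:
#         init_leaderboard(REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS)
#     debug_demo.launch()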
def get_clean_df() -> pd.DataFrame:
    df, _ = get_results(
        source=REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
    )
    df_agg, _ = get_results(
        source=REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
    )

    # Cleanup
    def extract_names(html_string):
        # Pull the display name out of the model's HTML link,
        # e.g. '<a href="...">org/model</a>' -> 'org/model'.
        match = re.search(r"<a[^>]*>(.*?)</a>", html_string)
        if match:
            return match.group(1)
        # Fall back to the original string if it is not wrapped in a link.
        return html_string

    def remove_emojis(string):
        emoji_pattern = re.compile(
            "["
            "\U0001f600-\U0001f64f"  # emoticons
            "\U0001f300-\U0001f5ff"  # symbols & pictographs
            "\U0001f680-\U0001f6ff"  # transport & map symbols
            "\U0001f700-\U0001f77f"  # alchemical symbols
            "\U0001f780-\U0001f7ff"  # Geometric Shapes Extended
            "\U0001f800-\U0001f8ff"  # Supplemental Arrows-C
            "\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
            "\U0001fa00-\U0001fa6f"  # Chess Symbols
            "\U0001fa70-\U0001faff"  # Symbols and Pictographs Extended-A
            "\U00002702-\U000027b0"  # Dingbats
            "\U000024c2-\U0001f251"
            "]+",
            flags=re.UNICODE,
        )
        return emoji_pattern.sub(r"", string).strip()

    df["Model"] = df["Model"].apply(extract_names)
    df = df.rename(columns={col: remove_emojis(col).strip() for col in df.columns})
    df["Multilingual"] = df["Multilingual"].apply(remove_emojis)
    df["Model Type"] = df["Model Type"].apply(remove_emojis)
    df = df.reset_index(drop=True)

    # Cleanup the aggregated dataset
    df_agg["Model"] = df_agg["Model"].apply(extract_names)
    df_agg = df_agg.rename(
        columns={col: remove_emojis(col).strip() for col in df_agg.columns}
    )
    df_agg = df_agg.reset_index(drop=True)
    df_agg = df_agg[
        [
            "Model",
            "Cultural Knowledge",
            "Classical NLP",
            "Reading Comprehension",
            "Generation",
        ]
    ]
    df_agg = df_agg.rename(
        columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
    )
    df_merge = df.merge(df_agg, on="Model")
    return df_merge
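

# The merged frame returned above carries the detailed scores plus the four aggregated
# category scores prefixed with "agg_". Illustrative sketch (not executed; exact
# detailed column names follow AutoEvalColumn and may differ):
#
#     df = get_clean_df()
#     df[["Model", "agg_Cultural Knowledge", "agg_Generation"]].head()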
def download_results():
    df = get_clean_df()
    filepath = "filbench_results.csv"
    df.to_csv(filepath, index=False)
    return filepath
# 3. Actual setup of the HF Space
demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Column(scale=6):
        num_models = len(
            get_results(REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS)[0]
        )
        gr.Markdown(about.TOP_TEXT.format(str(num_models)))

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem(
            "FilBench Leaderboard", elem_id="llm-benchmark-tab-table", id=0
        ):
            leaderboard = init_leaderboard(
                REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
            )
        with gr.TabItem(
            "FilBench - Detailed", elem_id="llm-benchmark-tab-table", id=1
        ):
            leaderboard = init_leaderboard(
                REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
            )
        with gr.TabItem("Analysis", id=2):
            df = get_clean_df()
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Parameter-Efficiency Plot")
                    plot_parameter_efficiency(df)
                    gr.Markdown(
                        "Model performance on FilBench with respect to parameter size. "
                        "For mixture-of-experts models, we plot the full parameter count. "
                        "In general, we find that model size and performance are positively correlated."
                    )
                with gr.Column():
                    gr.Markdown("## Cost-Efficiency Plot")
                    plot_cost_efficiency(df)
                    gr.Markdown(
                        "Model performance on FilBench with respect to per-token output cost ($/1M tokens). "
                        "We use the token pricing published on [OpenRouter](https://openrouter.ai/models). "
                        "For models not in OpenRouter, we either exclude them from the chart or use the cost of the base model they were finetuned from."
                    )
        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    with gr.Row():
        download_button = gr.DownloadButton("Download results (CSV)")
        download_button.click(download_results, outputs=download_button)

    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=about.CITATION_BUTTON_TEXT,
            label=about.CITATION_BUTTON_LABEL,
            lines=20,
            elem_id="citation-button",
            show_copy_button=True,
        )
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()