import logging
import os
import re
import sys

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from datasets import load_dataset
from datasets.data_files import EmptyDatasetError
from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
from huggingface_hub import HfApi

from src import about
from src.display.css_html_js import custom_css
from src.plots import plot_cost_efficiency, plot_parameter_efficiency
from src.schema import AutoEvalColumn, EvalResult, fields

logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

_hf_token = os.environ.get("HF_TOKEN")
if not _hf_token:
    raise ValueError("HF_TOKEN not set!")
api = HfApi(token=_hf_token)

REPO_ID = "UD-Filipino/filbench-leaderboard"
REPO_RESULTS = "UD-Filipino/filbench-results"
SUBMISSION_RESULTS = "UD-Filipino/filbench-results-submission"

def restart_space():
    # Restart the Space so it reloads fresh results; scheduled hourly below.
    api.restart_space(repo_id=REPO_ID)

def get_results(
    source: str, aggregate: bool = False, submissions: str = None
) -> tuple[pd.DataFrame, list]:
    """Load results from a given source and return a DataFrame with the relevant columns.

    If `aggregate` is True, it returns the aggregated results.

    source (str): The source dataset to load results from.
    aggregate (bool): Whether to return aggregated results or not.
    submissions (str, optional): The submissions dataset to load results from.
    RETURNS (tuple[pd.DataFrame, list]): A tuple containing the DataFrame with results and a list of master columns.
    """
    results = load_dataset(source, split="train").to_pandas().to_dict(orient="records")
    raw_data = [EvalResult.init_from_dict(result) for result in results]

    if submissions:
        try:
            submission_results = (
                load_dataset(
                    submissions, split="train", download_mode="force_redownload"
                )
                .to_pandas()
                .to_dict(orient="records")
            )
        except EmptyDatasetError:
            logging.info("Empty dataset for submissions, skipping...")
            submission_results = []
        if len(submission_results) == 0:
            logging.info("No external submissions found!")
        else:
            logging.info(f"Found {len(submission_results)} submission/s!")

        raw_data += [
            EvalResult.init_from_dict(result, is_submission=True)
            for result in submission_results
        ]

    all_data_json = [v.to_dict() for v in raw_data]
    df = pd.DataFrame.from_records(all_data_json)
    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
    # True when a row has no missing scores; the "Hide incomplete evaluations"
    # filter in init_leaderboard keeps these rows by default.
    df["Incomplete"] = ~df.isna().any(axis=1)
    master_columns = []
    for col in fields(AutoEvalColumn):
        if col.meta:
            master_columns.append(col.name)
        if aggregate:
            if col.aggregate:
                master_columns.append(col.name)
        else:
            if not col.aggregate:
                master_columns.append(col.name)

    cols = [
        c.name
        for c in fields(AutoEvalColumn)
        if not c.hidden and c.name in master_columns
    ]
    cols.append("Incomplete")
    df = df[cols].round(decimals=2)
    return df, master_columns
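
# A minimal usage sketch (illustrative only, not executed): with aggregate=True
# the frame keeps the per-category columns flagged as `aggregate` in
# AutoEvalColumn, with aggregate=False it keeps the per-task columns; both
# always include the meta columns.
#
#   df_overview, cols = get_results(REPO_RESULTS, aggregate=True)
#   df_detailed, _ = get_results(
#       REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
#   )
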
def init_leaderboard(
    source: str, aggregate: bool = False, submissions: str = None
) -> Leaderboard:
    df, master_columns = get_results(
        source=source, aggregate=aggregate, submissions=submissions
    )

    return Leaderboard(
        value=df,
        datatype=[c.type for c in fields(AutoEvalColumn) if c.name in master_columns],
        select_columns=SelectColumns(
            default_selection=[
                c.name
                for c in fields(AutoEvalColumn)
                if c.displayed_by_default and c.name in master_columns
            ],
            cant_deselect=[
                c.name
                for c in fields(AutoEvalColumn)
                if c.never_hidden and c.name in master_columns
            ],
            label="Select Columns to Display:",
        ),
        filter_columns=[
            ColumnFilter(
                "Incomplete",
                type="boolean",
                label="Hide incomplete evaluations",
                default=True,
            ),
            ColumnFilter(
                "Submission",
                type="boolean",
                label="Show only submitted results",
                default=False,
            ),
            ColumnFilter(
                AutoEvalColumn.model_type.name,
                type="checkboxgroup",
                label="Model type",
            ),
            ColumnFilter(
                AutoEvalColumn.multilingual.name,
                type="checkboxgroup",
                label="Multilinguality",
            ),
            ColumnFilter(
                AutoEvalColumn.param_size.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
                default=[-1, 83],
            ),
        ],
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=[
            c.name
            for c in fields(AutoEvalColumn)
            if c.hidden and c.name in master_columns
        ],
        interactive=False,
    )
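
# For reference, a sketch mirroring how the Gradio app below instantiates this
# helper for its two tabs (the names `overview` and `detailed` are illustrative;
# the app assigns both to `leaderboard` inside their respective tabs):
#
#   overview = init_leaderboard(
#       REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
#   )
#   detailed = init_leaderboard(
#       REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
#   )
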
def get_clean_df() -> pd.DataFrame:
    df, _ = get_results(
        source=REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
    )
    df_agg, _ = get_results(
        source=REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
    )

    def extract_names(html_string):
        # Pull the display name out of the HTML anchor used in the "Model" column.
        match = re.search(r"<a[^>]*>(.*?)</a>", html_string)
        if match:
            extracted_text = match.group(1)
            return extracted_text

    def remove_emojis(string):
        emoji_pattern = re.compile(
            "["
            "\U0001f600-\U0001f64f"
            "\U0001f300-\U0001f5ff"
            "\U0001f680-\U0001f6ff"
            "\U0001f700-\U0001f77f"
            "\U0001f780-\U0001f7ff"
            "\U0001f800-\U0001f8ff"
            "\U0001f900-\U0001f9ff"
            "\U0001fa00-\U0001fa6f"
            "\U0001fa70-\U0001faff"
            "\U00002702-\U000027b0"
            "\U000024c2-\U0001f251"
            "]+",
            flags=re.UNICODE,
        )
        return emoji_pattern.sub(r"", string).strip()

    # Clean the detailed results: plain model names, emoji-free columns and labels.
    df["Model"] = df["Model"].apply(extract_names)
    df = df.rename(columns={col: remove_emojis(col).strip() for col in df.columns})
    df["Multilingual"] = df["Multilingual"].apply(remove_emojis)
    df["Model Type"] = df["Model Type"].apply(remove_emojis)
    df = df.reset_index(drop=True)

    # Clean the aggregated results and keep only the per-category scores.
    df_agg["Model"] = df_agg["Model"].apply(extract_names)
    df_agg = df_agg.rename(
        columns={col: remove_emojis(col).strip() for col in df_agg.columns}
    )
    df_agg = df_agg.reset_index(drop=True)
    df_agg = df_agg[
        [
            "Model",
            "Cultural Knowledge",
            "Classical NLP",
            "Reading Comprehension",
            "Generation",
        ]
    ]
    df_agg = df_agg.rename(
        columns={col: f"agg_{col}" for col in df_agg.columns if col != "Model"}
    )
    df_merge = df.merge(df_agg, on="Model")
    return df_merge
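
# Illustrative sketch of the merged frame consumed by the Analysis plots and the
# CSV download: one row per model, the cleaned detailed columns, plus the four
# aggregated category scores prefixed with "agg_". For example:
#
#   df = get_clean_df()
#   df[["Model", "agg_Cultural Knowledge", "agg_Generation"]].head()
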
def download_results():
    df = get_clean_df()
    filepath = "filbench_results.csv"
    df.to_csv(filepath, index=False)
    return filepath

demo = gr.Blocks(css=custom_css)
with demo:
    with gr.Column(scale=6):
        num_models = len(
            get_results(REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS)[0]
        )
        gr.Markdown(about.TOP_TEXT.format(str(num_models)))

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem(
            "FilBench Leaderboard", elem_id="llm-benchmark-tab-table", id=0
        ):
            leaderboard = init_leaderboard(
                REPO_RESULTS, aggregate=True, submissions=SUBMISSION_RESULTS
            )

        with gr.TabItem(
            "FilBench - Detailed", elem_id="llm-benchmark-tab-table", id=1
        ):
            leaderboard = init_leaderboard(
                REPO_RESULTS, aggregate=False, submissions=SUBMISSION_RESULTS
            )

        with gr.TabItem("Analysis", id=2):
            df = get_clean_df()
            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Parameter-Efficiency Plot")
                    plot_parameter_efficiency(df)
                    gr.Markdown(
                        "Model performance on FilBench with respect to parameter size. "
                        "For mixture-of-experts models, we plot the full parameter count. "
                        "In general, we find that model size and performance are positively correlated."
                    )
                with gr.Column():
                    gr.Markdown("## Cost-Efficiency Plot")
                    plot_cost_efficiency(df)
                    gr.Markdown(
                        "Model performance on FilBench with respect to per-token output cost ($/1M tokens). "
                        "We use the token pricing published on [OpenRouter](https://openrouter.ai/models). "
                        "For models not available on OpenRouter, we either exclude them from the chart or use the cost of the base model they were finetuned from."
                    )

        with gr.TabItem("About", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(about.LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

    with gr.Row():
        download_button = gr.DownloadButton("Download results (CSV)")
        download_button.click(download_results, outputs=download_button)

    with gr.Accordion("Citation", open=False):
        citation_button = gr.Textbox(
            value=about.CITATION_BUTTON_TEXT,
            label=about.CITATION_BUTTON_LABEL,
            lines=20,
            elem_id="citation-button",
            show_copy_button=True,
        )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=3600)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()