Spaces:

evilfreelancer
/

rrr-leaderboard

Running

rrr-leaderboard / app.py

pasha

Update

c7a3d75 about 1 month ago

12.1 kB

	import os
	import re

	import altair as alt
	import pandas as pd
	import streamlit as st

	# Load CSV file
	DATA_FILE = "test_all.csv"


	@st.cache_data(show_spinner=False)
	def _load_results(path: str, mtime: float) -> pd.DataFrame:
	    """Read the benchmark results CSV and normalize its column names.

	    Streamlit re-executes the whole script on every widget interaction, so
	    the parsed frame is cached instead of being re-read each time.

	    Args:
	        path: Location of the CSV file.
	        mtime: Modification time of ``path``. It is not used in the body; it
	            only takes part in the cache key so an updated file is re-read.

	    Returns:
	        The parsed DataFrame with surrounding whitespace stripped from the
	        column names.
	    """
	    frame = pd.read_csv(path)
	    # Normalize column names
	    frame.columns = frame.columns.str.strip()
	    return frame


	# st.cache_data hands back a copy on each call, so later code may mutate df freely.
	df = _load_results(DATA_FILE, os.path.getmtime(DATA_FILE))

	# Page header
	st.title("🇷🇺 Russian Router Ranking (RRR)")
	# Intro copy. Each emphasized phrase carries its own pair of ** markers; a
	# single pair spanning both sentences would render the whole passage in bold.
	st.markdown("""
	This leaderboard evaluates Large Language Models (LLMs) on their ability to perform **text routing and classification
	tasks** in Russian. Models are assessed based on their capability to return answers in a **structured output** format
	(JSON), which is essential for automation and system integration in real-world applications.

	The dataset used is [rrr-benchmark](https://huggingface.co/datasets/evilfreelancer/rrr-benchmark), which focuses on
	practical routing tasks across various domains.

	Source code and details: [GitHub Repository](https://github.com/EvilFreelancer/rrr-benchmark)
	""")
	# Stylesheet injected into the page: a scrollable table container, sortable
	# header cells, sort indicators and tooltip icons.
	_CUSTOM_CSS = """
	<style>
	.scrollable-table {
	max-height: 600px;
	overflow-y: auto;
	overflow-x: auto;
	border: 1px solid #ddd;
	margin-bottom: 25px;
	}
	.sortable-header {
	cursor: pointer;
	background-color: #f0f2f6 !important;
	color: #262730 !important;
	padding: 8px 12px !important;
	border: 1px solid #ddd !important;
	user-select: none;
	position: relative;
	}
	.sortable-header:hover {
	background-color: #e6e9f0 !important;
	color: #262730 !important;
	}
	.sort-indicator {
	margin-left: 5px;
	font-size: 12px;
	color: #666;
	}
	.tooltip-icon {
	margin-left: 5px;
	color: #666;
	cursor: help;
	font-size: 14px;
	}
	</style>
	"""
	# unsafe_allow_html is required, otherwise the <style> tag is escaped as text.
	st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)


	# Utility function to numerically sort model sizes (e.g., 135m < 7b < 13b < 8x7b)
	_SIZE_UNIT_MULTIPLIERS = {"m": 1e6, "b": 1e9, "t": 1e12}
	_SIZE_PATTERN = re.compile(r"\s*(?:(\d+)\s*x\s*)?(\d+(?:\.\d+)?)\s*([mbt])")


	def model_size_sort_key(size: str):
	    """Return an approximate parameter count usable as a sort key.

	    Accepted forms (case-insensitive, leading whitespace ignored):
	    ``"135m"``, ``"7b"``, ``"1.5b"``, ``"1t"`` and mixture-of-experts sizes
	    such as ``"8x7b"`` (evaluated as 8 * 7e9).

	    Args:
	        size: Size label; any non-string value (e.g. NaN) is accepted.

	    Returns:
	        The parameter count as a float, or ``float('inf')`` for non-string
	        or unparseable values so that they sort last.
	    """
	    if not isinstance(size, str):
	        return float('inf')
	    match = _SIZE_PATTERN.match(size.lower())
	    if not match:
	        return float('inf')
	    experts, num, unit = match.groups()
	    value = float(num) * _SIZE_UNIT_MULTIPLIERS[unit]
	    # "NxM" labels describe N experts of M parameters each.
	    return value * int(experts) if experts else value


	# Sidebar filters
	# Option lists are computed up front; the widgets below only display them.
	# Model names: case-insensitive alphabetical order.
	model_name_options = sorted(df["model_name"].dropna().unique(), key=str.lower)
	# Model sizes: ordered by parameter count rather than as text.
	model_size_options = sorted(df["model_size"].dropna().unique(), key=model_size_sort_key)
	# Quantization levels: default alphabetical order.
	model_quant_options = sorted(df["model_quant"].dropna().unique())

	with st.sidebar:
	    st.header("Filters")
	    # An empty selection is treated as "no filter" by the table renderers.
	    model_name = st.multiselect("Select model:", options=model_name_options)
	    model_size = st.multiselect("Select size:", options=model_size_options)
	    model_quant = st.multiselect("Select quantization:", options=model_quant_options)


	# Function to create model URL from model field
	def get_model_url(model_field, model_name):
	    """Build a link to the model page with the display name as URL fragment.

	    The fragment (``#<model_name>``) is what the table's link column extracts
	    as the visible text.

	    Args:
	        model_field: Model reference as used for pulling the model, e.g.
	            ``"hf.co/org/repo:tag"``, ``"llama3:8b"`` or ``"user/model:tag"``.
	        model_name: Human-readable name appended after ``#``.

	    Returns:
	        The URL string, or ``None`` when ``model_field`` is not a string
	        (e.g. a missing CSV value), so the cell is left empty instead of
	        raising.
	    """
	    if not isinstance(model_field, str):
	        return None

	    if model_field.startswith(("hf.co/", "huggingface.co/")):
	        # Remove tag after colon if present (e.g., hf.co/model:tag -> hf.co/model)
	        repo_path, _, _ = model_field.partition(":")
	        base_url = f"https://{repo_path}"
	    elif "/" in model_field:
	        # Namespaced Ollama models live at ollama.com/<namespace>/<model>,
	        # not under /library/.
	        base_url = f"https://ollama.com/{model_field}"
	    else:
	        base_url = f"https://ollama.com/library/{model_field}"

	    # Add model name as URL fragment for regex extraction
	    return f"{base_url}#{model_name}"


	# Function to render interactive table
	def render_interactive_table(data, split_name):
	    """Render the leaderboard table for one dataset split.

	    Applies the sidebar selections (module-level ``model_name``,
	    ``model_size`` and ``model_quant``), converts accuracy to percent, turns
	    each model into a link and displays the rows sorted by accuracy.

	    Args:
	        data: DataFrame providing the columns ``model``, ``model_name``,
	            ``model_size``, ``model_quant``, ``accuracy``,
	            ``avg_response_time`` and ``avg_token_count``.
	        split_name: Split label used in the "no data" message.

	    Returns:
	        None. Output is written to the Streamlit page; an info or warning
	        message is shown instead of a table when there is nothing to display.
	    """
	    if data.empty:
	        st.info(f"No data available for {split_name} split yet.")
	        return

	    # Apply sidebar filters
	    filtered_df = data.copy()
	    if model_name:
	        filtered_df = filtered_df[filtered_df["model_name"].isin(model_name)]
	    if model_size:
	        filtered_df = filtered_df[filtered_df["model_size"].isin(model_size)]
	    if model_quant:
	        filtered_df = filtered_df[filtered_df["model_quant"].isin(model_quant)]

	    if filtered_df.empty:
	        st.warning("No data matches the selected filters.")
	        return

	    # Prepare display dataframe
	    display_df = filtered_df.copy()

	    # Convert accuracy to percentage (multiply by 100)
	    display_df["accuracy"] = display_df["accuracy"] * 100

	    # Create numerical size for proper sorting (hidden column)
	    display_df["size_numeric"] = display_df["model_size"].apply(model_size_sort_key)

	    # Create model URLs with embedded model names
	    display_df["Model_URL"] = display_df.apply(
	        lambda row: get_model_url(row["model"], row["model_name"]), axis=1
	    )

	    # Clean up and select needed columns
	    display_df = display_df[[
	        "Model_URL", "model_size", "size_numeric", "model_quant",
	        "accuracy", "avg_response_time", "avg_token_count",
	    ]].copy()

	    # Rename columns
	    display_df = display_df.rename(columns={
	        "Model_URL": "Model",
	        "model_size": "Size",  # Use original size format (1b, 7b, 16b)
	        "model_quant": "Quant",
	        "accuracy": "Accuracy",
	        "avg_response_time": "Avg Time",
	        "avg_token_count": "Avg Tokens",
	    })

	    # Sort by accuracy by default (descending)
	    display_df = display_df.sort_values("Accuracy", ascending=False).reset_index(drop=True)

	    # Column configuration
	    column_config = {
	        "Model": st.column_config.LinkColumn(
	            "Model",
	            help="Click to open model page",
	            width="medium",
	            # The first capture group becomes the link text: everything after
	            # the '#'. Without the quantifiers only one character is shown.
	            display_text=r".*#(.*)",
	        ),
	        "Size": st.column_config.TextColumn(
	            "Size",
	            help="Model size (parameters count)",
	            width="small",
	        ),
	        "size_numeric": None,  # Hide this column but keep it for sorting
	        "Quant": st.column_config.TextColumn(
	            "Quant",
	            help="Quantization level",
	            width="small",
	        ),
	        "Accuracy": st.column_config.NumberColumn(
	            "Accuracy (%)",
	            help="Accuracy score (higher is better)",
	            format="%.2f",
	            width="small",
	        ),
	        "Avg Time": st.column_config.NumberColumn(
	            "Avg Time (s)",
	            help="Average response time in seconds (lower is better)",
	            format="%.3f",
	            width="small",
	        ),
	        "Avg Tokens": st.column_config.NumberColumn(
	            "Avg Tokens",
	            help="Average number of tokens in response",
	            format="%.1f",
	            width="small",
	        ),
	    }

	    # Display the table (read-only: every column is disabled for editing)
	    st.data_editor(
	        display_df,
	        column_config=column_config,
	        hide_index=True,
	        use_container_width=True,
	        disabled=True,
	    )


	# Function to render averaged scores table
	def render_averaged_table():
	    """Render per-model averages over the route splits plus an accuracy chart.

	    Reads the module-level ``df`` and the sidebar selections (``model_name``,
	    ``model_size``, ``model_quant``). Rows of the ``generic`` split are
	    excluded. The averaged table is delegated to
	    ``render_interactive_table``; below it a line chart shows accuracy per
	    model variant (name + size) for the 3/5/7/9-route splits.

	    Returns:
	        None. Output is written to the Streamlit page; an info or warning
	        message replaces the table or chart when there is nothing to show.
	    """
	    if "dataset_split" not in df.columns:
	        st.info("Dataset does not contain 'dataset_split' column.")
	        return

	    # Filter out generic split for averaging
	    non_generic_df = df[df["dataset_split"] != "generic"]

	    if non_generic_df.empty:
	        st.info("No non-generic data available for averaging.")
	        return

	    # Apply sidebar filters first
	    filtered_df = non_generic_df.copy()
	    if model_name:
	        filtered_df = filtered_df[filtered_df["model_name"].isin(model_name)]
	    if model_size:
	        filtered_df = filtered_df[filtered_df["model_size"].isin(model_size)]
	    if model_quant:
	        filtered_df = filtered_df[filtered_df["model_quant"].isin(model_quant)]

	    if filtered_df.empty:
	        st.warning("No data matches the selected filters.")
	        return

	    # Calculate averages grouped by model
	    avg_df = (
	        filtered_df
	        .groupby(["model_name", "model", "model_size", "model_quant"], as_index=False)
	        .agg({
	            "accuracy": "mean",
	            "avg_response_time": "mean",
	            "avg_token_count": "mean",
	        })
	    )

	    render_interactive_table(avg_df, "Average Scores")

	    # Add accuracy chart by model and split
	    st.markdown("### 📊 Accuracy by Model and Number of Routes")
	    st.markdown("Shows accuracy performance across different number of routes")

	    # Prepare data for chart - group by model_name AND model_size for unique variations
	    chart_data = (
	        filtered_df
	        .groupby(["model_name", "model_size", "dataset_split"], as_index=False)
	        .agg({"accuracy": "mean"})
	    )

	    # Create unique model identifier combining name and size
	    chart_data["model_variant"] = chart_data["model_name"] + " (" + chart_data["model_size"] + ")"

	    # Convert accuracy to percentage and keep it within the fixed 0-100 axis
	    chart_data["accuracy"] = (chart_data["accuracy"] * 100).clip(0, 100)

	    # Splits shown on the X axis, in order of route count, with their labels
	    route_labels = {
	        "routes_3": "3",
	        "routes_5": "5",
	        "routes_7": "7",
	        "routes_9": "9",
	    }

	    pivot_data = pd.DataFrame()
	    if not chart_data.empty:
	        # One row per split, one column per model variant
	        pivot_data = chart_data.pivot(index="dataset_split", columns="model_variant", values="accuracy")
	        # Keep only the known route splits, in logical order
	        pivot_data = pivot_data.reindex([split for split in route_labels if split in pivot_data.index])
	        # Rename index to be more readable (X-axis labels)
	        pivot_data = pivot_data.rename(index=route_labels)

	    # Checked after the reindex: if none of the known route splits survived
	    # the filters there is nothing to plot, so show the message rather than
	    # an empty chart.
	    if pivot_data.empty:
	        st.info("No data available for chart display.")
	        return

	    # Back to long form for Altair (missing split/variant pairs stay as NaN rows)
	    chart_df = pivot_data.reset_index().melt(
	        id_vars="dataset_split", var_name="model_variant", value_name="accuracy"
	    )

	    lines = alt.Chart(chart_df).mark_line(point=True)
	    # Altair 5 renamed selection_multi/add_selection to selection_point/add_params
	    # and deprecated the old spellings; fall back to them on Altair 4.
	    if hasattr(alt, "selection_point"):
	        lines = lines.add_params(alt.selection_point(fields=['model_variant']))
	    else:
	        lines = lines.add_selection(alt.selection_multi(fields=['model_variant']))

	    # Create Altair line chart with fixed Y-axis
	    chart = lines.encode(
	        x=alt.X('dataset_split:O', title='Number of Routes', sort=['3', '5', '7', '9']),
	        y=alt.Y('accuracy:Q', title='Accuracy (%)', scale=alt.Scale(domain=[0, 100])),
	        color=alt.Color('model_variant:N', title='Model (Size)'),
	        tooltip=['dataset_split:O', 'model_variant:N', 'accuracy:Q'],
	    ).properties(
	        height=400,
	        title="Accuracy Performance Across Route Complexity",
	    )

	    st.altair_chart(chart, use_container_width=True)


	# Dataset splits configuration: one tab per entry, in this order.
	# "name" is the tab title, "description" the text shown at the top of the tab.
	splits_config = {
	    "average": {
	        "name": "Average Scores",
	        "description": "Average metrics for each model across all route datasets (excluding Generic)",
	    },
	    "routes_3": {
	        "name": "3 Routes",
	        "description": "Synthetic dataset with exactly 3 route options per item (simple complexity)",
	    },
	    "routes_5": {
	        "name": "5 Routes",
	        "description": "Synthetic dataset with exactly 5 route options per item (medium complexity)",
	    },
	    "routes_7": {
	        "name": "7 Routes",
	        "description": "Synthetic dataset with exactly 7 route options per item (high complexity)",
	    },
	    "routes_9": {
	        "name": "9 Routes",
	        "description": "Synthetic dataset with exactly 9 route options per item (maximum complexity)",
	    },
	    "generic": {
	        "name": "Generic",
	        "description": "Original dataset with variable number of routes per item (2-9 routes)",
	    },
	}

	# Build tab names
	tab_names = [entry["name"] for entry in splits_config.values()]
	tabs = st.tabs(tab_names)

	# Without a dataset_split column every per-split tab receives an empty frame.
	has_split_column = "dataset_split" in df.columns

	# Render each dataset split inside its own tab
	for tab, (split_key, split_config) in zip(tabs, splits_config.items()):
	    with tab:
	        st.markdown(split_config["description"])
	        st.markdown("Click on column headers to sort the table")

	        # The "average" tab aggregates across splits instead of showing one.
	        if split_key == "average":
	            render_averaged_table()
	            continue

	        if has_split_column:
	            split_data = df[df["dataset_split"] == split_key]
	        else:
	            split_data = pd.DataFrame()
	        render_interactive_table(split_data, split_config["name"])