import re

import altair as alt
import pandas as pd
import streamlit as st

# Load CSV file
DATA_FILE = "test_all.csv"
df = pd.read_csv(DATA_FILE)
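
# Columns expected below (inferred from how df is used in this file):
# model, model_name, model_size, model_quant, dataset_split,
# accuracy, avg_response_time, avg_token_count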

# Normalize column names
df.columns = df.columns.str.strip()

# Page header
st.title("🇷🇺 Russian Router Ranking (RRR)")
st.markdown("""
This leaderboard evaluates Large Language Models (LLMs) on their ability to perform **text routing and classification
tasks in Russian**. Models are assessed on their capability to return answers in a **structured output** format
(JSON), which is essential for automation and system integration in real-world applications.

The dataset used is [rrr-benchmark](https://huggingface.co/datasets/evilfreelancer/rrr-benchmark), which focuses on
practical routing tasks across various domains.

Source code and details: [GitHub Repository](https://github.com/EvilFreelancer/rrr-benchmark)
""")
st.markdown("""
<style>
.scrollable-table {
    max-height: 600px;
    overflow-y: auto;
    overflow-x: auto;
    border: 1px solid #ddd;
    margin-bottom: 25px;
}
.sortable-header {
    cursor: pointer;
    background-color: #f0f2f6 !important;
    color: #262730 !important;
    padding: 8px 12px !important;
    border: 1px solid #ddd !important;
    user-select: none;
    position: relative;
}
.sortable-header:hover {
    background-color: #e6e9f0 !important;
    color: #262730 !important;
}
.sort-indicator {
    margin-left: 5px;
    font-size: 12px;
    color: #666;
}
.tooltip-icon {
    margin-left: 5px;
    color: #666;
    cursor: help;
    font-size: 14px;
}
</style>
""", unsafe_allow_html=True)

# Utility function to numerically sort model sizes (e.g., 7b < 13b < 65b)
def model_size_sort_key(size: str):
    if not isinstance(size, str):
        return float('inf')
    match = re.match(r"(\d+(?:\.\d+)?)([mb])", size.lower())
    if not match:
        return float('inf')
    num, unit = match.groups()
    multiplier = 1e6 if unit == 'm' else 1e9
    return float(num) * multiplier
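
# Illustrative check (example values, not taken from the dataset):
#   sorted(["70b", "7b", "135m"], key=model_size_sort_key) -> ["135m", "7b", "70b"]
# Strings that don't match the pattern (and non-strings) sort last via float('inf').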

# Sidebar filters
with st.sidebar:
    st.header("Filters")

    # Model name filter (case-insensitive sort)
    model_name_options = sorted(df["model_name"].dropna().unique(), key=str.lower)
    model_name = st.multiselect("Select model:", options=model_name_options)

    # Model size filter (numerical sort)
    model_size_options = sorted(df["model_size"].dropna().unique(), key=model_size_sort_key)
    model_size = st.multiselect("Select size:", options=model_size_options)

    # Quantization level filter (default alphabetical sort)
    model_quant = st.multiselect("Select quantization:", options=sorted(df["model_quant"].dropna().unique()))

# Function to create a model URL from the model field
def get_model_url(model_field, model_name):
    # Build the base URL, then append the model name as a URL fragment so the
    # LinkColumn regex below can extract it as the display text.
    if model_field.startswith("hf.co/"):
        # Remove tag after colon if present (e.g., hf.co/model:tag -> hf.co/model)
        if ":" in model_field:
            model_field = model_field.split(":")[0]
        base_url = f"https://{model_field}"
    else:
        base_url = f"https://ollama.com/library/{model_field}"
    return f"{base_url}#{model_name}"

# Function to render the interactive leaderboard table
def render_interactive_table(data, split_name):
    if data.empty:
        st.info(f"No data available for {split_name} split yet.")
        return

    # Apply sidebar filters
    filtered_df = data.copy()
    if model_name:
        filtered_df = filtered_df[filtered_df["model_name"].isin(model_name)]
    if model_size:
        filtered_df = filtered_df[filtered_df["model_size"].isin(model_size)]
    if model_quant:
        filtered_df = filtered_df[filtered_df["model_quant"].isin(model_quant)]
    if filtered_df.empty:
        st.warning("No data matches the selected filters.")
        return

    # Prepare display dataframe
    display_df = filtered_df.copy()

    # Convert accuracy to percentage
    display_df["accuracy"] = display_df["accuracy"] * 100

    # Numerical size for proper sorting (kept as a hidden column)
    display_df["size_numeric"] = display_df["model_size"].apply(model_size_sort_key)

    # Model URLs with embedded model names
    display_df["Model_URL"] = display_df.apply(lambda row: get_model_url(row["model"], row["model_name"]), axis=1)

    # Select the needed columns
    display_df = display_df[[
        "Model_URL", "model_size", "size_numeric", "model_quant",
        "accuracy", "avg_response_time", "avg_token_count"
    ]].copy()

    # Rename columns
    display_df = display_df.rename(columns={
        "Model_URL": "Model",
        "model_size": "Size",  # Keep the original size format (1b, 7b, 16b)
        "model_quant": "Quant",
        "accuracy": "Accuracy",
        "avg_response_time": "Avg Time",
        "avg_token_count": "Avg Tokens"
    })

    # Sort by accuracy by default (descending)
    display_df = display_df.sort_values("Accuracy", ascending=False).reset_index(drop=True)

    # Column configuration
    column_config = {
        "Model": st.column_config.LinkColumn(
            "Model",
            help="Click to open model page",
            width="medium",
            display_text=r".*#(.*)"  # Extract model name after the # symbol
        ),
        "Size": st.column_config.TextColumn(
            "Size",
            help="Model size (parameter count)",
            width="small"
        ),
        "size_numeric": None,  # Hide this column but keep it for sorting
        "Quant": st.column_config.TextColumn(
            "Quant",
            help="Quantization level",
            width="small"
        ),
        "Accuracy": st.column_config.NumberColumn(
            "Accuracy (%)",
            help="Accuracy score (higher is better)",
            format="%.2f",
            width="small"
        ),
        "Avg Time": st.column_config.NumberColumn(
            "Avg Time (s)",
            help="Average response time in seconds (lower is better)",
            format="%.3f",
            width="small"
        ),
        "Avg Tokens": st.column_config.NumberColumn(
            "Avg Tokens",
            help="Average number of tokens in the response",
            format="%.1f",
            width="small"
        )
    }

    # Display the table (a disabled data_editor gives read-only, sortable columns)
    st.data_editor(
        display_df,
        column_config=column_config,
        hide_index=True,
        use_container_width=True,
        disabled=True
    )

# Function to render the averaged scores table and accuracy chart
def render_averaged_table():
    if "dataset_split" not in df.columns:
        st.info("Dataset does not contain 'dataset_split' column.")
        return

    # Exclude the generic split from averaging
    non_generic_df = df[df["dataset_split"] != "generic"]
    if non_generic_df.empty:
        st.info("No non-generic data available for averaging.")
        return

    # Apply sidebar filters first
    filtered_df = non_generic_df.copy()
    if model_name:
        filtered_df = filtered_df[filtered_df["model_name"].isin(model_name)]
    if model_size:
        filtered_df = filtered_df[filtered_df["model_size"].isin(model_size)]
    if model_quant:
        filtered_df = filtered_df[filtered_df["model_quant"].isin(model_quant)]
    if filtered_df.empty:
        st.warning("No data matches the selected filters.")
        return

    # Calculate averages grouped by model
    avg_df = (
        filtered_df
        .groupby(["model_name", "model", "model_size", "model_quant"], as_index=False)
        .agg({
            "accuracy": "mean",
            "avg_response_time": "mean",
            "avg_token_count": "mean"
        })
    )
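
    # avg_df now holds one row per (model_name, model, model_size, model_quant)
    # combination, with metrics averaged over the non-generic splits
    # (routes_3 through routes_9).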
    render_interactive_table(avg_df, "Average Scores")

    # Accuracy chart by model and number of routes
    st.markdown("### 📊 Accuracy by Model and Number of Routes")
    st.markdown("*Shows accuracy across different numbers of routes*")
    # Prepare data for the chart: group by model_name AND model_size so each variant stays distinct
    chart_data = (
        filtered_df
        .groupby(["model_name", "model_size", "dataset_split"], as_index=False)
        .agg({"accuracy": "mean"})
    )

    # Unique model identifier combining name and size
    chart_data["model_variant"] = chart_data["model_name"] + " (" + chart_data["model_size"] + ")"

    # Convert accuracy to percentage and clamp to the 0-100 range
    chart_data["accuracy"] = (chart_data["accuracy"] * 100).clip(0, 100)

    if not chart_data.empty:
        # Pivot for the chart, using model_variant as columns
        pivot_data = chart_data.pivot(index="dataset_split", columns="model_variant", values="accuracy")

        # Reorder index to show the logical progression of route complexity
        route_order = ["routes_3", "routes_5", "routes_7", "routes_9"]
        pivot_data = pivot_data.reindex([split for split in route_order if split in pivot_data.index])

        # Rename index values to readable X-axis labels
        index_rename = {
            "routes_3": "3",
            "routes_5": "5",
            "routes_7": "7",
            "routes_9": "9"
        }
        pivot_data = pivot_data.rename(index=index_rename)

        # Reshape back to long format for Altair
        chart_df = pivot_data.reset_index().melt(
            id_vars="dataset_split", var_name="model_variant", value_name="accuracy"
        )
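
        # chart_df is now long-format, roughly (hypothetical values):
        #   dataset_split  model_variant  accuracy
        #   "3"            "llama3 (8b)"  72.5
        #   "5"            "llama3 (8b)"  64.0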
        # Altair line chart with a fixed 0-100% Y-axis
        # (add_params/selection_point is the Altair 5 API; older Altair
        # used add_selection/selection_multi)
        chart = alt.Chart(chart_df).mark_line(point=True).add_params(
            alt.selection_point(fields=['model_variant'])
        ).encode(
            x=alt.X('dataset_split:O', title='Number of Routes', sort=['3', '5', '7', '9']),
            y=alt.Y('accuracy:Q', title='Accuracy (%)', scale=alt.Scale(domain=[0, 100])),
            color=alt.Color('model_variant:N', title='Model (Size)'),
            tooltip=['dataset_split:O', 'model_variant:N', 'accuracy:Q']
        ).properties(
            height=400,
            title="Accuracy Performance Across Route Complexity"
        )
        st.altair_chart(chart, use_container_width=True)
    else:
        st.info("No data available for chart display.")

# Dataset splits configuration
splits_config = {
    "average": {
        "name": "Average Scores",
        "description": "Average metrics for each model across all route datasets (excluding Generic)"
    },
    "routes_3": {
        "name": "3 Routes",
        "description": "Synthetic dataset with exactly 3 route options per item (simple complexity)"
    },
    "routes_5": {
        "name": "5 Routes",
        "description": "Synthetic dataset with exactly 5 route options per item (medium complexity)"
    },
    "routes_7": {
        "name": "7 Routes",
        "description": "Synthetic dataset with exactly 7 route options per item (high complexity)"
    },
    "routes_9": {
        "name": "9 Routes",
        "description": "Synthetic dataset with exactly 9 route options per item (maximum complexity)"
    },
    "generic": {
        "name": "Generic",
        "description": "Original dataset with a variable number of routes per item (2-9 routes)"
    }
}

# Build tabs, one per split
tab_names = [config["name"] for config in splits_config.values()]
tabs = st.tabs(tab_names)

# Render each dataset split
for i, (split_key, split_config) in enumerate(splits_config.items()):
    with tabs[i]:
        st.markdown(f"**{split_config['description']}**")
        st.markdown("*Click on column headers to sort the table*")
        if split_key == "average":
            render_averaged_table()
        else:
            if "dataset_split" in df.columns:
                split_data = df[df["dataset_split"] == split_key]
            else:
                split_data = pd.DataFrame()
            render_interactive_table(split_data, split_config["name"])