CodeReviewBench / app.py
Alex
merge_resolve
9404fa8
"""
CodeReview Leaderboard - Inspired by CodeReviewBench
A comprehensive leaderboard for code review generation models
"""
import os
import json
import tempfile
import logging
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from apscheduler.schedulers.background import BackgroundScheduler
import numpy as np
from gradio.themes.utils import fonts, colors
from dataclasses import fields, dataclass
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
CODEREVIEW_COLUMN,
DISPLAY_COLS,
METRIC_COLS,
HIDDEN_COLS,
NEVER_HIDDEN_COLS,
CATEGORIES,
COMMENT_LANGUAGES,
EXAMPLE_CATEGORIES,
TOPICS,
ModelType,
Mode,
Precision,
WeightType,
ReviewModelType,
get_all_column_choices,
get_default_visible_columns,
)
from src.display.formatting import styled_message, styled_error, styled_warning
from src.envs import (
ADMIN_USERNAME,
ADMIN_PASSWORD,
RESULTS_DATASET_ID,
SUBMITTER_TOKEN,
TOKEN,
DATA_PATH,
)
from src.populate import get_leaderboard_df, get_category_leaderboard_df
from src.submission.submit import process_submission
# Configure logging
# Root logger: INFO and above, each record prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# Ensure data directory exists
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(DATA_PATH, exist_ok=True)
# Available benchmark versions
BENCHMARK_VERSIONS = ["v0"]
CURRENT_VERSION = "v0"
# Initialize leaderboard data
# Any failure while loading (or while measuring the result) is logged and
# replaced by an empty frame, so the rest of the module can still run.
try:
    logger.info("Initializing leaderboard data...")
    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
    logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
except Exception as e:
    logger.error(f"Error loading leaderboard data: {e}")
    LEADERBOARD_DF = pd.DataFrame()
# Dark theme for the whole app. Every light-mode variable is paired with an
# identical "*_dark" value so the UI looks the same in either browser mode.
custom_theme = gr.themes.Default(
    primary_hue=colors.slate,
    secondary_hue=colors.slate,
    neutral_hue=colors.neutral,
    font=(fonts.GoogleFont("Inter"), "sans-serif"),
).set(
    # --- Page background, body text, links ---
    body_background_fill="#0f0f10",
    body_background_fill_dark="#0f0f10",
    background_fill_primary="#0f0f10",
    background_fill_primary_dark="#0f0f10",
    background_fill_secondary="#333333",
    background_fill_secondary_dark="#333333",
    body_text_color="#f4f4f5",
    body_text_color_dark="#f4f4f5",
    body_text_color_subdued="#a1a1aa",
    body_text_color_subdued_dark="#a1a1aa",
    link_text_color="#ffffff",
    # --- Accent colours and generic borders ---
    color_accent="#f4f4f5",
    color_accent_soft="#a1a1aa",
    color_accent_soft_dark="#a1a1aa",
    border_color_primary="#333333",
    border_color_primary_dark="#333333",
    border_color_accent="#333333",
    border_color_accent_dark="#333333",
    border_color_accent_subdued="#424242",
    border_color_accent_subdued_dark="#424242",
    # --- Blocks, accordions, panels ---
    block_background_fill="#1e1e1e",
    block_background_fill_dark="#1e1e1e",
    block_border_color="#333333",
    block_border_color_dark="#333333",
    block_shadow="none",
    block_title_text_color="#f4f4f5",
    block_title_text_color_dark="#f4f4f5",
    accordion_text_color="#f4f4f5",
    accordion_text_color_dark="#f4f4f5",
    panel_background_fill="#1e1e1e",
    panel_background_fill_dark="#1e1e1e",
    panel_border_color="#333333",
    panel_border_color_dark="#333333",
    # --- Buttons: dark primary, light secondary ---
    button_primary_background_fill="#121212",
    button_primary_background_fill_hover="#424242",
    button_primary_text_color="#f4f4f5",
    button_primary_border_color="#333333",
    button_secondary_background_fill="#f4f4f5",
    button_secondary_text_color="#0f0f10",
    button_secondary_border_color="#f4f4f5",
    # --- Inputs, including hover and focus states ---
    input_background_fill="#1e1e1e",
    input_background_fill_dark="#1e1e1e",
    input_background_fill_focus="#424242",
    input_background_fill_focus_dark="#424242",
    input_background_fill_hover="#2d2d2d",
    input_background_fill_hover_dark="#2d2d2d",
    input_border_color="#333333",
    input_border_color_dark="#333333",
    input_border_color_focus="#f4f4f5",
    input_border_color_focus_dark="#f4f4f5",
    input_border_color_hover="#424242",
    input_border_color_hover_dark="#424242",
    input_placeholder_color="#71717a",
    input_placeholder_color_dark="#71717a",
    # --- Tables: even rows are slightly lighter than odd rows ---
    table_border_color="#333333",
    table_even_background_fill="#2d2d2d",
    table_even_background_fill_dark="#2d2d2d",
    table_odd_background_fill="#1e1e1e",
    table_odd_background_fill_dark="#1e1e1e",
    table_text_color="#f4f4f5",
    table_text_color_dark="#f4f4f5",
)
@dataclass
class ColumnInfo:
    """Information about a column in the leaderboard.

    Within this file it is constructed as ``ColumnInfo(col, col)`` to serve as
    a fallback when a column has no entry in the column-info map; only
    ``display_name`` is read from that fallback.
    """

    # Column name as it appears in the dataframe.
    name: str
    # Header text used when the column is shown.
    display_name: str
    # Column type label; presumably one of the keys that init_leaderboard maps
    # to Gradio datatypes ("text", "number", ...) — TODO confirm.
    type: str = "text"
    # The three flags below are not read anywhere in this file.
    # NOTE(review): presumably they mirror the visibility flags of the
    # project's column definitions — confirm against src.display.utils.
    hidden: bool = False
    never_hidden: bool = False
    displayed_by_default: bool = True
def update_column_choices(df):
    """Return the column choices that are actually present in ``df``.

    Falls back to the complete choice list when ``df`` is ``None`` or empty,
    or when none of the known choices matches a column of ``df``.
    """
    if df is None or df.empty:
        return get_all_column_choices()

    present = list(df.columns)

    # Keep the (column name, display name) pairs whose column exists.
    matching = []
    for col_name, display_name in get_all_column_choices():
        if col_name in present:
            matching.append((col_name, display_name))

    if matching:
        return matching
    # Nothing matched: offer every choice rather than an empty dropdown.
    return get_all_column_choices()
# Initial value for the column selector dropdown
def get_initial_columns():
    """Pick the columns that are preselected in the column dropdown.

    Preference order: the default visible columns that exist in
    ``LEADERBOARD_DF``; if none of them exist, every column of the frame; if
    the frame has no columns (or anything fails), the plain defaults.
    """
    try:
        frame_columns = list(LEADERBOARD_DF.columns)
        logger.info(f"Available columns in LEADERBOARD_DF: {frame_columns}")

        # Nothing loaded yet: the defaults are the best guess available.
        if len(frame_columns) == 0:
            return get_default_visible_columns()

        present_defaults = []
        for col in get_default_visible_columns():
            if col in frame_columns:
                present_defaults.append(col)

        return present_defaults if present_defaults else frame_columns
    except Exception as e:
        logger.error(f"Error getting initial columns: {e}")
        return get_default_visible_columns()
def init_leaderboard(dataframe, visible_columns=None):
    """
    Build a read-only Gradio Dataframe component for a leaderboard table.

    Args:
        dataframe: Leaderboard data. ``None`` or an empty frame is replaced by
            an empty frame whose columns are the names of ``DISPLAY_COLS``.
            The caller's frame is never modified.
        visible_columns: Optional list of column names to show. When omitted,
            every display column that is not listed in ``HIDDEN_COLS`` is
            shown. The caller's list is never modified.

    Returns:
        gr.Dataframe: Non-interactive table whose value is a pandas Styler.
        ``model_name`` (when present) is the first column, float columns are
        rendered with exactly three decimals, headers use the columns'
        display names, and ``datatype`` is listed in the same order as the
        displayed columns.
    """
    if dataframe is None or dataframe.empty:
        # Create an empty dataframe with the right columns
        columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
        dataframe = pd.DataFrame(columns=columns)
        logger.warning("Initializing empty leaderboard")
    else:
        # Work on a private copy so the edits below never reach the caller.
        dataframe = dataframe.copy()

    # Normalise text columns for display.
    if "model_name" in dataframe.columns:
        dataframe["model_name"] = dataframe["model_name"].str.lower()
    if "model_type" in dataframe.columns:
        dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")

    display_infos = [getattr(CODEREVIEW_COLUMN, col) for col in DISPLAY_COLS]
    display_column_names = [info.name for info in display_infos]
    hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
    # Columns that should always be shown
    always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]

    if visible_columns is None:
        visible_columns = [
            col for col in display_column_names if col not in hidden_column_names
        ]
    else:
        # Copy: the never-hidden columns are appended below.
        visible_columns = list(visible_columns)

    for col in always_visible:
        if col not in visible_columns and col in dataframe.columns:
            visible_columns.append(col)

    # Keep only columns that exist, and drop repeats (a repeated label would
    # make dataframe[col] return a frame instead of a series further down).
    visible_columns = list(
        dict.fromkeys(col for col in visible_columns if col in dataframe.columns)
    )

    # The model name leads the table whenever it is shown at all.
    if "model_name" in visible_columns:
        visible_columns.remove("model_name")
        visible_columns.insert(0, "model_name")

    # Map leaderboard column types to Gradio's expected datatype strings
    # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
    type_mapping = {
        "text": "str",
        "number": "number",
        "bool": "bool",
        "date": "date",
        "markdown": "markdown",
        "html": "html",
        "image": "image",
    }
    # First declaration wins if two display columns share a name.
    declared_types = {}
    for info in display_infos:
        declared_types.setdefault(info.name, info.type)
    # Built after the reordering above so each entry lines up with its column;
    # unknown columns and unmapped types fall back to 'str'.
    datatypes = [
        type_mapping.get(declared_types.get(col), "str") for col in visible_columns
    ]

    display_df = dataframe[visible_columns].copy()

    # Format floats to exactly 3 decimal places (keeping trailing zeros);
    # integer columns such as counts are left untouched.
    numeric_cols = display_df.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        if not pd.api.types.is_integer_dtype(display_df[col]):
            display_df[col] = display_df[col].apply(
                lambda x: f"{x:.3f}" if pd.notna(x) else None
            )

    # Rename internal column names to their display names.
    column_info_map = {
        f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
    }
    column_mapping = {
        col: column_info_map.get(col, ColumnInfo(col, col)).display_name
        for col in visible_columns
    }
    display_df.rename(columns=column_mapping, inplace=True)

    # Right-align everything; give the model column a fixed width when present.
    styler = display_df.style.set_properties(**{"text-align": "right"})
    if "Model" in display_df.columns:
        styler = styler.set_properties(subset=["Model"], **{"width": "200px"})

    return gr.Dataframe(
        value=styler,
        datatype=datatypes,
        interactive=False,
        wrap=True,
        height=2500,
        elem_id="leaderboard-table",
        row_count=len(display_df),
    )
def search_filter_leaderboard(
    df, search_query="", comment_languages=None, version=CURRENT_VERSION
):
    """
    Filter the leaderboard based on search query and comment languages.

    Args:
        df: Leaderboard frame. ``None`` or an empty frame is returned as is.
        search_query: Free text; several terms may be separated with ``;``.
            A row is kept when its concatenated cell text contains ANY term,
            compared case-insensitively and literally (not as a regex).
        comment_languages: Optional collection of language codes. Applied to
            the first column whose name contains ``comment_language``; ignored
            when no such column exists.
        version: Accepted for signature compatibility; not used here.

    Returns:
        A filtered copy of ``df`` without the helper ``search_dummy`` column.
    """
    if df is None or df.empty:
        return df

    filtered_df = df.copy()

    # One string per row holding every non-null cell, used as the haystack.
    if "search_dummy" not in filtered_df.columns:
        filtered_df["search_dummy"] = filtered_df.apply(
            lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
            axis=1,
        )

    # Apply comment language filter
    if comment_languages:
        comment_lang_cols = [
            col for col in filtered_df.columns if "comment_language" in str(col).lower()
        ]
        if comment_lang_cols:
            filtered_df = filtered_df[
                filtered_df[comment_lang_cols[0]].isin(comment_languages)
            ]

    # Apply search query
    if search_query:
        search_terms = [
            term.strip() for term in search_query.split(";") if term.strip()
        ]
        if search_terms:
            haystack = filtered_df["search_dummy"]
            # regex=False: the text comes straight from the search box, so
            # characters such as "+", "(" or "[" must match themselves
            # instead of being compiled as a pattern.
            masks = [
                haystack.str.contains(term, case=False, na=False, regex=False)
                for term in search_terms
            ]
            combined_mask = masks[0]
            for mask in masks[1:]:
                combined_mask = combined_mask | mask
            filtered_df = filtered_df[combined_mask]

    # Drop the search dummy column before returning
    return filtered_df.drop(columns="search_dummy")
def _to_internal_column_name(label):
    """Convert a column label from the selector into an internal column name."""
    import re

    name = str(label).lower().replace(" ", "_").replace("(", "").replace(")", "")
    # Append "_binary" to recall/precision only when it is not already there,
    # so a name that is already internal passes through unchanged.
    return re.sub(r"_(recall|precision)(?!_binary)", r"_\1_binary", name)


def refresh_data_with_filters(
    version=CURRENT_VERSION, search_query="", comment_languages=None, selected_columns=None
):
    """
    Refresh the leaderboard data and update all components with filtering.
    Ensures we handle cases where dataframes might have limited columns.

    Args:
        version: Benchmark version to load.
        search_query: Search text forwarded to ``search_filter_leaderboard``.
        comment_languages: Language filter forwarded to
            ``search_filter_leaderboard``.
        selected_columns: Column labels chosen in the column dropdown.
            ``None`` or an empty list selects the default visible columns.

    Returns:
        A tuple with the main leaderboard component followed by one component
        per entry of ``CATEGORIES``. On any error the currently displayed
        components are returned instead.

    Side effects:
        Replaces the module-level ``LEADERBOARD_DF`` with the reloaded frame.
    """
    global LEADERBOARD_DF
    try:
        logger.info("Performing refresh of leaderboard data with filters...")
        # Get new data
        main_df = get_leaderboard_df(version=version)
        LEADERBOARD_DF = main_df
        category_dfs = [
            get_category_leaderboard_df(category, version=version)
            for category in CATEGORIES
        ]

        # Convert the selection exactly once; None means "nothing selected".
        internal_selected_columns = [
            _to_internal_column_name(label) for label in (selected_columns or [])
        ]

        # Log the actual columns we have
        logger.info(f"Main dataframe columns: {list(main_df.columns)}")

        # Apply filters to each dataframe
        filtered_main_df = search_filter_leaderboard(
            main_df, search_query, comment_languages, version
        )
        filtered_category_dfs = [
            search_filter_leaderboard(df, search_query, comment_languages, version)
            for df in category_dfs
        ]

        available_columns = list(filtered_main_df.columns)
        default_columns = get_default_visible_columns()

        if internal_selected_columns:
            valid_selected_columns = [
                col for col in internal_selected_columns if col in available_columns
            ]
            if not valid_selected_columns and "model_name" in available_columns:
                # Fallback if conversion/filtering leads to empty selection;
                # model_name is listed once even if the defaults contain it.
                valid_selected_columns = ["model_name"] + [
                    col
                    for col in default_columns
                    if col in available_columns and col != "model_name"
                ]
        else:
            # Nothing selected in the dropdown: defaults that exist in the data.
            valid_selected_columns = [
                col for col in default_columns if col in available_columns
            ]

        # Each table receives its own list so no table's processing can leak
        # into the selection used for the next one.
        main_dataframe = init_leaderboard(
            filtered_main_df, list(valid_selected_columns)
        )

        # For category dataframes, keep the columns that exist in each one
        category_dataframes = []
        for df in filtered_category_dfs:
            df_columns = list(df.columns)
            df_valid_columns = [
                col for col in valid_selected_columns if col in df_columns
            ]
            if not df_valid_columns and "model_name" in df_columns:
                df_valid_columns = ["model_name"] + [
                    col for col in default_columns if col != "model_name"
                ]
            category_dataframes.append(init_leaderboard(df, df_valid_columns))

        return main_dataframe, *category_dataframes
    except Exception as e:
        logger.error(f"Error in refresh with filters: {e}")
        # Return the current leaderboards on error
        return leaderboard, *[
            tab.children[0] for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
        ]
def submit_results(
    model_name: str,
    base_model: str,
    revision: str,
    precision: str,
    weight_type: str,
    model_type: str,
    mode: str,
    submission_file: tempfile._TemporaryFileWrapper,
    version: str,
    review_model_type: ReviewModelType,
    programming_language: str,
    comment_language: str,
):
    """
    Validate an uploaded results file, process it, and reload the leaderboard.

    Returns the styled error for the first failed validation rule, otherwise
    whatever ``process_submission`` returns. After processing, the
    module-level ``LEADERBOARD_DF`` is reloaded for ``version``; a failure of
    that reload is only logged and does not change the returned result.
    """
    global LEADERBOARD_DF

    # Rules are checked top to bottom; the first failing one decides the message.
    validation_rules = (
        (submission_file is None, "No submission file provided"),
        (not model_name, "Model name is required"),
        (not model_type, "Please select a model type"),
        (not mode, "Please select an inference mode"),
    )
    for failed, message in validation_rules:
        if failed:
            return styled_error(message)

    file_path = submission_file.name
    logger.info(f"Received submission for model {model_name}: {file_path}")

    # Metadata stored alongside the submission; an empty revision means "main".
    metadata = dict(
        model_name=model_name,
        base_model=base_model,
        revision=revision or "main",
        precision=precision,
        weight_type=weight_type,
        model_type=model_type,
        mode=mode,
        version=version,
        review_model_type=review_model_type,
        programming_language=programming_language,
        comment_language=comment_language,
    )

    result = process_submission(file_path, metadata, version=version)

    # Best-effort reload so the in-memory leaderboard includes the new entry.
    try:
        logger.info(
            f"Refreshing leaderboard data after submission for version {version}..."
        )
        LEADERBOARD_DF = get_leaderboard_df(version=version)
        logger.info("Refreshed leaderboard data after submission")
    except Exception as e:
        logger.error(f"Error refreshing leaderboard data: {e}")

    return result
def refresh_data(version=CURRENT_VERSION):
    """
    Reload the main leaderboard frame and one frame per category.

    Returns a tuple ``(main_df, *category_dfs)`` of raw dataframes. If loading
    fails, the error is logged and a tuple of ``None`` values of the same
    length is returned.
    """
    try:
        logger.info("Performing scheduled refresh of leaderboard data...")
        # Main frame first, then the categories in CATEGORIES order.
        frames = [get_leaderboard_df(version=version)]
        for category in CATEGORIES:
            frames.append(get_category_leaderboard_df(category, version=version))
        return tuple(frames)
    except Exception as e:
        logger.error(f"Error in scheduled refresh: {e}")
        return (None, *(None for _ in CATEGORIES))
def update_leaderboards(version):
    """
    Load the leaderboard frames for the selected benchmark version.

    Returns ``(main_df, *category_dfs)``; on failure the error is logged and
    every position holds ``None`` instead.
    """
    try:
        overall = get_leaderboard_df(version=version)
        per_category = tuple(
            get_category_leaderboard_df(category, version=version)
            for category in CATEGORIES
        )
        return (overall,) + per_category
    except Exception as e:
        logger.error(f"Error updating leaderboards for version {version}: {e}")
        placeholders = [None for _ in CATEGORIES]
        return (None, *placeholders)
def create_performance_plot(
    selected_models, category, metric="f1_binary", version=CURRENT_VERSION
):
    """
    Create a radar plot comparing model performance for selected models.

    Args:
        selected_models: Model names to plot; matched case-insensitively.
            ``None`` is treated as an empty selection.
        category: ``"All Results"`` for the main leaderboard, otherwise a
            category name passed to ``get_category_leaderboard_df``.
        metric: Substring that selects the metric columns; each axis label is
            the column name with the ``_<metric>`` suffix removed.
        version: Benchmark version to load.

    Returns:
        plotly.graph_objects.Figure: One closed polygon per selected model
        found in the data. When the data is missing or empty a bare figure is
        returned; when no column matches ``metric`` the styled figure is
        returned without traces.
    """
    if category == "All Results":
        df = get_leaderboard_df(version=version)
    else:
        df = get_category_leaderboard_df(category, version=version)
    if df is None or df.empty:
        return go.Figure()

    # Compare model names case-insensitively.
    df = df.copy()
    df["model_name"] = df["model_name"].str.lower()
    selected_models = [m.lower() for m in (selected_models or [])]
    df = df[df["model_name"].isin(selected_models)]

    metric_cols = [col for col in df.columns if metric in col]
    fig = go.Figure()
    # Named "palette" so the module-level "colors" import is not shadowed.
    palette = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]

    # Without metric columns there is nothing to draw, and closing the polygon
    # below would index into an empty list.
    if metric_cols:
        # Axis labels are identical for every model, so build them once.
        axis_labels = [
            col.replace(f"_{metric}", "").replace("jailbreaked", "jailbroken")
            for col in metric_cols
        ]
        axis_labels.append(axis_labels[0])

        for idx, model in enumerate(selected_models):
            model_data = df[df["model_name"] == model]
            if model_data.empty:
                continue
            # First matching row; repeat the first value to close the polygon.
            values = model_data[metric_cols].values[0].tolist()
            values.append(values[0])
            fig.add_trace(
                go.Scatterpolar(
                    r=values,
                    theta=axis_labels,
                    name=model,
                    line_color=palette[idx % len(palette)],
                    fill="toself",
                )
            )

    fig.update_layout(
        paper_bgcolor="#000000",
        plot_bgcolor="#000000",
        font={"color": "#ffffff"},
        title={
            "text": f"{category} - {metric.upper()} Score Comparison",
            "font": {"color": "#ffffff", "size": 24},
        },
        polar=dict(
            bgcolor="#000000",
            radialaxis=dict(
                visible=True,
                range=[0, 1],
                gridcolor="#333333",
                linecolor="#333333",
                tickfont={"color": "#ffffff"},
            ),
            angularaxis=dict(
                gridcolor="#333333",
                linecolor="#333333",
                tickfont={"color": "#ffffff"},
            ),
        ),
        height=600,
        showlegend=True,
        legend=dict(
            yanchor="top",
            y=0.99,
            xanchor="right",
            x=0.99,
            bgcolor="rgba(0,0,0,0.5)",
            font={"color": "#ffffff"},
        ),
    )
    return fig
def update_model_choices(version):
    """
    Return the sorted, lower-cased, de-duplicated model names for ``version``.

    An empty leaderboard yields an empty list.
    """
    leaderboard_df = get_leaderboard_df(version=version)
    if leaderboard_df.empty:
        return []

    lowered = leaderboard_df["model_name"].str.lower()
    names = lowered.unique().tolist()
    names.sort()
    return names
def update_visualization(selected_models, selected_category, selected_metric, version):
    """
    Build the comparison plot for the current selections.

    With no model selected an empty figure is returned; otherwise the work is
    delegated to ``create_performance_plot``.
    """
    if selected_models:
        return create_performance_plot(
            selected_models, selected_category, selected_metric, version
        )
    return go.Figure()
# Create Gradio app
# Root Blocks container; the UI is attached further down inside `with demo:`
# and the app is started by demo.launch() at the bottom of the file.
demo = gr.Blocks(css=custom_css, theme=custom_theme)
# Tab label shown for each category key (currently identical to the key).
CATEGORY_DISPLAY_MAP = {
    language: language for language in ("Python", "Java", "Scala", "Go")
}
# Inverse lookup: tab label -> category key.
CATEGORY_REVERSE_MAP = dict(
    (label, key) for key, label in CATEGORY_DISPLAY_MAP.items()
)
# Build the UI: a "Leaderboard" tab (filters, per-category tables, event
# wiring) and a "Submit" tab (metadata form plus file upload).
with demo:
    gr.HTML(TITLE)
    # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
    with gr.Row():
        tabs = gr.Tabs(elem_classes="tab-buttons")
    with tabs:
        with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
            with gr.Row():
                # Created hidden (visible=False); still used as an event input.
                version_selector = gr.Dropdown(
                    choices=BENCHMARK_VERSIONS,
                    label="Benchmark Version",
                    value=CURRENT_VERSION,
                    interactive=True,
                    elem_classes="version-selector",
                    scale=1,
                    visible=False,
                )
            with gr.Row():
                search_input = gr.Textbox(
                    placeholder="Search by models (use ; to split)",
                    label="Search",
                    elem_id="search-bar",
                    scale=2,
                )
                comment_language_filter = gr.Dropdown(
                    choices=["en", "ru"],
                    label="Comment Language",
                    multiselect=True,
                    value=[],
                    interactive=True,
                    scale=1,
                )
                # NOTE(review): this dropdown is not passed to any event
                # handler in this block — confirm whether it should filter.
                programming_language_filter = gr.Dropdown(
                    choices=["Python", "Java", "Scala", "Go"],
                    label="Programming Language",
                    multiselect=True,
                    value=[],
                    interactive=True,
                    scale=1,
                )
            with gr.Row():
                # NOTE(review): also not passed to any event handler in this
                # block — confirm whether it should filter.
                topic_filter = gr.Dropdown(
                    choices=TOPICS,
                    label="Topic",
                    multiselect=True,
                    value=[],
                    interactive=True,
                    scale=2,
                )
                # Created hidden; its value is still sent with every event.
                column_selector = gr.Dropdown(
                    choices=get_all_column_choices(),
                    label="Columns",
                    multiselect=True,
                    value=get_initial_columns(),
                    interactive=True,
                    visible=False,
                    scale=1,
                )
            with gr.Row():
                refresh_button = gr.Button(
                    "Refresh", scale=0, elem_id="refresh-button"
                )
            # Create tabs for each category
            with gr.Tabs(elem_classes="category-tabs") as category_tabs:
                # First tab for average metrics across all categories
                with gr.TabItem("All Results", elem_id="overall-tab"):
                    leaderboard = init_leaderboard(LEADERBOARD_DF)
                # Create a tab for each category using display names
                for category in CATEGORIES:
                    display_name = CATEGORY_DISPLAY_MAP.get(category, category)
                    elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
                    with gr.TabItem(display_name, elem_id=elem_id):
                        category_df = get_category_leaderboard_df(
                            category, version=CURRENT_VERSION
                        )
                        # The name is rebound on every iteration; the event
                        # outputs below reach each table through
                        # category_tabs.children[i].children[0] instead.
                        category_leaderboard = init_leaderboard(category_df)
            # Connect search and filter inputs to update function
            # NOTE(review): this wrapper is not referenced by any event
            # registration in this block.
            def update_with_search_filters(
                version=CURRENT_VERSION,
                search_query="",
                comment_languages=None,
                selected_columns=None,
            ):
                """
                Update the leaderboards with search and filter settings.
                """
                return refresh_data_with_filters(
                    version, search_query, comment_languages, selected_columns
                )
            # Refresh button functionality
            def refresh_and_update(
                version, search_query, comment_languages, selected_columns
            ):
                """
                Refresh data, update LEADERBOARD_DF, and return updated components.
                """
                global LEADERBOARD_DF
                main_df = get_leaderboard_df(version=version)
                LEADERBOARD_DF = main_df  # Update the global DataFrame
                return refresh_data_with_filters(
                    version, search_query, comment_languages, selected_columns
                )
            # Outputs for every handler below: the "All Results" table, then
            # the first child of tabs 1..len(CATEGORIES) of category_tabs
            # (tab 0 is "All Results", so index i is the i-th category).
            refresh_button.click(
                fn=refresh_and_update,
                inputs=[
                    version_selector,
                    search_input,
                    comment_language_filter,
                    column_selector,
                ],
                outputs=[leaderboard]
                + [
                    category_tabs.children[i].children[0]
                    for i in range(1, len(CATEGORIES) + 1)
                ],
            )
            # Search input functionality
            search_input.change(
                fn=refresh_data_with_filters,
                inputs=[
                    version_selector,
                    search_input,
                    comment_language_filter,
                    column_selector,
                ],
                outputs=[leaderboard]
                + [
                    category_tabs.children[i].children[0]
                    for i in range(1, len(CATEGORIES) + 1)
                ],
            )
            # Comment language filter functionality
            comment_language_filter.change(
                fn=refresh_data_with_filters,
                inputs=[
                    version_selector,
                    search_input,
                    comment_language_filter,
                    column_selector,
                ],
                outputs=[leaderboard]
                + [
                    category_tabs.children[i].children[0]
                    for i in range(1, len(CATEGORIES) + 1)
                ],
            )
            # Version selector functionality
            # NOTE(review): version_selector.change is registered a second
            # time at the end of this block — confirm both are intended.
            version_selector.change(
                fn=refresh_data_with_filters,
                inputs=[
                    version_selector,
                    search_input,
                    comment_language_filter,
                    column_selector,
                ],
                outputs=[leaderboard]
                + [
                    category_tabs.children[i].children[0]
                    for i in range(1, len(CATEGORIES) + 1)
                ],
            )
            # Update the update_columns function to handle updating all tabs at once
            def update_columns(selected_columns):
                """
                Update all leaderboards to show the selected columns.
                Ensures all selected columns are preserved in the update.
                """
                try:
                    logger.info(f"Updating columns to show: {selected_columns}")
                    # If no columns are selected, use default visible columns
                    if not selected_columns or len(selected_columns) == 0:
                        selected_columns = get_default_visible_columns()
                        logger.info(
                            f"No columns selected, using defaults: {selected_columns}"
                        )
                    # Convert display names to internal names
                    internal_selected_columns = [
                        x.lower()
                        .replace(" ", "_")
                        .replace("(", "")
                        .replace(")", "")
                        .replace("_recall", "_recall_binary")
                        .replace("_precision", "_precision_binary")
                        for x in selected_columns
                    ]
                    # Get the current data with ALL columns preserved
                    # NOTE(review): the version is read from the component
                    # object's .value rather than received as an event input —
                    # confirm it reflects the user's current selection.
                    main_df = get_leaderboard_df(version=version_selector.value)
                    # Get category dataframes with ALL columns preserved
                    category_dfs = [
                        get_category_leaderboard_df(
                            category, version=version_selector.value
                        )
                        for category in CATEGORIES
                    ]
                    # Log columns for debugging
                    logger.info(f"Main dataframe columns: {list(main_df.columns)}")
                    logger.info(
                        f"Selected columns (internal): {internal_selected_columns}"
                    )
                    # IMPORTANT: Make sure model_name is always included
                    if (
                        "model_name" in main_df.columns
                        and "model_name" not in internal_selected_columns
                    ):
                        internal_selected_columns = [
                            "model_name"
                        ] + internal_selected_columns
                    # Initialize the main leaderboard with the selected columns
                    # We're passing the internal_selected_columns directly to preserve the selection
                    main_leaderboard = init_leaderboard(
                        main_df, internal_selected_columns
                    )
                    # Initialize category dataframes with the same selected columns
                    # This ensures consistency across all tabs
                    category_leaderboards = []
                    for df in category_dfs:
                        # Use the same selected columns for each category
                        # init_leaderboard will automatically handle filtering to columns that exist
                        category_leaderboards.append(
                            init_leaderboard(df, internal_selected_columns)
                        )
                    return main_leaderboard, *category_leaderboards
                except Exception as e:
                    logger.error(f"Error updating columns: {e}")
                    import traceback
                    logger.error(traceback.format_exc())
                    # On failure, hand back the components that already exist.
                    return leaderboard, *[
                        tab.children[0]
                        for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
                    ]
            # Connect column selector to update function
            column_selector.change(
                fn=update_columns,
                inputs=[column_selector],
                outputs=[leaderboard]
                + [
                    category_tabs.children[i].children[0]
                    for i in range(1, len(CATEGORIES) + 1)
                ],
            )
        # with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
        with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=1):
            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
            with gr.Row():
                # with gr.Column(scale=3):
                #     gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
                with gr.Column(scale=1):
                    # Add version selector specifically for the submission tab
                    submission_version_selector = gr.Dropdown(
                        choices=BENCHMARK_VERSIONS,
                        label="Benchmark Version",
                        value=CURRENT_VERSION,
                        interactive=True,
                        elem_classes="version-selector",
                        visible=False,
                    )
            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    mode_selector = gr.Dropdown(
                        choices=[m.name for m in Mode],
                        label="Mode",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                    revision_name_textbox = gr.Textbox(
                        label="Revision commit", placeholder="main"
                    )
                    # Unknown and ClosedSource are not offered as choices.
                    model_type = gr.Dropdown(
                        choices=[
                            t.to_str("-")
                            for t in ModelType
                            if t != ModelType.Unknown and t != ModelType.ClosedSource
                        ],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                    review_model_type = gr.Dropdown(
                        choices=[t.name for t in ReviewModelType],
                        label="Review model type",
                        multiselect=False,
                        value=ReviewModelType.CUSTOM.name,
                        interactive=True,
                    )
                    programming_language_selector = gr.Dropdown(
                        choices=["Python", "Java", "Scala", "Go"],
                        label="Programming Language",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )
                    comment_language_selector = gr.Dropdown(
                        choices=["en", "ru"],
                        label="Comment Language",
                        multiselect=False,
                        value="en",
                        interactive=True,
                    )
                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[
                            i.name for i in Precision if i != Precision.Unknown
                        ],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(
                        label="Base model (for delta or adapter weights)"
                    )
            with gr.Row():
                file_input = gr.File(
                    label="Upload JSONL Results File", file_types=[".jsonl"]
                )
            submit_button = gr.Button("Submit Results")
            result_output = gr.Markdown()
            # The input order must match the positional parameters of
            # submit_results.
            submit_button.click(
                fn=submit_results,
                inputs=[
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                    mode_selector,
                    file_input,
                    submission_version_selector,
                    review_model_type,
                    programming_language_selector,
                    comment_language_selector,
                ],
                outputs=result_output,
            )
    # Version selector functionality
    # Second registration on version_selector.change: update_leaderboards runs
    # first, then a follow-up step calls refresh_data_with_filters with only
    # the version, leaving its other parameters at their defaults.
    version_selector.change(
        fn=update_leaderboards,
        inputs=[version_selector],
        outputs=[leaderboard]
        + [
            category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
        ],
    ).then(
        lambda version: refresh_data_with_filters(version),
        inputs=[version_selector],
        outputs=[leaderboard]
        + [
            category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
        ],
    )
# Set up the scheduler to refresh data periodically
# The scheduler is created and started at import time, not only when the file
# is run as a script. refresh_data is called every 30 minutes with its
# default version.
# NOTE(review): refresh_data only returns the reloaded frames; nothing here
# consumes that return value — confirm the job has the intended effect.
scheduler = BackgroundScheduler()
scheduler.add_job(refresh_data, "interval", minutes=30)
scheduler.start()
# Launch the app
if __name__ == "__main__":
    demo.launch()