Spaces:
Running
Running
"""
Utility classes and functions for the CodeReview Bench Leaderboard display.
"""
from dataclasses import dataclass, field, fields | |
from enum import Enum, auto | |
from typing import List, Optional | |
class Mode(Enum):
    """Inference mode used by the review model."""

    CoT = auto()  # Chain of Thought
    Strict = auto()

    def __str__(self):
        """Return the bare member name (e.g. ``"CoT"``) as the string form."""
        return self.name
class ModelType(Enum):
    """Access categories a leaderboard model can belong to."""

    Unknown = auto()
    OpenSource = auto()
    ClosedSource = auto()
    API = auto()

    def to_str(self, separator: str = "-") -> str:
        """Render the member as a label, joining two-word names with *separator*.

        ``Unknown`` and ``API`` are single words and ignore the separator.
        """
        labels = {
            ModelType.Unknown: "Unknown",
            ModelType.OpenSource: f"Open{separator}Source",
            ModelType.ClosedSource: f"Closed{separator}Source",
            ModelType.API: "API",
        }
        # Anything not in the table falls back to the same label as Unknown.
        return labels.get(self, "Unknown")
class ReviewModelType(str, Enum):
    """Families of review models known to the leaderboard.

    Members are also ``str`` instances, so they compare equal to their
    raw string value.
    """

    GPT_4 = "gpt-4"
    GPT_3_5 = "gpt-3.5-turbo"
    CLAUDE = "claude"
    LLAMA = "llama"
    GEMINI = "gemini"
    CUSTOM = "custom"

    def __str__(self):
        """Return the raw string value (e.g. ``"gpt-4"``), not the member name."""
        return self.value
class Precision(Enum):
    """Numeric precision a model can be reported with."""

    Unknown = auto()
    float16 = auto()
    bfloat16 = auto()
    float32 = auto()
    int8 = auto()
    int4 = auto()
    NA = auto()

    def __str__(self):
        """Return the bare member name (e.g. ``"bfloat16"``) as the string form."""
        return self.name
class WeightType(Enum):
    """Kinds of model weights a submission can provide."""

    Original = auto()
    Delta = auto()
    Adapter = auto()

    def __str__(self):
        """Return the bare member name (e.g. ``"Adapter"``) as the string form."""
        return self.name
@dataclass
class ColumnInfo:
    """Information about a column in the leaderboard.

    Must be a dataclass: the column definitions in this module construct
    instances with keyword arguments (``ColumnInfo(name=..., display_name=...)``),
    which relies on the generated ``__init__``.
    """

    # Internal column identifier.
    name: str
    # Label associated with the column for display.
    display_name: str
    # Column value kind; this module uses "text" (default) and "number".
    type: str = "text"
    # Columns with this flag set are collected into HIDDEN_COLS.
    hidden: bool = False
    # Columns with this flag set are collected into NEVER_HIDDEN_COLS.
    never_hidden: bool = False
    # Columns with this flag set are part of the default visible set.
    displayed_by_default: bool = True
@dataclass
class CodeReviewBenchColumn:
    """Columns for the CodeReview Bench leaderboard.

    Each attribute is a dataclass field whose default is a freshly built
    ``ColumnInfo``. The class must be a dataclass: without the decorator the
    ``field(default_factory=...)`` defaults are never resolved and
    ``dataclasses.fields()`` cannot enumerate the columns.
    """

    # Core metadata
    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="model_name",
        display_name="Model",
        never_hidden=True,
        displayed_by_default=True
    ))
    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="mode",
        display_name="Mode",
        displayed_by_default=True
    ))
    model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="model_type",
        display_name="Access_Type",
        displayed_by_default=True
    ))
    submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="submission_date",
        display_name="Submission_Date",
        displayed_by_default=False
    ))
    version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="version",
        display_name="Version",
        displayed_by_default=False
    ))
    review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="review_model_type",
        display_name="Type",
        displayed_by_default=False
    ))
    base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="base_model",
        display_name="Base Model",
        displayed_by_default=False
    ))
    revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="revision",
        display_name="Revision",
        displayed_by_default=False
    ))
    precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="precision",
        display_name="Precision",
        displayed_by_default=False
    ))
    weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="weight_type",
        display_name="Weight Type",
        displayed_by_default=False
    ))
    topic: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="topic",
        display_name="Topic",
        displayed_by_default=True
    ))

    # LLM-based multimetric scores
    readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="readability",
        display_name="Readability",
        type="number",
        displayed_by_default=True
    ))
    relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="relevance",
        display_name="Relevance",
        type="number",
        displayed_by_default=True
    ))
    explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="explanation_clarity",
        display_name="Explanation_Clarity",
        type="number",
        displayed_by_default=True
    ))
    problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="problem_identification",
        display_name="Problem_Identification",
        type="number",
        displayed_by_default=True
    ))
    actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="actionability",
        display_name="Actionability",
        type="number",
        displayed_by_default=True
    ))
    completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="completeness",
        display_name="Completeness",
        type="number",
        displayed_by_default=True
    ))
    specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="specificity",
        display_name="Specificity",
        type="number",
        displayed_by_default=True
    ))
    contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="contextual_adequacy",
        display_name="Contextual_Adequacy",
        type="number",
        displayed_by_default=True
    ))
    consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="consistency",
        display_name="Consistency",
        type="number",
        displayed_by_default=True
    ))
    brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="brevity",
        display_name="Brevity",
        type="number",
        displayed_by_default=True
    ))

    # LLM-based-exact-match metrics
    pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="pass_at_1",
        display_name="Pass@1",
        type="number",
        displayed_by_default=True
    ))
    pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="pass_at_5",
        display_name="Pass@5",
        type="number",
        displayed_by_default=True
    ))
    pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="pass_at_10",
        display_name="Pass@10",
        type="number",
        displayed_by_default=True
    ))
    bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="bleu_at_10",
        display_name="BLEU@10",
        type="number",
        displayed_by_default=True
    ))

    # Overall aggregated metrics
    overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="overall_score",
        display_name="Overall_Score",
        type="number",
        displayed_by_default=True
    ))
    multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="multimetric_average",
        display_name="Multimetric_Average",
        type="number",
        displayed_by_default=True
    ))
    exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="exact_match_average",
        display_name="Exact_Match_Average",
        type="number",
        displayed_by_default=True
    ))
    total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="total_evaluations",
        display_name="Total_Evaluations",
        type="number",
        displayed_by_default=True
    ))

    # Language-specific metrics (Russian)
    ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="ru_readability",
        display_name="RU_Readability",
        type="number",
        displayed_by_default=False
    ))
    ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="ru_relevance",
        display_name="RU_Relevance",
        type="number",
        displayed_by_default=False
    ))
    ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="ru_overall_score",
        display_name="RU_Overall_Score",
        type="number",
        displayed_by_default=False
    ))

    # Language-specific metrics (English)
    en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="en_readability",
        display_name="EN_Readability",
        type="number",
        displayed_by_default=False
    ))
    en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="en_relevance",
        display_name="EN_Relevance",
        type="number",
        displayed_by_default=False
    ))
    en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
        name="en_overall_score",
        display_name="EN_Overall_Score",
        type="number",
        displayed_by_default=False
    ))
# Shared instance through which the column definitions are accessed.
CODEREVIEW_COLUMN = CodeReviewBenchColumn()

# Column name lists for the different views.
# COLS holds the attribute names of every column definition.
COLS = [column_field.name for column_field in fields(CODEREVIEW_COLUMN)]

# DISPLAY_COLS holds the internal names of the columns flagged as displayed by default.
DISPLAY_COLS = [
    getattr(CODEREVIEW_COLUMN, column_field.name).name
    for column_field in fields(CODEREVIEW_COLUMN)
    if getattr(CODEREVIEW_COLUMN, column_field.name).displayed_by_default
]
# Manually reorder the display columns to put 'mode' right after 'model_name'.
def reorder_display_cols(cols: Optional[List[str]] = None) -> List[str]:
    """Return the column names with 'mode' placed directly after 'model_name'.

    Args:
        cols: Column names to reorder. When omitted, the module-level
            ``DISPLAY_COLS`` list is used.

    Returns:
        A new list in the same order as the input, except that 'mode' is moved
        to the position right after 'model_name'. If either name is absent the
        order is unchanged.
    """
    # Work on a copy so neither the caller's list nor the module-level
    # DISPLAY_COLS is mutated as a side effect.
    ordered = list(DISPLAY_COLS if cols is None else cols)
    if 'model_name' in ordered and 'mode' in ordered:
        ordered.remove('mode')
        # Look up the index only after the removal, since removing 'mode'
        # may have shifted 'model_name' one position to the left.
        ordered.insert(ordered.index('model_name') + 1, 'mode')
    return ordered
# Apply the 'mode'-after-'model_name' ordering to the default display columns.
DISPLAY_COLS = reorder_display_cols()

# Internal names of the columns whose type is "number".
METRIC_COLS = [
    getattr(CODEREVIEW_COLUMN, column_field.name).name
    for column_field in fields(CODEREVIEW_COLUMN)
    if getattr(CODEREVIEW_COLUMN, column_field.name).type == "number"
]

# Internal names of the columns flagged as hidden.
HIDDEN_COLS = [
    getattr(CODEREVIEW_COLUMN, column_field.name).name
    for column_field in fields(CODEREVIEW_COLUMN)
    if getattr(CODEREVIEW_COLUMN, column_field.name).hidden
]

# Internal names of the columns flagged as never hidden.
NEVER_HIDDEN_COLS = [
    getattr(CODEREVIEW_COLUMN, column_field.name).name
    for column_field in fields(CODEREVIEW_COLUMN)
    if getattr(CODEREVIEW_COLUMN, column_field.name).never_hidden
]
# Categories for CodeReview Bench (programming languages)
CATEGORIES = ["Python", "Java", "Scala", "Go"]

# Languages the review comments are written in
COMMENT_LANGUAGES = [
    "ru",  # Russian
    "en",  # English
]

# Topics for CodeReview Bench
TOPICS = [
    "Code Reliability",
    "Coding Standards",
    "Code Organization",
    "Performance Issues",
    "Validation",
    "Variables",
]

# Example categories
EXAMPLE_CATEGORIES = [
    "Bug_Fix",
    "Code_Style",
    "Performance",
    "Security",
    "Refactoring",
    "Documentation",
    "Testing",
    "Architecture",
    "Other",
]
# Metric names for CodeReview Bench.
# Names of the multimetric scores.
MULTIMETRIC_METRICS = [
    "readability",
    "relevance",
    "explanation_clarity",
    "problem_identification",
    "actionability",
    "completeness",
    "specificity",
    "contextual_adequacy",
    "consistency",
    "brevity",
]

# Names of the exact-match metrics.
EXACT_MATCH_METRICS = [
    "pass_at_1",
    "pass_at_5",
    "pass_at_10",
    "bleu_at_10",
]
def get_all_column_choices():
    """
    Build the column choices for the multiselect dropdown.

    Columns that are already visible by default are left out, so the result
    only contains the remaining columns.

    Returns:
        List of (column_name, display_name) tuples, one per column that is
        not displayed by default, in field-definition order.
    """
    shown_by_default = get_default_visible_columns()
    column_infos = (
        getattr(CODEREVIEW_COLUMN, column_field.name)
        for column_field in fields(CODEREVIEW_COLUMN)
    )
    # Pair the internal name with the display name for each remaining column.
    return [
        (info.name, info.display_name)
        for info in column_infos
        if info.name not in shown_by_default
    ]
def get_default_visible_columns():
    """
    Collect the names of the columns that are shown by default.

    Returns:
        List of internal column names whose ``displayed_by_default`` flag is
        set, in field-definition order.
    """
    visible = []
    for column_field in fields(CODEREVIEW_COLUMN):
        info = getattr(CODEREVIEW_COLUMN, column_field.name)
        if info.displayed_by_default:
            visible.append(info.name)
    return visible