# CodeReviewBench / app.py
import json
from pathlib import Path
from typing import Dict, List, Optional
import gradio as gr
from pydantic import BaseModel, field_validator
# --------------- Configuration ---------------
LEADERBOARD_PATH = Path("leaderboard_data.json")
# Default entry used to seed the leaderboard file on first run
DEFAULT_DATA = [{
"model_name": "example/model",
"bleu": 0.5,
"llm_pass_1": 0.5,
"llm_pass_5": 0.5,
"llm_pass_10": 0.5,
"metrics": {
"readability": 5, "relevance": 5, "explanation_clarity": 5,
"problem_identification": 5, "actionability": 5, "completeness": 5,
"specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
}
}]
# --------------- Data models ---------------
class Metrics(BaseModel):
readability: int
relevance: int
    explanation_clarity: int
problem_identification: int
actionability: int
completeness: int
specificity: int
contextual_adequacy: int
consistency: int
brevity: int
@field_validator("readability", "relevance", "explanation_clarity", "problem_identification", "actionability", "completeness", "specificity", "contextual_adequacy", "consistency", "brevity")
def metric_range(cls, v: int):
if not 0 <= v <= 10:
raise ValueError("Multi-metrics should be between 0 and 10")
return v
class LeaderboardEntry(BaseModel):
model_name: str
bleu: float
llm_pass_1: float
llm_pass_5: float
llm_pass_10: float
metrics: Metrics
@field_validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
def score_range(cls, v: float):
if not 0.0 <= v <= 1.0:
raise ValueError("Scores should be between 0 and 1")
return v
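# Illustrative sketch (not part of the app): the models above can be exercised
# directly; the values below are made up for demonstration.
#
#   entry = LeaderboardEntry(
#       model_name="org/example-model", bleu=0.31,
#       llm_pass_1=0.42, llm_pass_5=0.55, llm_pass_10=0.60,
#       metrics={"readability": 7, "relevance": 8, "explanation_clarity": 6,
#                "problem_identification": 7, "actionability": 6, "completeness": 7,
#                "specificity": 6, "contextual_adequacy": 7, "consistency": 8,
#                "brevity": 6},
#   )  # a score outside [0, 1] or a metric outside [0, 10] raises ValidationError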
# --------------- Persistence helpers ---------------
def _load_leaderboard() -> List[Dict]:
"""Load leaderboard data with persistent storage support."""
if not LEADERBOARD_PATH.exists():
# Create default example data
_save_leaderboard(DEFAULT_DATA)
return DEFAULT_DATA
try:
with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
data = json.load(f)
return data.get("leaderboard", [])
except Exception as e:
print(f"Error loading leaderboard: {e}")
return []
def _save_leaderboard(data: List[Dict]):
"""Save leaderboard data to persistent storage."""
try:
to_store = {"leaderboard": data}
with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
json.dump(to_store, f, indent=2)
except Exception as e:
print(f"Error saving leaderboard: {e}")
# --------------- Table data functions ---------------
def _table_data(data: Optional[List[Dict]] = None) -> List[List]:
    """Get main metrics table data, sorted by Pass@1 (descending)."""
    if data is None:
        data = _load_leaderboard()
    if not data:
        return []
    # Sort a copy so the caller's list (e.g. the gr.State value) is not mutated in place
    data = sorted(data, key=lambda x: x["llm_pass_1"], reverse=True)
table_rows = []
for entry in data:
row = [
entry["model_name"],
entry["bleu"],
entry["llm_pass_1"],
entry["llm_pass_5"],
entry["llm_pass_10"],
]
table_rows.append(row)
return table_rows
def _multimetric_table_data(data: Optional[List[Dict]] = None) -> List[List]:
    """Get multi-metric table data, sorted by Pass@1 (descending)."""
    if data is None:
        data = _load_leaderboard()
    if not data:
        return []
    # Sort a copy so the caller's list is not mutated in place
    data = sorted(data, key=lambda x: x["llm_pass_1"], reverse=True)
table_rows = []
for entry in data:
row = [
entry["model_name"],
entry["metrics"]["readability"],
entry["metrics"]["relevance"],
entry["metrics"]["explanation_clarity"],
entry["metrics"]["problem_identification"],
entry["metrics"]["actionability"],
entry["metrics"]["completeness"],
entry["metrics"]["specificity"],
entry["metrics"]["contextual_adequacy"],
entry["metrics"]["consistency"],
entry["metrics"]["brevity"],
]
table_rows.append(row)
return table_rows
# --------------- Gradio callbacks ---------------
def submit_model(
current_data: List[Dict],
model_name: str,
bleu: float,
llm_pass_1: float,
llm_pass_5: float,
llm_pass_10: float,
readability: int,
relevance: int,
explanation_clarity: int,
problem_identification: int,
actionability: int,
completeness: int,
specificity: int,
contextual_adequacy: int,
consistency: int,
brevity: int,
):
"""Validate and append a new model entry to the leaderboard."""
try:
entry = LeaderboardEntry(
model_name=model_name.strip(),
bleu=bleu,
llm_pass_1=llm_pass_1,
llm_pass_5=llm_pass_5,
llm_pass_10=llm_pass_10,
metrics={
"readability": readability,
"relevance": relevance,
"explanation_clarity": explanation_clarity,
"problem_identification": problem_identification,
"actionability": actionability,
"completeness": completeness,
"specificity": specificity,
"contextual_adequacy": contextual_adequacy,
"consistency": consistency,
"brevity": brevity,
},
)
except Exception as e:
return current_data, _table_data(current_data), _multimetric_table_data(current_data), f"❌ Submission failed: {e}"
# Use current data from state
data = current_data.copy() if current_data else []
# Replace existing model entry if any
data = [d for d in data if d["model_name"] != entry.model_name]
    data.append(entry.model_dump())
_save_leaderboard(data)
    return data, _table_data(data), _multimetric_table_data(data), "✅ Submission recorded!"
# --------------- Interface ---------------
with gr.Blocks(title="CodeReview Leaderboard") as demo:
gr.Markdown("""# πŸ† CodeReview Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)
# Initialize table data
initial_leaderboard_data = _load_leaderboard()
initial_data = _table_data(initial_leaderboard_data)
initial_multimetric_data = _multimetric_table_data(initial_leaderboard_data)
# State to store leaderboard data
leaderboard_state = gr.State(value=initial_leaderboard_data)
leaderboard_df = gr.Dataframe(
headers=["Model", "BLEU", "Pass@1", "Pass@5", "Pass@10"],
value=initial_data,
label="Main Metrics Leaderboard",
interactive=False,
)
multimetric_df = gr.Dataframe(
headers=["Model", "Readability", "Relevance", "Explanation Clarity", "Problem Identification", "Actionability", "Completeness", "Specificity", "Contextual Adequacy", "Consistency", "Brevity"],
value=initial_multimetric_data,
label="Multi-Metric Scores",
interactive=False,
)
gr.Markdown("## πŸ”„ Submit new model results")
with gr.Accordion("Submission form", open=False):
with gr.Row():
model_name_inp = gr.Text(label="Model name (org/model)", value="")
bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0)
pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0)
pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0)
pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0)
gr.Markdown("### Multi-metric subjective scores (0 – 10)")
with gr.Row():
readability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Readability")
relevance_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Relevance")
explanation_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Explanation Clarity")
problem_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Problem Identification")
actionability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Actionability")
completeness_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Completeness")
specificity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Specificity")
contextual_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Contextual Adequacy")
consistency_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Consistency")
brevity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Brevity")
submit_btn = gr.Button("Submit")
status_markdown = gr.Markdown("")
submit_btn.click(
fn=submit_model,
inputs=[
leaderboard_state,
model_name_inp,
bleu_inp,
pass1_inp,
pass5_inp,
pass10_inp,
readability_inp,
relevance_inp,
explanation_inp,
problem_inp,
actionability_inp,
completeness_inp,
specificity_inp,
contextual_inp,
consistency_inp,
brevity_inp,
],
outputs=[leaderboard_state, leaderboard_df, multimetric_df, status_markdown],
api_name="submit_model",
)
# ----------------- Launch -----------------
if __name__ == "__main__":
demo.queue().launch()
# Alias the Blocks app as `app` for runtimes (e.g. HF Spaces gradio SDK setups) that look for that name
app = demo
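# --------------- Example: programmatic submission (sketch) ---------------
# A minimal, untested sketch of calling the /submit_model endpoint from another
# process via gradio_client. The Space id below is a placeholder assumption;
# replace it with the actual deployment. gr.State inputs are managed server-side
# and are typically not part of the API signature, so they are not passed here.
#
#   from gradio_client import Client
#
#   client = Client("your-username/CodeReviewBench")  # hypothetical Space id
#   result = client.predict(
#       "org/my-model",                 # model name
#       0.42, 0.61, 0.74, 0.80,         # BLEU, Pass@1, Pass@5, Pass@10
#       7, 8, 6, 7, 8, 7, 6, 7, 8, 6,   # the ten multi-metric scores
#       api_name="/submit_model",
#   )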