import json
from pathlib import Path
from typing import Dict, List, Optional

import gradio as gr
from pydantic import BaseModel, ConfigDict, field_validator
# --------------- Configuration ---------------
LEADERBOARD_PATH = Path("leaderboard_data.json")
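# Note: a relative path lives in the Space's ephemeral container filesystem
# and is lost on restart. If the Space has persistent storage enabled, it is
# mounted at /data, so a more durable default could be (sketch, assuming
# persistent storage is configured for this Space):
#   LEADERBOARD_PATH = Path("/data/leaderboard_data.json")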
# Initialize with default data
DEFAULT_DATA = [{
    "model_name": "example/model",
    "bleu": 0.5,
    "llm_pass_1": 0.5,
    "llm_pass_5": 0.5,
    "llm_pass_10": 0.5,
    "metrics": {
        "readability": 5, "relevance": 5, "explanation_clarity": 5,
        "problem_identification": 5, "actionability": 5, "completeness": 5,
        "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
    }
}]
# --------------- Data models ---------------
class Metrics(BaseModel):
    readability: int
    relevance: int
    explanation_clarity: int
    problem_identification: int
    actionability: int
    completeness: int
    specificity: int
    contextual_adequacy: int
    consistency: int
    brevity: int

    # Every field of this model is a 0-10 integer, so validate them all.
    @field_validator("*")
    @classmethod
    def metric_range(cls, v: int) -> int:
        if not 0 <= v <= 10:
            raise ValueError("Multi-metric scores must be between 0 and 10")
        return v
class LeaderboardEntry(BaseModel):
    # "model_name" collides with pydantic v2's protected "model_" namespace;
    # silence that check so the field can keep its natural name.
    model_config = ConfigDict(protected_namespaces=())

    model_name: str
    bleu: float
    llm_pass_1: float
    llm_pass_5: float
    llm_pass_10: float
    metrics: Metrics

    @field_validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
    @classmethod
    def score_range(cls, v: float) -> float:
        if not 0.0 <= v <= 1.0:
            raise ValueError("Scores must be between 0 and 1")
        return v
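# Illustrative check (not executed at import time): constructing an entry
# with bleu=1.5 raises pydantic.ValidationError, since score_range rejects
# values outside [0, 1].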
# --------------- Persistence helpers ---------------
def _load_leaderboard() -> List[Dict]:
    """Load leaderboard data, creating the file with defaults if missing."""
    if not LEADERBOARD_PATH.exists():
        # Create default example data
        _save_leaderboard(DEFAULT_DATA)
        return DEFAULT_DATA
    try:
        with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
            data = json.load(f)
        return data.get("leaderboard", [])
    except Exception as e:
        print(f"Error loading leaderboard: {e}")
        return []
def _save_leaderboard(data: List[Dict]):
    """Save leaderboard data to LEADERBOARD_PATH as JSON."""
    try:
        to_store = {"leaderboard": data}
        with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
            json.dump(to_store, f, indent=2)
    except Exception as e:
        print(f"Error saving leaderboard: {e}")
# --------------- Table data functions ---------------
def _table_data(data: Optional[List[Dict]] = None) -> List[List]:
    """Get main metrics table data, sorted by Pass@1 (descending)."""
    if data is None:
        data = _load_leaderboard()
    if not data:
        return []
    # sorted() returns a new list, so the caller's data (e.g. the gr.State
    # value or the module-level DEFAULT_DATA) is never mutated in place.
    ordered = sorted(data, key=lambda x: x["llm_pass_1"], reverse=True)
    return [
        [
            entry["model_name"],
            entry["bleu"],
            entry["llm_pass_1"],
            entry["llm_pass_5"],
            entry["llm_pass_10"],
        ]
        for entry in ordered
    ]
def _multimetric_table_data(data: Optional[List[Dict]] = None) -> List[List]:
    """Get multi-metric table data, sorted by Pass@1 (descending)."""
    if data is None:
        data = _load_leaderboard()
    if not data:
        return []
    ordered = sorted(data, key=lambda x: x["llm_pass_1"], reverse=True)
    # Metric columns in display order; must match the multi-metric headers.
    metric_keys = (
        "readability", "relevance", "explanation_clarity",
        "problem_identification", "actionability", "completeness",
        "specificity", "contextual_adequacy", "consistency", "brevity",
    )
    return [
        [entry["model_name"]] + [entry["metrics"][k] for k in metric_keys]
        for entry in ordered
    ]
# --------------- Gradio callbacks ---------------
def submit_model(
    current_data: List[Dict],
    model_name: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: int,
    relevance: int,
    explanation_clarity: int,
    problem_identification: int,
    actionability: int,
    completeness: int,
    specificity: int,
    contextual_adequacy: int,
    consistency: int,
    brevity: int,
):
    """Validate and append a new model entry to the leaderboard."""
    try:
        entry = LeaderboardEntry(
            model_name=model_name.strip(),
            bleu=bleu,
            llm_pass_1=llm_pass_1,
            llm_pass_5=llm_pass_5,
            llm_pass_10=llm_pass_10,
            metrics={
                "readability": readability,
                "relevance": relevance,
                "explanation_clarity": explanation_clarity,
                "problem_identification": problem_identification,
                "actionability": actionability,
                "completeness": completeness,
                "specificity": specificity,
                "contextual_adequacy": contextual_adequacy,
                "consistency": consistency,
                "brevity": brevity,
            },
        )
    except Exception as e:
        return current_data, _table_data(current_data), _multimetric_table_data(current_data), f"❌ Submission failed: {e}"
    # Use current data from state
    data = current_data.copy() if current_data else []
    # Replace any existing entry for the same model, then append the new one
    data = [d for d in data if d["model_name"] != entry.model_name]
    # model_dump() is the pydantic v2 replacement for the deprecated .dict()
    data.append(entry.model_dump())
    _save_leaderboard(data)
    return data, _table_data(data), _multimetric_table_data(data), "✅ Submission recorded!"
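# Concurrency note: submit_model performs a read-modify-write on shared state
# with no locking, so two simultaneous submissions can race on the JSON file.
# A minimal mitigation (sketch) would be a module-level threading.Lock held
# around the copy/append/save sequence, e.g.:
#   _LOCK = threading.Lock()   # module level, requires `import threading`
#   with _LOCK:
#       ... mutate data and call _save_leaderboard(data) ...
# A real deployment would more likely move to a proper datastore.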
# --------------- Interface ---------------
with gr.Blocks(title="CodeReview Leaderboard") as demo:
    gr.Markdown("""# CodeReview Leaderboard
Submit your model results below. The leaderboard is sorted by **Pass@1**.""")
    # Initialize table data
    initial_leaderboard_data = _load_leaderboard()
    initial_data = _table_data(initial_leaderboard_data)
    initial_multimetric_data = _multimetric_table_data(initial_leaderboard_data)
    # State to store leaderboard data
    leaderboard_state = gr.State(value=initial_leaderboard_data)
    leaderboard_df = gr.Dataframe(
        headers=["Model", "BLEU", "Pass@1", "Pass@5", "Pass@10"],
        value=initial_data,
        label="Main Metrics Leaderboard",
        interactive=False,
    )
    multimetric_df = gr.Dataframe(
        headers=["Model", "Readability", "Relevance", "Explanation Clarity", "Problem Identification", "Actionability", "Completeness", "Specificity", "Contextual Adequacy", "Consistency", "Brevity"],
        value=initial_multimetric_data,
        label="Multi-Metric Scores",
        interactive=False,
    )
gr.Markdown("## π Submit new model results") | |
with gr.Accordion("Submission form", open=False): | |
with gr.Row(): | |
model_name_inp = gr.Text(label="Model name (org/model)", value="") | |
bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0) | |
pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0) | |
pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0) | |
pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0) | |
gr.Markdown("### Multi-metric subjective scores (0 β 10)") | |
with gr.Row(): | |
readability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Readability") | |
relevance_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Relevance") | |
explanation_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Explanation Clarity") | |
problem_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Problem Identification") | |
actionability_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Actionability") | |
completeness_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Completeness") | |
specificity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Specificity") | |
contextual_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Contextual Adequacy") | |
consistency_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Consistency") | |
brevity_inp = gr.Slider(minimum=0, maximum=10, value=5, step=1, label="Brevity") | |
submit_btn = gr.Button("Submit") | |
status_markdown = gr.Markdown("") | |
    submit_btn.click(
        fn=submit_model,
        inputs=[
            leaderboard_state,
            model_name_inp,
            bleu_inp,
            pass1_inp,
            pass5_inp,
            pass10_inp,
            readability_inp,
            relevance_inp,
            explanation_inp,
            problem_inp,
            actionability_inp,
            completeness_inp,
            specificity_inp,
            contextual_inp,
            consistency_inp,
            brevity_inp,
        ],
        outputs=[leaderboard_state, leaderboard_df, multimetric_df, status_markdown],
        api_name="submit_model",
    )
# ----------------- Launch -----------------
if __name__ == "__main__":
    demo.queue().launch()

# For HF Spaces runtime (gradio SDK), expose `demo` at module level
app = demo
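# Example (untested sketch) of driving the "submit_model" endpoint with the
# official gradio_client package; the Space id below is a placeholder, not a
# real deployment. The gr.State input is managed per-session by Gradio and is
# not passed by the client.
#   from gradio_client import Client
#   client = Client("<user>/<space-name>")
#   client.predict("org/model", 0.25, 0.40, 0.55, 0.60,
#                  5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
#                  api_name="/submit_model")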