import json
from pathlib import Path
from typing import List, Dict

import gradio as gr
import pandas as pd
from pydantic import BaseModel, validator

# --------------- Configuration ---------------
LEADERBOARD_PATH = Path("leaderboard_data.json")
DEFAULT_MODEL_NAME = "example/model"


# --------------- Data models ---------------
class Metrics(BaseModel):
    readability: float
    relevance: float
    explanation_clarity: float
    problem_identification: float
    actionability: float
    completeness: float
    specificity: float
    contextual_adequacy: float
    consistency: float
    brevity: float


class LeaderboardEntry(BaseModel):
    model_name: str
    bleu: float
    llm_pass_1: float
    llm_pass_5: float
    llm_pass_10: float
    metrics: Metrics

    # Field-level validator: every top-level score must fall in [0, 1].
    # (These are plain floats, so no `each_item=True`; that option only
    # applies to container fields such as lists.)
    @validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
    def score_range(cls, v: float):
        if not 0.0 <= v <= 1.0:
            raise ValueError("Scores should be between 0 and 1")
        return v


# --------------- Persistence helpers ---------------
def _load_leaderboard() -> List[Dict]:
    if not LEADERBOARD_PATH.exists():
        return []
    with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
        data = json.load(f)
    return data.get("leaderboard", [])


def _save_leaderboard(data: List[Dict]):
    to_store = {"leaderboard": data}
    with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
        json.dump(to_store, f, indent=2)


# --------------- Utility ---------------
def _flatten_entry(entry: Dict) -> Dict:
    """Flatten nested metrics so that every metric is a column."""
    flat = {
        "Model": entry["model_name"],
        "BLEU": entry["bleu"],
        "Pass@1": entry["llm_pass_1"],
        "Pass@5": entry["llm_pass_5"],
        "Pass@10": entry["llm_pass_10"],
    }
    for metric_name, score in entry["metrics"].items():
        flat[metric_name.replace("_", " ").title()] = score
    return flat


def _table_data() -> List[Dict]:
    data = _load_leaderboard()
    # Sort descending by Pass@1 (the leaderboard's ranking metric).
    data.sort(key=lambda x: x["llm_pass_1"], reverse=True)
    return [_flatten_entry(e) for e in data]


def _table_df() -> pd.DataFrame:
    """Return the leaderboard as a DataFrame; gr.Dataframe does not accept
    a list of dicts, so rows are wrapped in a pandas.DataFrame."""
    return pd.DataFrame(_table_data())


# --------------- Gradio callbacks ---------------
def submit_model(
    model_name: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: float,
    relevance: float,
    explanation_clarity: float,
    problem_identification: float,
    actionability: float,
    completeness: float,
    specificity: float,
    contextual_adequacy: float,
    consistency: float,
    brevity: float,
):
    """Validate a submission and append it to the leaderboard."""
    try:
        entry = LeaderboardEntry(
            model_name=model_name.strip(),
            bleu=bleu,
            llm_pass_1=llm_pass_1,
            llm_pass_5=llm_pass_5,
            llm_pass_10=llm_pass_10,
            metrics={
                "readability": readability,
                "relevance": relevance,
                "explanation_clarity": explanation_clarity,
                "problem_identification": problem_identification,
                "actionability": actionability,
                "completeness": completeness,
                "specificity": specificity,
                "contextual_adequacy": contextual_adequacy,
                "consistency": consistency,
                "brevity": brevity,
            },
        )
    except Exception as e:
        return gr.update(value=_table_df()), gr.update(value=f"❌ Submission failed: {e}")

    data = _load_leaderboard()
    # Replace any existing entry for the same model name.
    data = [d for d in data if d["model_name"] != entry.model_name]
    data.append(entry.dict())
    _save_leaderboard(data)
    return gr.update(value=_table_df()), gr.update(value="✅ Submission recorded!")


# --------------- Interface ---------------
with gr.Blocks(title="Custom LLM Leaderboard") as demo:
    gr.Markdown("""# 🏆 LLM Leaderboard
Submit your model results below. The leaderboard is sorted by **Pass@1**.
""") leaderboard_df = gr.Dataframe( headers=list(_table_data()[0].keys()) if _table_data() else [], value=_table_data(), label="Current Leaderboard", interactive=False, ) gr.Markdown("## 🔄 Submit new model results") with gr.Accordion("Submission form", open=False): with gr.Row(): model_name_inp = gr.Text(label="Model name (org/model)", value="") bleu_inp = gr.Number(label="BLEU", value=0.0, minimum=0.0, maximum=1.0) pass1_inp = gr.Number(label="Pass@1", value=0.0, minimum=0.0, maximum=1.0) pass5_inp = gr.Number(label="Pass@5", value=0.0, minimum=0.0, maximum=1.0) pass10_inp = gr.Number(label="Pass@10", value=0.0, minimum=0.0, maximum=1.0) gr.Markdown("### Multi-metric subjective scores (0.0 – 1.0)") with gr.Row(): readability_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Readability") relevance_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Relevance") explanation_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Explanation Clarity") problem_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Problem Identification") actionability_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Actionability") completeness_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Completeness") specificity_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Specificity") contextual_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Contextual Adequacy") consistency_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Consistency") brevity_inp = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.05, label="Brevity") submit_btn = gr.Button("Submit") status_markdown = gr.Markdown("") submit_btn.click( fn=submit_model, inputs=[ model_name_inp, bleu_inp, pass1_inp, pass5_inp, pass10_inp, readability_inp, relevance_inp, explanation_inp, problem_inp, actionability_inp, completeness_inp, specificity_inp, contextual_inp, consistency_inp, brevity_inp, ], outputs=[leaderboard_df, status_markdown], ) # Expose app variable for Spaces app = demo