Alex committed on
Commit 527d3c4 · 1 Parent(s): 9d40219
Files changed (2)
  1. README.md +27 -31
  2. app.py +34 -59
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: CodeReviewBench
+title: CodeReview Leaderboard
 emoji: 🥇
 colorFrom: green
 colorTo: indigo
@@ -7,40 +7,36 @@ sdk: gradio
 app_file: app.py
 pinned: true
 license: mit
-short_description: Result of benchmark presented in paper CodeReviewBench
+short_description: CodeReview Leaderboard for evaluating code review models
 sdk_version: 5.19.0
+storage: persistent
 ---
 
-# Start the configuration
-
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
-
-# Code logic for more complex edits
-
-You'll find
-- the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+# CodeReview Leaderboard
+
+A leaderboard for evaluating code review models with BLEU, Pass@K metrics, and multi-dimensional subjective scores.
+
+## Metrics
+
+### Main Metrics (0-1 scale)
+
+- **BLEU**: Text similarity score
+- **Pass@1, Pass@5, Pass@10**: LLM-based exact match at different attempts
+
+### Multi-Metrics (0-10 scale)
+
+- Readability, Relevance, Explanation Clarity
+- Problem Identification, Actionability, Completeness
+- Specificity, Contextual Adequacy, Consistency, Brevity
+
+## Submission
+
+Submit your model results through the web interface or via API:
+
+```bash
+curl -X POST https://kenkaneki--codereviewbench.hf.space/api/submit_model \
+  -H "Content-Type: application/json" \
+  -d '{"data": ["org/model", 0.68, 0.73, 0.82, 0.87, 8, 7, 8, 7, 6, 7, 6, 7, 6, 5]}'
+```
+
+Results are sorted by **Pass@1** in descending order.
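The new README's submission endpoint can also be called from Python. A minimal sketch mirroring the curl example above, not an official client: the URL and the positional field order (model id, BLEU, Pass@1/5/10, then the ten 0-10 scores) are assumed from that example payload.

```python
import requests

# Sketch of an API submission; mirrors the curl example in the README.
# Field order is an assumption taken from that example payload.
URL = "https://kenkaneki--codereviewbench.hf.space/api/submit_model"

payload = {
    "data": [
        "org/model",       # model id on the Hub
        0.68,              # BLEU (0-1)
        0.73, 0.82, 0.87,  # Pass@1, Pass@5, Pass@10 (0-1)
        8, 7, 8, 7, 6,     # readability, relevance, explanation clarity,
                           # problem identification, actionability (0-10)
        7, 6, 7, 6, 5,     # completeness, specificity, contextual adequacy,
                           # consistency, brevity (0-10)
    ]
}

resp = requests.post(URL, json=payload, timeout=30)
resp.raise_for_status()
print(resp.json())
```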
app.py CHANGED
@@ -7,7 +7,6 @@ from pydantic import BaseModel, Field, field_validator
 
 # --------------- Configuration ---------------
 LEADERBOARD_PATH = Path("leaderboard_data.json")
-DEFAULT_MODEL_NAME = "example/model"
 
 # --------------- Data models ---------------
 class Metrics(BaseModel):
@@ -47,44 +46,47 @@ class LeaderboardEntry(BaseModel):
 # --------------- Persistence helpers ---------------
 
 def _load_leaderboard() -> List[Dict]:
+    """Load leaderboard data with persistent storage support."""
     if not LEADERBOARD_PATH.exists():
+        # Create default example data
+        default_data = [{
+            "model_name": "example/model",
+            "bleu": 0.5,
+            "llm_pass_1": 0.5,
+            "llm_pass_5": 0.5,
+            "llm_pass_10": 0.5,
+            "metrics": {
+                "readability": 5, "relevance": 5, "explanation_clarity": 5,
+                "problem_identification": 5, "actionability": 5, "completeness": 5,
+                "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
+            }
+        }]
+        _save_leaderboard(default_data)
+        return default_data
+
+    try:
+        with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
+            data = json.load(f)
+        return data.get("leaderboard", [])
+    except Exception as e:
+        print(f"Error loading leaderboard: {e}")
         return []
-    with LEADERBOARD_PATH.open("r", encoding="utf-8") as f:
-        data = json.load(f)
-    return data.get("leaderboard", [])
 
 
 def _save_leaderboard(data: List[Dict]):
-    to_store = {"leaderboard": data}
-    with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
-        json.dump(to_store, f, indent=2)
-
-
-# --------------- Utility ---------------
-
-def _flatten_entry(entry: Dict) -> Dict:
-    """Flatten nested metrics so that every metric is a column."""
-    flat = {
-        "Model": entry["model_name"],
-        "BLEU": entry["bleu"],
-        "Pass@1": entry["llm_pass_1"],
-        "Pass@5": entry["llm_pass_5"],
-        "Pass@10": entry["llm_pass_10"],
-        "Readability": entry["metrics"]["readability"],
-        "Relevance": entry["metrics"]["relevance"],
-        "Explanation Clarity": entry["metrics"]["explanation_clarity"],
-        "Problem Identification": entry["metrics"]["problem_identification"],
-        "Actionability": entry["metrics"]["actionability"],
-        "Completeness": entry["metrics"]["completeness"],
-        "Specificity": entry["metrics"]["specificity"],
-        "Contextual Adequacy": entry["metrics"]["contextual_adequacy"],
-        "Consistency": entry["metrics"]["consistency"],
-        "Brevity": entry["metrics"]["brevity"],
-    }
-    return flat
+    """Save leaderboard data to persistent storage."""
+    try:
+        to_store = {"leaderboard": data}
+        with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
+            json.dump(to_store, f, indent=2)
+    except Exception as e:
+        print(f"Error saving leaderboard: {e}")
+
 
+# --------------- Table data functions ---------------
 
 def _table_data() -> List[List]:
+    """Get main metrics table data."""
     data = _load_leaderboard()
     if not data:
         return []
@@ -104,6 +106,7 @@ def _table_data() -> List[List]:
 
 
 def _multimetric_table_data() -> List[List]:
+    """Get multi-metric table data."""
     data = _load_leaderboard()
     if not data:
         return []
@@ -184,34 +187,6 @@ def submit_model(
 
 with gr.Blocks(title="CodeReview Leaderboard") as demo:
     gr.Markdown("""# 🏆 CodeReview Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)
 
-    # Create initial example data if file doesn't exist
-    if not LEADERBOARD_PATH.exists():
-        example_data = {
-            "leaderboard": [
-                {
-                    "model_name": "example/model",
-                    "bleu": 0.5,
-                    "llm_pass_1": 0.5,
-                    "llm_pass_5": 0.5,
-                    "llm_pass_10": 0.5,
-                    "metrics": {
-                        "readability": 5,
-                        "relevance": 5,
-                        "explanation_clarity": 5,
-                        "problem_identification": 5,
-                        "actionability": 5,
-                        "completeness": 5,
-                        "specificity": 5,
-                        "contextual_adequacy": 5,
-                        "consistency": 5,
-                        "brevity": 5
-                    }
-                }
-            ]
-        }
-        with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
-            json.dump(example_data, f, indent=2)
-
     # Initialize table data
     initial_data = _table_data()
     initial_multimetric_data = _multimetric_table_data()
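
On disk, `_save_leaderboard` wraps the entries in a single top-level `leaderboard` key and `_load_leaderboard` unwraps it again. A minimal sketch of that round trip, with entries trimmed to two fields for illustration and the descending Pass@1 ranking taken from the README's sorting note:

```python
import json
from pathlib import Path

LEADERBOARD_PATH = Path("leaderboard_data.json")

# Shape written by _save_leaderboard: {"leaderboard": [...entries...]}.
# Entries here carry only two fields for illustration.
entries = [
    {"model_name": "org/model-a", "llm_pass_1": 0.73},
    {"model_name": "org/model-b", "llm_pass_1": 0.81},
]
LEADERBOARD_PATH.write_text(
    json.dumps({"leaderboard": entries}, indent=2), encoding="utf-8"
)

# Read back the way _load_leaderboard does, then rank by Pass@1 descending.
data = json.loads(LEADERBOARD_PATH.read_text(encoding="utf-8")).get("leaderboard", [])
ranked = sorted(data, key=lambda e: e["llm_pass_1"], reverse=True)
print([e["model_name"] for e in ranked])  # ['org/model-b', 'org/model-a']
```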