Commit: merge_resolve
Changed files:

- .env.template                 +6    -0
- .gitignore                    +44   -5
- .gitmodules                   +3    -0
- .gradio/certificate.pem       +31   -0
- README.md                     +130  -108
- app.py                        +1086 -335
- example_submission.jsonl      +4    -0
- gradio_test.ipynb             +32   -0
- leaderboard_data.json         +28   -19
- requirements.txt              +8    -19
- src/about.py                  +44   -32
- src/display/css_html_js.py    +62   -271
- src/display/formatting.py     +56   -167
- src/display/utils.py          +412  -287
- src/envs.py                   +20   -99
- src/leaderboard/processor.py  +258  -293
- src/populate.py               +188  -0
- src/submission/submit.py      +178  -380
.env.template (ADDED)
@@ -0,0 +1,6 @@
+HF_TOKEN="your_huggingface_write_token"
+OWNER="your_huggingface_username_or_org"
+RESULTS_DATASET_ID="your_username/guardbench-results"
+SUBMITTER_TOKEN="your_secret_submission_token"
+ADMIN_USERNAME="admin"
+ADMIN_PASSWORD="password"  # Change this!
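For reference, a minimal sketch of how these variables are typically read at startup. This assumes `python-dotenv` is installed; the repository's actual loader lives in `src/envs.py` (rewritten in this same commit), so treat the names below as illustrative, not as that file's confirmed contents:

```python
# Hypothetical loader sketch; the repo's real version is src/envs.py.
import os
from dotenv import load_dotenv  # assumes python-dotenv is available

load_dotenv()  # read .env into the process environment

TOKEN = os.environ.get("HF_TOKEN")                 # write token for pushing results
OWNER = os.environ.get("OWNER")                    # user/org that owns the Space
RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID")
SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME", "admin")
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")

if TOKEN is None:
    raise RuntimeError("HF_TOKEN is not set; see .env.template")
```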
.gitignore (CHANGED)
@@ -1,13 +1,52 @@
-
-venv/
+# Python
 __pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+.venv/
+*.egg-info/
+.installed.cfg
+*.egg
+.gradio/
+
+# Environment variables
 .env
-
-
+
+# Virtual Environment
+venv/
+ENV/
+
+# IDE
+.idea/
 .vscode/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
 
+# Hugging Face cache
 eval-queue/
 eval-results/
 eval-queue-bk/
 eval-results-bk/
-
+
+# Data files
+data/
+
+# Versioned leaderboard files
+data/leaderboard_v*.json
.gitmodules (ADDED)
@@ -0,0 +1,3 @@
+[submodule "guard-bench-submodule"]
+	path = guard-bench-submodule
+	url = https://github.com/whitecircle-ai/circle-guard-bench.git
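Since the Space now pulls code from a submodule, fresh checkouts need `git clone --recurse-submodules <repo-url>` (or `git submodule update --init --recursive` in an existing clone) before the submodule contents are available.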
.gradio/certificate.pem (ADDED)
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
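The base64 body appears to decode to the ISRG Root X1 certificate (the Let's Encrypt root) that Gradio writes into `.gradio/` when share links are used — consistent with this same commit adding `.gradio/` to `.gitignore`. It can be inspected with `openssl x509 -in .gradio/certificate.pem -noout -subject -issuer`; committing it is harmless but unnecessary.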
README.md (CHANGED)
@@ -1,136 +1,158 @@
--- a/README.md (removed lines; […] marks text lost in extraction)
 ---
-title:
-emoji:
-colorFrom:
 colorTo: indigo
 sdk: gradio
 app_file: app.py
 pinned: true
-[…]
 ---
 
-# […]
 
 A comprehensive benchmark and leaderboard for code review generation models, inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench).
-[…]
-- **[…]
-- **[…]
-- **[…]
-[…]
-- **[…]
-[…]
-- **[…]
-- **[…]
-- **[…]
-- **[…]
-[…]
-- **[…]
-- **[…]
-[…]
 
 ```bash
 pip install -r requirements.txt
 ```
 
-[…]
 
 ```bash
 python app.py
 ```
 
-[…]
-### 3. Analytics & Insights
-
-- Visit the **📈 Analytics** tab to see:
-  - Recent submission history
-  - Language performance comparisons
-  - Category performance analysis
-  - Trends and patterns
-
-### 4. Data Export
-
-- Use the **ℹ️ About** tab to export data in JSON or CSV format
-- Full leaderboard data available for research and analysis
-
-## 🏗️ Architecture
-
-### Directory Structure
-
-```
-├── src/
-│   ├── about.py              # About page content
-│   ├── envs.py               # Environment configuration
-│   ├── display/              # Display utilities
-│   │   ├── css_html_js.py    # Styling and themes
-│   │   ├── formatting.py     # Data formatting
-│   │   └── utils.py          # Display utilities
-│   ├── leaderboard/          # Leaderboard processing
-│   │   └── processor.py      # Data operations
-│   └── submission/           # Submission handling
-│       └── submit.py         # Submission validation
-├── data/                     # Data storage
-│   ├── leaderboard_data.json # Main leaderboard
-│   └── submissions.json      # Submission log
-├── app.py                    # Main application
-└── requirements.txt          # Dependencies
 ```
 
-[…]
 
-- **SubmissionHandler**: Manages model submissions with IP tracking and validation
-- **Display Utils**: Provides filtering, formatting, and table generation
-- **Dark Theme**: Custom CSS for modern, accessible interface
 
-## 🎨 Features Inspired by CodeReviewBench
 
-[…]
 
 - **Multi-tab Interface**: Organized navigation with dedicated sections
 - **Advanced Filtering**: Real-time filtering by multiple criteria
 - **Dark Theme**: Modern, GitHub-inspired dark interface
+++ b/README.md (added lines; unprefixed lines are unchanged context)
 ---
+title: CircleGuardBench
+emoji: ⚪
+colorFrom: gray
 colorTo: indigo
 sdk: gradio
+sdk_version: 4.44.1
 app_file: app.py
 pinned: true
+short_description: First benchmark testing LLM guards on safety and accuracy.
+models:
+- AtlaAI/Selene-1-Mini-Llama-3.1-8B
+- google/gemma-3-12b-it
+- google/gemma-3-4b-it
+- meta-llama/Llama-3.1-8B-Instruct
+- meta-llama/Llama-3.2-3B-Instruct
+- meta-llama/Llama-4-Maverick-17B-128E-Instruct
+- meta-llama/Llama-4-Scout-17B-16E-Instruct
+- meta-llama/Llama-Guard-3-1B
+- meta-llama/Llama-Guard-3-8B
+- meta-llama/Llama-Guard-4-12B
+- mistralai/Ministral-8B-Instruct-2410
+- mistralai/Mistral-Small-3.1-24B-Instruct-2503
+- Qwen/Qwen2.5-7B-Instruct
+- Qwen/Qwen3-0.6B
+- Qwen/Qwen3-1.7B
+- Qwen/Qwen3-4B
+- Qwen/Qwen3-8B
+
 ---
 
+# CodeReview Bench Leaderboard
 
+<<<<<<< HEAD
 A comprehensive benchmark and leaderboard for code review generation models, inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench).
+=======
+A comprehensive leaderboard for evaluating automated code review systems across programming languages and review quality dimensions.
+>>>>>>> f990f507d1e99e7867021841fa223fe6ca8f653b
+
+## Features
+
+- **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
+- **Dual Language Comments**: Supports both Russian and English comment languages
+- **Comprehensive Metrics**:
+  - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
+  - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
+- **Interactive Visualization**: Compare model performance across categories with radar plots
+- **Easy Submission**: Submit your model results via web interface
+
+## Metrics
+
+### LLM-based Multimetric
+
+- **Readability**: How easy the review is to understand
+- **Relevance**: How relevant the review is to the code
+- **Explanation Clarity**: How clear the explanations are
+- **Problem Identification**: How well problems are identified
+- **Actionability**: How actionable the suggestions are
+- **Completeness**: How complete the review is
+- **Specificity**: How specific the feedback is
+- **Contextual Adequacy**: How well the review fits the context
+- **Consistency**: How consistent the review style is
+- **Brevity**: How concise the review is
+
+### Exact-Match Metrics
+
+- **Pass@1**: Percentage of correct reviews on first attempt
+- **Pass@5**: Percentage of correct reviews in top 5 attempts
+- **Pass@10**: Percentage of correct reviews in top 10 attempts
+- **BLEU@10**: BLEU score for top 10 review candidates
+
+## Programming Languages Supported
+
+- Python
+- JavaScript
+- Java
+- C++
+- C#
+- TypeScript
+- Go
+- Rust
+- Swift
+- Kotlin
+- Ruby
+- PHP
+- C
+- Scala
+- R
+- Dart
+- Other
+
+## Comment Languages
+
+- Russian (ru)
+- English (en)
+
+## Example Categories
+
+- Bug Fix
+- Code Style
+- Performance
+- Security
+- Refactoring
+- Documentation
+- Testing
+- Architecture
+- Other
+
+## Installation
 
 ```bash
 pip install -r requirements.txt
 ```
 
+## Usage
 
 ```bash
 python app.py
 ```
 
+## Submission Format
+
+Submit your results as a JSONL file where each line contains:
+
+```json
+{
+  "model_name": "your-model-name",
+  "programming_language": "python",
+  "comment_language": "en",
+  "readability": 8.5,
+  "relevance": 9.0,
+  "explanation_clarity": 7.8,
+  "problem_identification": 8.2,
+  "actionability": 8.7,
+  "completeness": 8.0,
+  "specificity": 7.5,
+  "contextual_adequacy": 8.3,
+  "consistency": 8.8,
+  "brevity": 7.2,
+  "pass_at_1": 0.75,
+  "pass_at_5": 0.88,
+  "pass_at_10": 0.92,
+  "bleu_at_10": 0.65,
+  "total_evaluations": 100
+}
 ```
 
+## Environment Variables
 
+Set the following environment variables:
 
 
+## Citation
 
+<<<<<<< HEAD
 - **Multi-tab Interface**: Organized navigation with dedicated sections
 - **Advanced Filtering**: Real-time filtering by multiple criteria
 - **Dark Theme**: Modern, GitHub-inspired dark interface
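Note that the new README still contains unresolved merge-conflict markers (`<<<<<<< HEAD` / `=======` / `>>>>>>>`), preserved above exactly as committed. The submission format it documents is easy to sanity-check before uploading; a minimal validator sketch — the field list comes straight from the README, while the script itself is hypothetical and not part of this repo:

```python
# Hypothetical pre-flight check for a submission JSONL file (not part of the repo).
import json

REQUIRED_FIELDS = {
    "model_name", "programming_language", "comment_language",
    "readability", "relevance", "explanation_clarity", "problem_identification",
    "actionability", "completeness", "specificity", "contextual_adequacy",
    "consistency", "brevity",
    "pass_at_1", "pass_at_5", "pass_at_10", "bleu_at_10",
    "total_evaluations",
}

def validate_jsonl(path: str) -> None:
    """Raise on malformed JSON or missing fields; print OK otherwise."""
    with open(path, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            record = json.loads(line)  # raises json.JSONDecodeError on bad lines
            missing = REQUIRED_FIELDS - record.keys()
            if missing:
                raise ValueError(f"line {lineno}: missing fields {sorted(missing)}")
    print("OK")

validate_jsonl("example_submission.jsonl")
```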
app.py (CHANGED)
@@ -3,363 +3,1114 @@ CodeReview Leaderboard - Inspired by CodeReviewBench
--- a/app.py (removed lines; […] marks text lost in extraction)
 A comprehensive leaderboard for code review generation models
 """
 
 import gradio as gr
-
-
 
-[…]
 )
-from src.[…]
-from src.display.css_html_js import DARK_THEME_CSS, CUSTOM_JS, HEADER_HTML, FOOTER_HTML
 from src.display.utils import (
-[…]
 )
-from src.[…]
-from src.[…]
-[…]
-    "taxonomy_category": "All"
-}
 
-[…]
-)
-[…]
 }
-[…]
-# […]
-[…]
 )
-[…]
 )
-[…]
-        stats_text = f"""
-## 📊 Current Statistics
-- **Total Models**: {stats['total_models']}
-- **Total Submissions**: {stats['total_submissions']}
-- **Average Pass@1**: {stats['avg_pass_1']:.3f}
-- **Best Model**: {stats['best_model']}
-- **Languages Covered**: {stats['languages_covered']}
-- **Categories Covered**: {stats['categories_covered']}
 """
-[…]
 
-[…]
 ):
-    """
-[…]
 )
-[…]
 else:
-[…]
-            info="Filter by programming language"
-        )
-        comment_lang_filter = gr.Dropdown(
-            choices=COMMENT_LANGUAGES,
-            value="All",
-            label="🌍 Comment Language",
-            info="Filter by comment language"
-        )
-        taxonomy_filter = gr.Dropdown(
-            choices=TAXONOMY_CATEGORIES,
-            value="All",
-            label="🏷️ Taxonomy Category",
-            info="Filter by review category"
 )
-[…]
 )
-[…]
 )
-[…]
-        form_components["taxonomy_category"],
-        form_components["bleu"],
-        form_components["pass1"],
-        form_components["pass5"],
-        form_components["pass10"],
-        form_components["readability"],
-        form_components["relevance"],
-        form_components["explanation_clarity"],
-        form_components["problem_identification"],
-        form_components["actionability"],
-        form_components["completeness"],
-        form_components["specificity"],
-        form_components["contextual_adequacy"],
-        form_components["consistency"],
-        form_components["brevity"],
-    ],
-    outputs=[
-        leaderboard_state,
-        main_leaderboard,
-        quality_metrics,
-        form_components["status_msg"],
-        stats_display
-    ]
-)
-
-# Analytics Tab
-with gr.Tab("📈 Analytics"):
-    with gr.Row():
-        analytics_prog_lang = gr.Dropdown(
-            choices=PROGRAMMING_LANGUAGES,
-            value="All",
-            label="Programming Language"
 )
-[…]
 )
-[…]
 )
-
-# […]
-[…]
-            label="[…]
 )
-[…]
 )
-    export_btn = gr.Button("📥 Export Data")
-
-    export_output = gr.Textbox(
-        label="Export Output",
-        lines=10,
-        max_lines=20,
-        show_copy_button=True
-    )
-
-    # Footer
-    gr.HTML(FOOTER_HTML)
-
-    # Initialize with data
-    initial_main, initial_quality, initial_stats = update_leaderboard_tables()
-
-    # Update tables when filters change
-    filter_inputs = [prog_lang_filter, comment_lang_filter, taxonomy_filter]
-    filter_outputs = [main_leaderboard, quality_metrics, stats_display]
-
-    for filter_input in filter_inputs:
-        filter_input.change(
-            fn=update_leaderboard_tables,
-            inputs=filter_inputs,
-            outputs=filter_outputs
-        )
-
-    # Refresh button
-    refresh_btn.click(
-        fn=refresh_data,
-        outputs=filter_outputs
-    )
-
-    # Analytics updates
-    analytics_inputs = [analytics_prog_lang, analytics_comment_lang, analytics_taxonomy]
-
-    def update_analytics(prog_lang, comment_lang, taxonomy):
-        """Update analytics tables"""
-        data = processor.load_leaderboard_data()
-
-        # Get submission history
-        history = get_submission_history_data(data, prog_lang, comment_lang, taxonomy)
-
-        # Get language performance
-        lang_perf = []
-        for lang in PROGRAMMING_LANGUAGES[1:]:
-            lang_data = [d for d in data if d.get("programming_language") == lang]
-            if lang_data:
-                avg_score = sum(d.get("llm_pass_1", 0) for d in lang_data) / len(lang_data)
-                best_model = max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
-                lang_perf.append([lang, f"{avg_score:.3f}", len(lang_data), best_model])
-
-        # Get category performance
-        cat_perf = []
-        for cat in TAXONOMY_CATEGORIES[1:]:
-            cat_data = [d for d in data if d.get("taxonomy_category") == cat]
-            if cat_data:
-                avg_score = sum(d.get("llm_pass_1", 0) for d in cat_data) / len(cat_data)
-                best_model = max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
-                cat_perf.append([cat, f"{avg_score:.3f}", len(cat_data), best_model])
-
-        return history, lang_perf, cat_perf
-
-    for analytics_input in analytics_inputs:
-        analytics_input.change(
-            fn=update_analytics,
-            inputs=analytics_inputs,
-            outputs=[submission_history, language_analysis, category_analysis]
-        )
-
-    # Export functionality
-    def export_data(format_type):
-        """Export leaderboard data"""
-        return processor.export_data(format_type.lower())
-
-    export_btn.click(
-        fn=export_data,
-        inputs=[export_format],
-        outputs=[export_output]
-    )
-
-    # Set initial values
-    demo.load(
-        fn=lambda: (initial_main, initial_quality, initial_stats),
-        outputs=[main_leaderboard, quality_metrics, stats_display]
-    )
 
-# […]
 )
 
-[…]
+++ b/app.py (added lines; unprefixed lines are unchanged context)
 A comprehensive leaderboard for code review generation models
 """
 
+import os
+import json
+import tempfile
+import logging
 import gradio as gr
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from apscheduler.schedulers.background import BackgroundScheduler
+import numpy as np
+from gradio.themes.utils import fonts, colors
+from dataclasses import fields, dataclass
 
+from src.about import (
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+    EVALUATION_QUEUE_TEXT,
+    INTRODUCTION_TEXT,
+    LLM_BENCHMARKS_TEXT,
+    TITLE,
 )
+from src.display.css_html_js import custom_css
 from src.display.utils import (
+    CODEREVIEW_COLUMN,
+    DISPLAY_COLS,
+    METRIC_COLS,
+    HIDDEN_COLS,
+    NEVER_HIDDEN_COLS,
+    CATEGORIES,
+    COMMENT_LANGUAGES,
+    EXAMPLE_CATEGORIES,
+    TOPICS,
+    ModelType,
+    Mode,
+    Precision,
+    WeightType,
+    ReviewModelType,
+    get_all_column_choices,
+    get_default_visible_columns,
 )
+from src.display.formatting import styled_message, styled_error, styled_warning
+from src.envs import (
+    ADMIN_USERNAME,
+    ADMIN_PASSWORD,
+    RESULTS_DATASET_ID,
+    SUBMITTER_TOKEN,
+    TOKEN,
+    DATA_PATH,
+)
+from src.populate import get_leaderboard_df, get_category_leaderboard_df
+from src.submission.submit import process_submission
 
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Ensure data directory exists
+os.makedirs(DATA_PATH, exist_ok=True)
+
+# Available benchmark versions
+BENCHMARK_VERSIONS = ["v0"]
+CURRENT_VERSION = "v0"
+
+# Initialize leaderboard data
+try:
+    logger.info("Initializing leaderboard data...")
+    LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
+    logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
+except Exception as e:
+    logger.error(f"Error loading leaderboard data: {e}")
+    LEADERBOARD_DF = pd.DataFrame()
+
+custom_theme = gr.themes.Default(
+    primary_hue=colors.slate,
+    secondary_hue=colors.slate,
+    neutral_hue=colors.neutral,
+    font=(fonts.GoogleFont("Inter"), "sans-serif"),
+).set(
+    # font_size="16px",
+    body_background_fill="#0f0f10",
+    body_background_fill_dark="#0f0f10",
+    body_text_color="#f4f4f5",
+    body_text_color_subdued="#a1a1aa",
+    block_background_fill="#1e1e1e",  # Cooler Grey
+    block_border_color="#333333",  # Cooler Grey
+    block_shadow="none",
+    # Swapped primary and secondary button styles
+    button_primary_background_fill="#121212",  # Changed to specific color for Refresh button
+    button_primary_text_color="#f4f4f5",
+    button_primary_border_color="#333333",  # Keep border grey or change to #121212?
+    button_secondary_background_fill="#f4f4f5",
+    button_secondary_text_color="#0f0f10",
+    button_secondary_border_color="#f4f4f5",
+    input_background_fill="#1e1e1e",  # Cooler Grey
+    input_border_color="#333333",  # Cooler Grey
+    input_placeholder_color="#71717a",
+    table_border_color="#333333",  # Cooler Grey
+    table_even_background_fill="#2d2d2d",  # Cooler Grey (Slightly lighter)
+    table_odd_background_fill="#1e1e1e",  # Cooler Grey
+    table_text_color="#f4f4f5",
+    link_text_color="#ffffff",
+    border_color_primary="#333333",  # Cooler Grey
+    background_fill_secondary="#333333",  # Cooler Grey
+    color_accent="#f4f4f5",
+    border_color_accent="#333333",  # Cooler Grey
+    button_primary_background_fill_hover="#424242",  # Cooler Grey
+    block_title_text_color="#f4f4f5",
+    accordion_text_color="#f4f4f5",
+    panel_background_fill="#1e1e1e",  # Cooler Grey
+    panel_border_color="#333333",  # Cooler Grey
+    # Explicitly setting primary/secondary/accent colors/borders
+    background_fill_primary="#0f0f10",
+    background_fill_primary_dark="#0f0f10",
+    background_fill_secondary_dark="#333333",  # Cooler Grey
+    border_color_primary_dark="#333333",  # Cooler Grey
+    border_color_accent_dark="#333333",  # Cooler Grey
+    border_color_accent_subdued="#424242",  # Cooler Grey
+    border_color_accent_subdued_dark="#424242",  # Cooler Grey
+    color_accent_soft="#a1a1aa",
+    color_accent_soft_dark="#a1a1aa",
+    # Explicitly setting input hover/focus states
+    input_background_fill_dark="#1e1e1e",  # Cooler Grey
+    input_background_fill_focus="#424242",  # Cooler Grey
+    input_background_fill_focus_dark="#424242",  # Cooler Grey
+    input_background_fill_hover="#2d2d2d",  # Cooler Grey
+    input_background_fill_hover_dark="#2d2d2d",  # Cooler Grey
+    input_border_color_dark="#333333",  # Cooler Grey
+    input_border_color_focus="#f4f4f5",
+    input_border_color_focus_dark="#f4f4f5",
+    input_border_color_hover="#424242",  # Cooler Grey
+    input_border_color_hover_dark="#424242",  # Cooler Grey
+    input_placeholder_color_dark="#71717a",
+    # Explicitly set dark variants for table backgrounds
+    table_even_background_fill_dark="#2d2d2d",  # Cooler Grey
+    table_odd_background_fill_dark="#1e1e1e",  # Cooler Grey
+    # Explicitly set dark text variants
+    body_text_color_dark="#f4f4f5",
+    body_text_color_subdued_dark="#a1a1aa",
+    block_title_text_color_dark="#f4f4f5",
+    accordion_text_color_dark="#f4f4f5",
+    table_text_color_dark="#f4f4f5",
+    # Explicitly set dark panel/block variants
+    panel_background_fill_dark="#1e1e1e",  # Cooler Grey
+    panel_border_color_dark="#333333",  # Cooler Grey
+    block_background_fill_dark="#1e1e1e",  # Cooler Grey
+    block_border_color_dark="#333333",  # Cooler Grey
+)
+
+
+@dataclass
+class ColumnInfo:
+    """Information about a column in the leaderboard."""
+
+    name: str
+    display_name: str
+    type: str = "text"
+    hidden: bool = False
+    never_hidden: bool = False
+    displayed_by_default: bool = True
+
+
+def update_column_choices(df):
+    """Update column choices based on what's actually in the dataframe"""
+    if df is None or df.empty:
+        return get_all_column_choices()
+
+    # Get columns that actually exist in the dataframe
+    existing_columns = list(df.columns)
+
+    # Get all possible columns with their display names
+    all_columns = get_all_column_choices()
+
+    # Filter to only include columns that exist in the dataframe
+    valid_columns = [
+        (col_name, display_name)
+        for col_name, display_name in all_columns
+        if col_name in existing_columns
+    ]
+
+    # Return default if there are no valid columns
+    if not valid_columns:
+        return get_all_column_choices()
+
+    return valid_columns
+
+
+# Update the column_selector initialization
+def get_initial_columns():
+    """Get initial columns to show in the dropdown"""
+    try:
+        # Get available columns in the main dataframe
+        available_cols = list(LEADERBOARD_DF.columns)
+        logger.info(f"Available columns in LEADERBOARD_DF: {available_cols}")
+
+        # If dataframe is empty, use default visible columns
+        if not available_cols:
+            return get_default_visible_columns()
+
+        # Get default visible columns that actually exist in the dataframe
+        valid_defaults = [
+            col for col in get_default_visible_columns() if col in available_cols
+        ]
+
+        # If none of the defaults exist, return all available columns
+        if not valid_defaults:
+            return available_cols
+
+        return valid_defaults
+    except Exception as e:
+        logger.error(f"Error getting initial columns: {e}")
+        return get_default_visible_columns()
+
+
+def init_leaderboard(dataframe, visible_columns=None):
+    """
+    Initialize a standard Gradio Dataframe component for the leaderboard.
+    """
+    if dataframe is None or dataframe.empty:
+        # Create an empty dataframe with the right columns
+        columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
+        dataframe = pd.DataFrame(columns=columns)
+        logger.warning("Initializing empty leaderboard")
+
+    # Lowercase model_name for display
+    if "model_name" in dataframe.columns:
+        dataframe = dataframe.copy()
+        dataframe["model_name"] = dataframe["model_name"].str.lower()
+
+    if "model_type" in dataframe.columns:
+        dataframe = dataframe.copy()
+        dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
+
+    if "review_model_type" in dataframe.columns:
+        dataframe = dataframe.copy()
+        dataframe["review_model_type"] = dataframe["review_model_type"].str.replace("custom", "custom")
+
+    # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
+
+    # Determine which columns to display
+    display_column_names = [
+        getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS
+    ]
+    hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
+
+    # Columns that should always be shown
+    always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
+
+    # Use provided visible columns if specified, otherwise use default
+    if visible_columns is None:
+        # Determine which columns to show initially
+        visible_columns = [
+            col for col in display_column_names if col not in hidden_column_names
+        ]
+
+    # Always include the never-hidden columns
+    for col in always_visible:
+        if col not in visible_columns and col in dataframe.columns:
+            visible_columns.append(col)
+
+    # Make sure we only include columns that actually exist in the dataframe
+    visible_columns = [col for col in visible_columns if col in dataframe.columns]
+
+    # Map GuardBench column types to Gradio's expected datatype strings
+    # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
+    type_mapping = {
+        "text": "str",
+        "number": "number",
+        "bool": "bool",
+        "date": "date",
+        "markdown": "markdown",
+        "html": "html",
+        "image": "image",
+    }
+
+    # Create a list of datatypes in the format Gradio expects
+    datatypes = []
+    for col in visible_columns:
+        # Find the corresponding CODEREVIEW_COLUMN entry
+        col_type = None
+        for display_col in DISPLAY_COLS:
+            if getattr(CODEREVIEW_COLUMN, display_col).name == col:
+                orig_type = getattr(CODEREVIEW_COLUMN, display_col).type
+                # Map to Gradio's expected types
+                col_type = type_mapping.get(orig_type, "str")
+                break
+
+        # Default to 'str' if type not found or not mappable
+        if col_type is None:
+            col_type = "str"
+
+        datatypes.append(col_type)
+
+    # Create a dummy column for search functionality if it doesn't exist
+    if "search_dummy" not in dataframe.columns:
+        dataframe["search_dummy"] = dataframe.apply(
+            lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
+            axis=1,
+        )
+
+    # Select only the visible columns for display
+    visible_columns.remove("model_name")
+
+    visible_columns = ["model_name"] + visible_columns
+    display_df = dataframe[visible_columns].copy()
+
+    # print(f"--- DataFrame inside init_leaderboard (before rounding) ---")
+    # print(display_df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head() if all(c in display_df.columns for c in ['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']) else "Relevant columns not present")
+    # print(f"-------------------------------------------------------------")
+
+    # Round numeric columns to 3 decimal places for display
+    numeric_cols = display_df.select_dtypes(include=np.number).columns
+    for col in numeric_cols:
+        # Avoid rounding integer columns like counts
+        if not pd.api.types.is_integer_dtype(display_df[col]):
+            # Format floats to exactly 3 decimal places, preserving trailing zeros
+            display_df[col] = display_df[col].apply(
+                lambda x: f"{x:.3f}" if pd.notna(x) else None
+            )
+
+    column_info_map = {
+        f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
+    }
+    column_mapping = {
+        col: column_info_map.get(col, ColumnInfo(col, col)).display_name
+        for col in visible_columns
 }
+
+    # Rename columns in the DataFrame
+    display_df.rename(columns=column_mapping, inplace=True)
+
+    # Apply styling - note: styling might need adjustment if it relies on column names
+    styler = display_df.style.set_properties(**{"text-align": "right"}).set_properties(
+        subset=["Model"], **{"width": "200px"}
 )
+
+    return gr.Dataframe(
+        value=styler,
+        datatype=datatypes,
+        interactive=False,
+        wrap=True,
+        height=2500,
+        elem_id="leaderboard-table",
+        row_count=len(display_df),
 )
352 |
+
|
353 |
+
|
354 |
+
def search_filter_leaderboard(
|
355 |
+
df, search_query="", comment_languages=None, version=CURRENT_VERSION
|
356 |
+
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
357 |
"""
|
358 |
+
Filter the leaderboard based on search query and comment languages.
|
359 |
+
"""
|
360 |
+
if df is None or df.empty:
|
361 |
+
return df
|
362 |
+
|
363 |
+
filtered_df = df.copy()
|
364 |
+
|
365 |
+
# Add search dummy column if it doesn't exist
|
366 |
+
if "search_dummy" not in filtered_df.columns:
|
367 |
+
filtered_df["search_dummy"] = filtered_df.apply(
|
368 |
+
lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
|
369 |
+
axis=1,
|
370 |
+
)
|
371 |
+
|
372 |
+
# Apply comment language filter (assuming there's a comment_language column in the data)
|
373 |
+
if comment_languages and len(comment_languages) > 0:
|
374 |
+
# Look for a comment language column in the dataframe
|
375 |
+
comment_lang_cols = [col for col in filtered_df.columns if 'comment_language' in col.lower()]
|
376 |
+
if comment_lang_cols:
|
377 |
+
filtered_df = filtered_df[
|
378 |
+
filtered_df[comment_lang_cols[0]].isin(comment_languages)
|
379 |
+
]
|
380 |
+
|
381 |
+
# Apply search query
|
382 |
+
if search_query:
|
383 |
+
search_terms = [
|
384 |
+
term.strip() for term in search_query.split(";") if term.strip()
|
385 |
+
]
|
386 |
+
if search_terms:
|
387 |
+
combined_mask = None
|
388 |
+
for term in search_terms:
|
389 |
+
mask = filtered_df["search_dummy"].str.contains(
|
390 |
+
term, case=False, na=False
|
391 |
+
)
|
392 |
+
if combined_mask is None:
|
393 |
+
combined_mask = mask
|
394 |
+
else:
|
395 |
+
combined_mask = combined_mask | mask
|
396 |
+
|
397 |
+
if combined_mask is not None:
|
398 |
+
filtered_df = filtered_df[combined_mask]
|
399 |
+
|
400 |
+
# Drop the search dummy column before returning
|
401 |
+
visible_columns = [col for col in filtered_df.columns if col != "search_dummy"]
|
402 |
+
return filtered_df[visible_columns]
|
403 |
|
404 |
+
|
405 |
+
def refresh_data_with_filters(
|
406 |
+
version=CURRENT_VERSION, search_query="", comment_languages=None, selected_columns=None
|
407 |
+
):
|
408 |
+
"""
|
409 |
+
Refresh the leaderboard data and update all components with filtering.
|
410 |
+
Ensures we handle cases where dataframes might have limited columns.
|
411 |
+
"""
|
412 |
+
global LEADERBOARD_DF
|
413 |
+
try:
|
414 |
+
logger.info(f"Performing refresh of leaderboard data with filters...")
|
415 |
+
# Get new data
|
416 |
+
main_df = get_leaderboard_df(version=version)
|
417 |
+
LEADERBOARD_DF = main_df
|
418 |
+
category_dfs = [
|
419 |
+
get_category_leaderboard_df(category, version=version)
|
420 |
+
for category in CATEGORIES
|
421 |
+
]
|
422 |
+
selected_columns = [
|
423 |
+
x.lower()
|
424 |
+
.replace(" ", "_")
|
425 |
+
.replace("(", "")
|
426 |
+
.replace(")", "")
|
427 |
+
.replace("_recall", "_recall_binary")
|
428 |
+
.replace("_precision", "_precision_binary")
|
429 |
+
for x in selected_columns
|
430 |
+
]
|
431 |
+
|
432 |
+
# Log the actual columns we have
|
433 |
+
logger.info(f"Main dataframe columns: {list(main_df.columns)}")
|
434 |
+
|
435 |
+
# Apply filters to each dataframe
|
436 |
+
filtered_main_df = search_filter_leaderboard(
|
437 |
+
main_df, search_query, comment_languages, version
|
438 |
+
)
|
439 |
+
filtered_category_dfs = [
|
440 |
+
search_filter_leaderboard(df, search_query, comment_languages, version)
|
441 |
+
for df in category_dfs
|
442 |
+
]
|
443 |
+
|
444 |
+
# Get available columns from the dataframe
|
445 |
+
available_columns = list(filtered_main_df.columns)
|
446 |
+
|
447 |
+
# Filter selected columns to only those available in the data
|
448 |
+
if selected_columns:
|
449 |
+
# Convert display names to internal names first
|
450 |
+
internal_selected_columns = [
|
451 |
+
x.lower()
|
452 |
+
.replace(" ", "_")
|
453 |
+
.replace("(", "")
|
454 |
+
.replace(")", "")
|
455 |
+
.replace("_recall", "_recall_binary")
|
456 |
+
.replace("_precision", "_precision_binary")
|
457 |
+
for x in selected_columns
|
458 |
+
]
|
459 |
+
valid_selected_columns = [
|
460 |
+
col for col in internal_selected_columns if col in available_columns
|
461 |
+
]
|
462 |
+
if not valid_selected_columns and "model_name" in available_columns:
|
463 |
+
# Fallback if conversion/filtering leads to empty selection
|
464 |
+
valid_selected_columns = ["model_name"] + [
|
465 |
+
col
|
466 |
+
for col in get_default_visible_columns()
|
467 |
+
if col in available_columns
|
468 |
+
]
|
469 |
+
else:
|
470 |
+
# If no columns were selected in the dropdown, use default visible columns that exist
|
471 |
+
valid_selected_columns = [
|
472 |
+
col for col in get_default_visible_columns() if col in available_columns
|
473 |
+
]
|
474 |
+
|
475 |
+
# Initialize dataframes for display with valid selected columns
|
476 |
+
main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns)
|
477 |
+
|
478 |
+
# For category dataframes, get columns that actually exist in each one
|
479 |
+
category_dataframes = []
|
480 |
+
for df in filtered_category_dfs:
|
481 |
+
df_columns = list(df.columns)
|
482 |
+
df_valid_columns = [
|
483 |
+
col for col in valid_selected_columns if col in df_columns
|
484 |
+
]
|
485 |
+
if not df_valid_columns and "model_name" in df_columns:
|
486 |
+
df_valid_columns = ["model_name"] + get_default_visible_columns()
|
487 |
+
category_dataframes.append(init_leaderboard(df, df_valid_columns))
|
488 |
+
|
489 |
+
return main_dataframe, *category_dataframes
|
490 |
+
|
491 |
+
except Exception as e:
|
492 |
+
logger.error(f"Error in refresh with filters: {e}")
|
493 |
+
# Return the current leaderboards on error
|
494 |
+
return leaderboard, *[
|
495 |
+
tab.children[0] for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
|
496 |
+
]
|
497 |
+
|
498 |
+
|
499 |
+
def submit_results(
|
500 |
+
model_name: str,
|
501 |
+
base_model: str,
|
502 |
+
revision: str,
|
503 |
+
precision: str,
|
504 |
+
weight_type: str,
|
505 |
+
model_type: str,
|
506 |
+
mode: str,
|
507 |
+
submission_file: tempfile._TemporaryFileWrapper,
|
508 |
+
version: str,
|
509 |
+
review_model_type: ReviewModelType,
|
510 |
+
programming_language: str,
|
511 |
+
comment_language: str,
|
512 |
):
|
513 |
+
"""
|
514 |
+
Handle submission of results with model metadata.
|
515 |
+
"""
|
516 |
+
if submission_file is None:
|
517 |
+
return styled_error("No submission file provided")
|
518 |
+
|
519 |
+
if not model_name:
|
520 |
+
return styled_error("Model name is required")
|
521 |
+
|
522 |
+
if not model_type:
|
523 |
+
return styled_error("Please select a model type")
|
524 |
+
|
525 |
+
if not mode:
|
526 |
+
return styled_error("Please select an inference mode")
|
527 |
+
|
528 |
+
file_path = submission_file.name
|
529 |
+
logger.info(f"Received submission for model {model_name}: {file_path}")
|
530 |
+
|
531 |
+
# Add metadata to the submission
|
532 |
+
metadata = {
|
533 |
+
"model_name": model_name,
|
534 |
+
"base_model": base_model,
|
535 |
+
"revision": revision if revision else "main",
|
536 |
+
"precision": precision,
|
537 |
+
"weight_type": weight_type,
|
538 |
+
"model_type": model_type,
|
539 |
+
"mode": mode,
|
540 |
+
"version": version,
|
541 |
+
"review_model_type": review_model_type,
|
542 |
+
"programming_language": programming_language,
|
543 |
+
"comment_language": comment_language,
|
544 |
+
}
|
545 |
+
|
546 |
+
# Process the submission
|
547 |
+
result = process_submission(file_path, metadata, version=version)
|
548 |
+
|
549 |
+
# Refresh the leaderboard data
|
550 |
+
global LEADERBOARD_DF
|
551 |
+
try:
|
552 |
+
logger.info(
|
553 |
+
f"Refreshing leaderboard data after submission for version {version}..."
|
554 |
)
|
555 |
+
LEADERBOARD_DF = get_leaderboard_df(version=version)
|
556 |
+
logger.info("Refreshed leaderboard data after submission")
|
557 |
+
except Exception as e:
|
558 |
+
logger.error(f"Error refreshing leaderboard data: {e}")
|
559 |
+
|
560 |
+
return result
|
561 |
+
|
562 |
+
|
563 |
+
def refresh_data(version=CURRENT_VERSION):
|
564 |
+
"""
|
565 |
+
Refresh the leaderboard data and update all components.
|
566 |
+
"""
|
567 |
+
try:
|
568 |
+
logger.info(f"Performing scheduled refresh of leaderboard data...")
|
569 |
+
# Get new data
|
570 |
+
main_df = get_leaderboard_df(version=version)
|
571 |
+
category_dfs = [
|
572 |
+
get_category_leaderboard_df(category, version=version)
|
573 |
+
for category in CATEGORIES
|
574 |
+
]
|
575 |
+
|
576 |
+
# For gr.Dataframe, we return the actual dataframes
|
577 |
+
return main_df, *category_dfs
|
578 |
+
|
579 |
+
except Exception as e:
|
580 |
+
logger.error(f"Error in scheduled refresh: {e}")
|
581 |
+
return None, *[None for _ in CATEGORIES]
|
582 |
+
|
583 |
+
|
584 |
+
def update_leaderboards(version):
|
585 |
+
"""
|
586 |
+
Update all leaderboard components with data for the selected version.
|
587 |
+
"""
|
588 |
+
try:
|
589 |
+
new_df = get_leaderboard_df(version=version)
|
590 |
+
category_dfs = [
|
591 |
+
get_category_leaderboard_df(category, version=version)
|
592 |
+
for category in CATEGORIES
|
593 |
+
]
|
594 |
+
return new_df, *category_dfs
|
595 |
+
except Exception as e:
|
596 |
+
logger.error(f"Error updating leaderboards for version {version}: {e}")
|
597 |
+
return None, *[None for _ in CATEGORIES]
|
598 |
+
|
599 |
+
|
600 |
+
def create_performance_plot(
|
601 |
+
selected_models, category, metric="f1_binary", version=CURRENT_VERSION
|
602 |
+
):
|
603 |
+
"""
|
604 |
+
Create a radar plot comparing model performance for selected models.
|
605 |
+
"""
|
606 |
+
if category == "All Results":
|
607 |
+
df = get_leaderboard_df(version=version)
|
608 |
else:
|
609 |
+
df = get_category_leaderboard_df(category, version=version)
|
610 |
+
|
611 |
+
if df.empty:
|
612 |
+
return go.Figure()
|
613 |
+
|
614 |
+
# Lowercase model_name in df and selected_models
|
615 |
+
df = df.copy()
|
616 |
+
df["model_name"] = df["model_name"].str.lower()
|
617 |
+
selected_models = [m.lower() for m in selected_models]
|
618 |
+
df = df[df["model_name"].isin(selected_models)]
|
619 |
+
metric_cols = [col for col in df.columns if metric in col]
|
620 |
+
fig = go.Figure()
|
621 |
+
colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
|
622 |
+
for idx, model in enumerate(selected_models):
|
623 |
+
model_data = df[df["model_name"] == model]
|
624 |
+
if not model_data.empty:
|
625 |
+
values = model_data[metric_cols].values[0].tolist()
|
626 |
+
values = values + [values[0]]
|
627 |
+
categories = [col.replace(f"_{metric}", "") for col in metric_cols]
|
628 |
+
# Replace 'jailbreaked' with 'jailbroken' in categories
|
629 |
+
categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
|
630 |
+
categories = categories + [categories[0]]
|
631 |
+
fig.add_trace(
|
632 |
+
go.Scatterpolar(
|
633 |
+
r=values,
|
634 |
+
theta=categories,
|
635 |
+
name=model,
|
636 |
+
line_color=colors[idx % len(colors)],
|
637 |
+
fill="toself",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
638 |
)
|
639 |
+
)
|
640 |
+
fig.update_layout(
|
641 |
+
paper_bgcolor="#000000",
|
642 |
+
plot_bgcolor="#000000",
|
643 |
+
font={"color": "#ffffff"},
|
644 |
+
title={
|
645 |
+
"text": f"{category} - {metric.upper()} Score Comparison",
|
646 |
+
"font": {"color": "#ffffff", "size": 24},
|
647 |
+
},
|
648 |
+
polar=dict(
|
649 |
+
bgcolor="#000000",
|
650 |
+
radialaxis=dict(
|
651 |
+
visible=True,
|
652 |
+
range=[0, 1],
|
653 |
+
gridcolor="#333333",
|
654 |
+
linecolor="#333333",
|
655 |
+
tickfont={"color": "#ffffff"},
|
656 |
+
),
|
657 |
+
angularaxis=dict(
|
658 |
+
gridcolor="#333333",
|
659 |
+
linecolor="#333333",
|
660 |
+
tickfont={"color": "#ffffff"},
|
661 |
+
),
|
662 |
+
),
|
663 |
+
height=600,
|
664 |
+
showlegend=True,
|
665 |
+
legend=dict(
|
666 |
+
yanchor="top",
|
667 |
+
y=0.99,
|
668 |
+
xanchor="right",
|
669 |
+
x=0.99,
|
670 |
+
bgcolor="rgba(0,0,0,0.5)",
|
671 |
+
font={"color": "#ffffff"},
|
672 |
+
),
|
673 |
+
)
|
674 |
+
return fig
|
675 |
+
|
676 |
+
|
677 |
+
def update_model_choices(version):
|
678 |
+
"""
|
679 |
+
Update the list of available models for the given version.
|
680 |
+
"""
|
681 |
+
df = get_leaderboard_df(version=version)
|
682 |
+
if df.empty:
|
683 |
+
return []
|
684 |
+
return sorted(df["model_name"].str.lower().unique().tolist())
|
685 |
+
|
686 |
+
|
687 |
+
def update_visualization(selected_models, selected_category, selected_metric, version):
|
688 |
+
"""
|
689 |
+
Update the visualization based on user selections.
|
690 |
+
"""
|
691 |
+
if not selected_models:
|
692 |
+
return go.Figure()
|
693 |
+
return create_performance_plot(
|
694 |
+
selected_models, selected_category, selected_metric, version
|
695 |
+
)
|
696 |
+
|
697 |
+
|
698 |
+
# Create Gradio app
|
699 |
+
demo = gr.Blocks(css=custom_css, theme=custom_theme)
|
700 |
+
|
701 |
+
CATEGORY_DISPLAY_MAP = {
|
702 |
+
"Python": "Python",
|
703 |
+
"Java": "Java",
|
704 |
+
"Scala": "Scala",
|
705 |
+
"Go": "Go"
|
706 |
+
}
|
707 |
+
# Create reverse mapping for lookups
|
708 |
+
CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
|
709 |
+
|
710 |
+
with demo:
|
711 |
+
gr.HTML(TITLE)
|
712 |
+
# gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
713 |
+
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
714 |
+
|
715 |
+
with gr.Row():
|
716 |
+
tabs = gr.Tabs(elem_classes="tab-buttons")
|
717 |
+
|
718 |
+
with tabs:
|
719 |
+
with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
|
720 |
+
with gr.Row():
|
721 |
+
version_selector = gr.Dropdown(
|
722 |
+
choices=BENCHMARK_VERSIONS,
|
723 |
+
label="Benchmark Version",
|
724 |
+
value=CURRENT_VERSION,
|
725 |
+
interactive=True,
|
726 |
+
elem_classes="version-selector",
|
727 |
+
scale=1,
|
728 |
+
visible=False,
|
729 |
+
)
|
730 |
+
|
731 |
+
with gr.Row():
|
732 |
+
search_input = gr.Textbox(
|
733 |
+
placeholder="Search by models (use ; to split)",
|
734 |
+
label="Search",
|
735 |
+
elem_id="search-bar",
|
736 |
+
scale=2,
|
737 |
+
)
|
738 |
+
comment_language_filter = gr.Dropdown(
|
739 |
+
choices=["en", "ru"],
|
740 |
+
label="Comment Language",
|
741 |
+
multiselect=True,
|
742 |
+
value=[],
|
743 |
+
interactive=True,
|
744 |
+
scale=1,
|
745 |
+
)
|
746 |
+
programming_language_filter = gr.Dropdown(
|
747 |
+
choices=["Python", "Java", "Scala", "Go"],
|
748 |
+
label="Programming Language",
|
749 |
+
multiselect=True,
|
750 |
+
value=[],
|
751 |
+
interactive=True,
|
752 |
+
scale=1,
|
753 |
+
)
|
754 |
+
with gr.Row():
|
755 |
+
topic_filter = gr.Dropdown(
|
756 |
+
choices=TOPICS,
|
757 |
+
label="Topic",
|
758 |
+
multiselect=True,
|
759 |
+
value=[],
|
760 |
+
interactive=True,
|
761 |
+
scale=2,
|
762 |
+
)
|
763 |
+
column_selector = gr.Dropdown(
|
764 |
+
choices=get_all_column_choices(),
|
765 |
+
label="Columns",
|
766 |
+
multiselect=True,
|
767 |
+
value=get_initial_columns(),
|
768 |
+
interactive=True,
|
769 |
+
visible=False,
|
770 |
+
scale=1,
|
771 |
+
)
|
772 |
+
with gr.Row():
|
773 |
+
refresh_button = gr.Button(
|
774 |
+
"Refresh", scale=0, elem_id="refresh-button"
|
775 |
+
)
|
776 |
+
|
777 |
+
# Create tabs for each category
|
778 |
+
with gr.Tabs(elem_classes="category-tabs") as category_tabs:
|
779 |
+
# First tab for average metrics across all categories
|
780 |
+
with gr.TabItem("All Results", elem_id="overall-tab"):
|
781 |
+
leaderboard = init_leaderboard(LEADERBOARD_DF)
|
782 |
+
|
783 |
+
# Create a tab for each category using display names
|
784 |
+
for category in CATEGORIES:
|
785 |
+
display_name = CATEGORY_DISPLAY_MAP.get(category, category)
|
786 |
+
elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
|
787 |
+
with gr.TabItem(display_name, elem_id=elem_id):
|
788 |
+
category_df = get_category_leaderboard_df(
|
789 |
+
category, version=CURRENT_VERSION
|
790 |
+
)
|
791 |
+
category_leaderboard = init_leaderboard(category_df)
|
792 |
+
|
793 |
+
# Connect search and filter inputs to update function
|
794 |
+
def update_with_search_filters(
|
795 |
+
version=CURRENT_VERSION,
|
796 |
+
search_query="",
|
797 |
+
comment_languages=None,
|
798 |
+
selected_columns=None,
|
799 |
+
):
|
800 |
+
"""
|
801 |
+
Update the leaderboards with search and filter settings.
|
802 |
+
"""
|
803 |
+
return refresh_data_with_filters(
|
804 |
+
version, search_query, comment_languages, selected_columns
|
805 |
+
)
|
806 |
+
|
807 |
+
# Refresh button functionality
|
808 |
+
def refresh_and_update(
|
809 |
+
version, search_query, comment_languages, selected_columns
|
810 |
+
):
|
811 |
+
"""
|
812 |
+
Refresh data, update LEADERBOARD_DF, and return updated components.
|
813 |
+
"""
|
814 |
+
global LEADERBOARD_DF
|
815 |
+
main_df = get_leaderboard_df(version=version)
|
816 |
+
LEADERBOARD_DF = main_df # Update the global DataFrame
|
817 |
+
return refresh_data_with_filters(
|
818 |
+
version, search_query, comment_languages, selected_columns
|
819 |
+
)
|
820 |
+
|
821 |
+
refresh_button.click(
|
822 |
+
fn=refresh_and_update,
|
823 |
+
inputs=[
|
824 |
+
version_selector,
|
825 |
+
search_input,
|
826 |
+
comment_language_filter,
|
827 |
+
column_selector,
|
828 |
+
],
|
829 |
+
outputs=[leaderboard]
|
830 |
+
+ [
|
831 |
+
category_tabs.children[i].children[0]
|
832 |
+
for i in range(1, len(CATEGORIES) + 1)
|
833 |
+
],
|
834 |
)
|
835 |
+
# Search input functionality
|
836 |
+
search_input.change(
|
837 |
+
fn=refresh_data_with_filters,
|
838 |
+
inputs=[
|
839 |
+
version_selector,
|
840 |
+
search_input,
|
841 |
+
comment_language_filter,
|
842 |
+
column_selector,
|
843 |
+
],
|
844 |
+
outputs=[leaderboard]
|
845 |
+
+ [
|
846 |
+
category_tabs.children[i].children[0]
|
847 |
+
for i in range(1, len(CATEGORIES) + 1)
|
848 |
+
],
|
849 |
)
|
850 |
+
|
851 |
+
# Comment language filter functionality
|
852 |
+
comment_language_filter.change(
|
853 |
+
fn=refresh_data_with_filters,
|
854 |
+
inputs=[
|
855 |
+
version_selector,
|
856 |
+
search_input,
|
857 |
+
comment_language_filter,
|
858 |
+
column_selector,
|
859 |
+
],
|
860 |
+
outputs=[leaderboard]
|
861 |
+
+ [
|
862 |
+
category_tabs.children[i].children[0]
|
863 |
+
for i in range(1, len(CATEGORIES) + 1)
|
864 |
+
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
865 |
)
|
866 |
+
|
867 |
+
# Version selector functionality
|
868 |
+
version_selector.change(
|
869 |
+
fn=refresh_data_with_filters,
|
870 |
+
inputs=[
|
871 |
+
version_selector,
|
872 |
+
search_input,
|
873 |
+
comment_language_filter,
|
874 |
+
column_selector,
|
875 |
+
],
|
876 |
+
outputs=[leaderboard]
|
877 |
+
+ [
|
878 |
+
category_tabs.children[i].children[0]
|
879 |
+
for i in range(1, len(CATEGORIES) + 1)
|
880 |
+
],
|
881 |
)
|
882 |
+
|
883 |
+
# Update the update_columns function to handle updating all tabs at once
|
884 |
+
def update_columns(selected_columns):
|
885 |
+
"""
|
886 |
+
Update all leaderboards to show the selected columns.
|
887 |
+
Ensures all selected columns are preserved in the update.
|
888 |
+
|
889 |
+
"""
|
890 |
+
|
891 |
+
try:
|
892 |
+
logger.info(f"Updating columns to show: {selected_columns}")
|
893 |
+
|
894 |
+
# If no columns are selected, use default visible columns
|
895 |
+
if not selected_columns or len(selected_columns) == 0:
|
896 |
+
selected_columns = get_default_visible_columns()
|
897 |
+
logger.info(
|
898 |
+
f"No columns selected, using defaults: {selected_columns}"
|
899 |
+
)
|
900 |
+
|
901 |
+
# Convert display names to internal names
|
902 |
+
internal_selected_columns = [
|
903 |
+
x.lower()
|
904 |
+
.replace(" ", "_")
|
905 |
+
.replace("(", "")
|
906 |
+
.replace(")", "")
|
907 |
+
.replace("_recall", "_recall_binary")
|
908 |
+
.replace("_precision", "_precision_binary")
|
909 |
+
for x in selected_columns
|
910 |
+
]
|
911 |
+
|
912 |
+
# Get the current data with ALL columns preserved
|
913 |
+
main_df = get_leaderboard_df(version=version_selector.value)
|
914 |
+
|
915 |
+
# Get category dataframes with ALL columns preserved
|
916 |
+
category_dfs = [
|
917 |
+
get_category_leaderboard_df(
|
918 |
+
category, version=version_selector.value
|
919 |
+
)
|
920 |
+
for category in CATEGORIES
|
921 |
+
]
|
922 |
+
|
923 |
+
# Log columns for debugging
|
924 |
+
logger.info(f"Main dataframe columns: {list(main_df.columns)}")
|
925 |
+
logger.info(
|
926 |
+
f"Selected columns (internal): {internal_selected_columns}"
|
927 |
+
)
|
928 |
+
|
929 |
+
# IMPORTANT: Make sure model_name is always included
|
930 |
+
if (
|
931 |
+
"model_name" in main_df.columns
|
932 |
+
and "model_name" not in internal_selected_columns
|
933 |
+
):
|
934 |
+
internal_selected_columns = [
|
935 |
+
"model_name"
|
936 |
+
] + internal_selected_columns
|
937 |
+
|
938 |
+
# Initialize the main leaderboard with the selected columns
|
939 |
+
# We're passing the internal_selected_columns directly to preserve the selection
|
940 |
+
main_leaderboard = init_leaderboard(
|
941 |
+
main_df, internal_selected_columns
|
942 |
+
)
|
943 |
+
|
944 |
+
# Initialize category dataframes with the same selected columns
|
945 |
+
# This ensures consistency across all tabs
|
946 |
+
category_leaderboards = []
|
947 |
+
for df in category_dfs:
|
948 |
+
# Use the same selected columns for each category
|
949 |
+
# init_leaderboard will automatically handle filtering to columns that exist
|
950 |
+
category_leaderboards.append(
|
951 |
+
init_leaderboard(df, internal_selected_columns)
|
952 |
+
)
|
953 |
+
|
954 |
+
return main_leaderboard, *category_leaderboards
|
955 |
+
|
956 |
+
except Exception as e:
|
957 |
+
logger.error(f"Error updating columns: {e}")
|
958 |
+
import traceback
|
959 |
+
|
960 |
+
logger.error(traceback.format_exc())
|
961 |
+
return leaderboard, *[
|
962 |
+
tab.children[0]
|
963 |
+
for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
|
964 |
+
]
|
965 |
+
|
966 |
+
# Connect column selector to update function
|
967 |
+
column_selector.change(
|
968 |
+
fn=update_columns,
|
969 |
+
inputs=[column_selector],
|
970 |
+
outputs=[leaderboard]
|
971 |
+
+ [
|
972 |
+
category_tabs.children[i].children[0]
|
973 |
+
for i in range(1, len(CATEGORIES) + 1)
|
974 |
+
],
|
975 |
)
|
976 |
+
|
977 |
+
# with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
|
978 |
+
# gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
|
979 |
+
|
980 |
+
with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=1):
|
981 |
+
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
|
982 |
+
|
983 |
+
with gr.Row():
|
984 |
+
# with gr.Column(scale=3):
|
985 |
+
# gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
|
986 |
+
with gr.Column(scale=1):
|
987 |
+
# Add version selector specifically for the submission tab
|
988 |
+
submission_version_selector = gr.Dropdown(
|
989 |
+
choices=BENCHMARK_VERSIONS,
|
990 |
+
label="Benchmark Version",
|
991 |
+
value=CURRENT_VERSION,
|
992 |
+
interactive=True,
|
993 |
+
elem_classes="version-selector",
|
994 |
+
visible=False,
|
995 |
+
)
|
996 |
+
|
997 |
+
with gr.Row():
|
998 |
+
with gr.Column():
|
999 |
+
model_name_textbox = gr.Textbox(label="Model name")
|
1000 |
+
mode_selector = gr.Dropdown(
|
1001 |
+
choices=[m.name for m in Mode],
|
1002 |
+
label="Mode",
|
1003 |
+
multiselect=False,
|
1004 |
+
value=None,
|
1005 |
+
interactive=True,
|
1006 |
+
)
|
1007 |
+
revision_name_textbox = gr.Textbox(
|
1008 |
+
label="Revision commit", placeholder="main"
|
1009 |
+
)
|
1010 |
+
model_type = gr.Dropdown(
|
1011 |
+
choices=[
|
1012 |
+
t.to_str("-")
|
1013 |
+
for t in ModelType
|
1014 |
+
if t != ModelType.Unknown and t != ModelType.ClosedSource
|
1015 |
+
],
|
1016 |
+
label="Model type",
|
1017 |
+
multiselect=False,
|
1018 |
+
value=None,
|
1019 |
+
interactive=True,
|
1020 |
+
)
|
1021 |
+
review_model_type = gr.Dropdown(
|
1022 |
+
choices=[t.name for t in ReviewModelType],
|
1023 |
+
label="Review model type",
|
1024 |
+
multiselect=False,
|
1025 |
+
value=ReviewModelType.CUSTOM.name,
|
1026 |
+
interactive=True,
|
1027 |
+
)
|
1028 |
+
programming_language_selector = gr.Dropdown(
|
1029 |
+
choices=["Python", "Java", "Scala", "Go"],
|
1030 |
+
label="Programming Language",
|
1031 |
+
multiselect=False,
|
1032 |
+
value=None,
|
1033 |
+
interactive=True,
|
1034 |
+
)
|
1035 |
+
comment_language_selector = gr.Dropdown(
|
1036 |
+
choices=["en", "ru"],
|
1037 |
+
label="Comment Language",
|
1038 |
+
multiselect=False,
|
1039 |
+
value="en",
|
1040 |
+
interactive=True,
|
1041 |
+
)
|
1042 |
+
|
1043 |
+
with gr.Column():
|
1044 |
+
precision = gr.Dropdown(
|
1045 |
+
choices=[
|
1046 |
+
i.name for i in Precision if i != Precision.Unknown
|
1047 |
+
],
|
1048 |
+
label="Precision",
|
1049 |
+
multiselect=False,
|
1050 |
+
value="float16",
|
1051 |
+
interactive=True,
|
1052 |
+
)
|
1053 |
+
weight_type = gr.Dropdown(
|
1054 |
+
choices=[i.name for i in WeightType],
|
1055 |
+
label="Weights type",
|
1056 |
+
multiselect=False,
|
1057 |
+
value="Original",
|
1058 |
+
interactive=True,
|
1059 |
+
)
|
1060 |
+
base_model_name_textbox = gr.Textbox(
|
1061 |
+
label="Base model (for delta or adapter weights)"
|
1062 |
+
)
|
1063 |
+
|
1064 |
+
with gr.Row():
|
1065 |
+
file_input = gr.File(
|
1066 |
+
label="Upload JSONL Results File", file_types=[".jsonl"]
|
1067 |
)
|
1068 |
+
|
1069 |
+
submit_button = gr.Button("Submit Results")
|
1070 |
+
result_output = gr.Markdown()
|
1071 |
+
|
1072 |
+
submit_button.click(
|
1073 |
+
fn=submit_results,
|
1074 |
+
inputs=[
|
1075 |
+
model_name_textbox,
|
1076 |
+
base_model_name_textbox,
|
1077 |
+
revision_name_textbox,
|
1078 |
+
precision,
|
1079 |
+
weight_type,
|
1080 |
+
model_type,
|
1081 |
+
mode_selector,
|
1082 |
+
file_input,
|
1083 |
+
submission_version_selector,
|
1084 |
+
review_model_type,
|
1085 |
+
programming_language_selector,
|
1086 |
+
comment_language_selector,
|
1087 |
+
],
|
1088 |
+
outputs=result_output,
|
1089 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1090 |
|
1091 |
+
# Version selector functionality
|
1092 |
+
version_selector.change(
|
1093 |
+
fn=update_leaderboards,
|
1094 |
+
inputs=[version_selector],
|
1095 |
+
outputs=[leaderboard]
|
1096 |
+
+ [
|
1097 |
+
category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
|
1098 |
+
],
|
1099 |
+
).then(
|
1100 |
+
lambda version: refresh_data_with_filters(version),
|
1101 |
+
inputs=[version_selector],
|
1102 |
+
outputs=[leaderboard]
|
1103 |
+
+ [
|
1104 |
+
category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
|
1105 |
+
],
|
1106 |
)
|
1107 |
|
1108 |
+
|
1109 |
+
# Set up the scheduler to refresh data periodically
|
1110 |
+
scheduler = BackgroundScheduler()
|
1111 |
+
scheduler.add_job(refresh_data, "interval", minutes=30)
|
1112 |
+
scheduler.start()
|
1113 |
+
|
1114 |
+
# Launch the app
|
1115 |
+
if __name__ == "__main__":
|
1116 |
+
demo.launch()
|
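
Note on the wiring above: the Space keeps its tables fresh in two ways — Gradio event handlers re-render the leaderboard on every filter change, and an APScheduler job re-pulls the data every 30 minutes. A minimal, self-contained sketch of the timer half of that pattern (the `refresh_data` body and the Blocks content here are placeholders, not the Space's actual implementation):

# Sketch of the periodic-refresh pattern used in app.py above; assumes
# `gradio` and `apscheduler` from requirements.txt are installed.
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler

def refresh_data():
    # In the real app this re-reads the results dataset from the Hub.
    print("refreshing leaderboard data...")

with gr.Blocks() as demo:
    gr.Markdown("leaderboard placeholder")

scheduler = BackgroundScheduler()
scheduler.add_job(refresh_data, "interval", minutes=30)  # same schedule as app.py
scheduler.start()

if __name__ == "__main__":
    demo.launch()

Because the scheduler runs in a background thread, the refresh happens even while `demo.launch()` blocks the main thread.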
example_submission.jsonl
ADDED
@@ -0,0 +1,4 @@
+{"model_name": "GPT-4-CodeReview", "programming_language": "Python", "comment_language": "en", "topic": "Code Reliability", "observation_id": "obs_001", "code_snippet": "def calculate_sum(a, b):\n    return a + b", "review_text": "This function is simple and correct, but consider adding type hints and docstring for better documentation.", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
+{"model_name": "GPT-4-CodeReview", "programming_language": "Java", "comment_language": "en", "topic": "Coding Standards", "observation_id": "obs_002", "code_snippet": "public class Calculator {\n    public int add(int a, int b) {\n        return a + b;\n    }\n}", "review_text": "Consider following Java naming conventions and adding JavaDoc comments. The method is functionally correct.", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
+{"model_name": "Claude-3-CodeReview", "programming_language": "Scala", "comment_language": "ru", "topic": "Performance Issues", "observation_id": "obs_003", "code_snippet": "def fibonacci(n: Int): Int = {\n    if (n <= 1) n\n    else fibonacci(n-1) + fibonacci(n-2)\n}", "review_text": "Эта реализация неэффективна из-за экспоненциальной сложности. Рекомендуется использовать мемоизацию или итеративный подход.", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 9.2, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
+{"model_name": "Llama-CodeReview", "programming_language": "Go", "comment_language": "en", "topic": "Variables", "observation_id": "obs_004", "code_snippet": "package main\n\nimport \"fmt\"\n\nfunc main() {\n    var x int = 5\n    var y int = 10\n    fmt.Println(x + y)\n}", "review_text": "Consider using short variable declarations (:=) for local variables. Also, the variable names could be more descriptive.", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
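
Each line of example_submission.jsonl is one self-contained result record: model/language/topic metadata, ten LLM-judged quality scores on a 0–10 scale, and exact-match metrics on a 0–1 scale. A minimal sanity-checker for files in this shape — the field list below is inferred from the example above; the authoritative validation lives in src/submission/submit.py:

# Minimal sanity check for a submission file in the format shown above.
import json

REQUIRED_FIELDS = {
    "model_name", "programming_language", "comment_language", "topic",
    "readability", "relevance", "explanation_clarity", "problem_identification",
    "actionability", "completeness", "specificity", "contextual_adequacy",
    "consistency", "brevity",
    "pass_at_1", "pass_at_5", "pass_at_10", "bleu_at_10",
}

def check_submission(path: str) -> None:
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            record = json.loads(line)
            missing = REQUIRED_FIELDS - record.keys()
            if missing:
                raise ValueError(f"line {i}: missing fields {sorted(missing)}")

check_submission("example_submission.jsonl")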
gradio_test.ipynb
ADDED
@@ -0,0 +1,32 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "agent_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
leaderboard_data.json
CHANGED
@@ -1,23 +1,32 @@
 {
-    "
         {
-            "model_name": "
-            "
-            "
-            "
-            "
-            "
-
-
-
-
-
-
-
-
-
-
-
         }
-    ]
 }
 {
+    "entries": [
         {
+            "model_name": "GPT-4-CodeReview",
+            "model_type": "LLM",
+            "mode": "Strict",
+            "review_model_type": "gpt-4",
+            "programming_language": "Python",
+            "comment_language": "en",
+            "topic": "Code Reliability",
+            "submission_date": "2024-10-06T12:00:00Z",
+            "version": "v0",
+            "readability": 8.5,
+            "relevance": 9.0,
+            "explanation_clarity": 7.8,
+            "problem_identification": 8.2,
+            "actionability": 8.7,
+            "completeness": 8.0,
+            "specificity": 7.5,
+            "contextual_adequacy": 8.3,
+            "consistency": 8.8,
+            "brevity": 7.2,
+            "pass_at_1": 0.75,
+            "pass_at_5": 0.88,
+            "pass_at_10": 0.92,
+            "bleu_at_10": 0.65,
+            "total_evaluations": 100
        }
+    ],
+    "last_updated": "2024-10-06T12:00:00Z",
+    "version": "v0"
 }
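
The reshaped leaderboard_data.json wraps rows in an "entries" list and adds file-level "last_updated" and "version" keys, which makes it straightforward to load into a table, e.g.:

# Loading the reshaped leaderboard_data.json into a DataFrame
# (schema as shown above: {"entries": [...], "last_updated": ..., "version": ...}).
import json
import pandas as pd

with open("leaderboard_data.json", encoding="utf-8") as f:
    payload = json.load(f)

df = pd.DataFrame(payload["entries"])
print(payload["version"], payload["last_updated"])
print(df[["model_name", "pass_at_1", "bleu_at_10"]])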
requirements.txt
CHANGED
@@ -1,19 +1,8 @@
-
-
-
-
-
-
-
-
-matplotlib
-numpy
-pandas>=1.3.0
-python-dateutil
-tqdm
-transformers
-tokenizers>=0.15.0
-sentencepiece
-fastapi
-uvicorn
-pydantic>=2.0.0
+gradio==4.44.1
+pandas>=2.0.0
+huggingface_hub>=0.20.0
+datasets>=2.0.0
+apscheduler>=3.10.0
+python-dotenv>=1.0.0
+plotly>=5.18.0
+pydantic==2.10.6
src/about.py
CHANGED
@@ -1,48 +1,60 @@
 """
-
 """
 
-TITLE = "
 
 INTRODUCTION_TEXT = """
-
-
-A comprehensive benchmark for evaluating code review generation models across multiple programming languages and comment types.
 
-
 
-
-
-
-- **Taxonomy Categories**: Performance across different types of code review feedback
 
-
 
--
-- **Pass@1/5/10**: Percentage of reviews that pass quality checks in 1, 5, or 10 attempts
-- **Multi-dimensional Quality Scores**: Detailed evaluation across 10 quality dimensions
 
-
 
-
-✨ **Comment Language Support**: Filter by the natural language of code comments
-✨ **Taxonomy Categories**: Browse results by review type (bug detection, style, performance, etc.)
-✨ **IP-based Submissions**: Secure submission system with IP tracking
-✨ **Dark Theme**: Modern, eye-friendly interface
 """
 
-
-##
 
-
-2. **Format**: Provide scores in the specified format ranges
-3. **Reproducibility**: Include model details and evaluation setup
-4. **Quality Metrics**: Rate your model across all 10 quality dimensions
-5. **Metadata**: Specify programming language, comment language, and taxonomy focus
-"""
 
-
-
 
-
-
 """
+Text content for the CodeReview Bench Leaderboard.
 """
 
+TITLE = """
+<div style="text-align: center; margin-bottom: 1rem">
+    <h1>CodeReview Bench Leaderboard</h1>
+</div>
+"""
 
 INTRODUCTION_TEXT = """
+## Introduction
 
+CodeReview Bench is a comprehensive benchmark for evaluating the quality and effectiveness of automated code review systems.
+This leaderboard tracks model performance across various programming languages and review criteria,
+including readability, relevance, explanation clarity, and actionability.
 
+Models are evaluated on their ability to provide high-quality code reviews that are helpful,
+accurate, and actionable across multiple programming languages and review categories.
+"""
 
+LLM_BENCHMARKS_TEXT = """
+CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
 
+It evaluates models on their ability to provide high-quality code reviews using both LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity) and exact-match metrics (pass@1, pass@5, pass@10, BLEU@10).
 
+The benchmark supports both Russian and English comment languages across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more.
 
+Learn more about automated code review evaluation and best practices.
 """
 
+EVALUATION_QUEUE_TEXT = """
+## Submit Your Model
 
+To add your model to the CodeReview Bench leaderboard:
 
+1. Run your evaluation using the CodeReview Bench framework
+2. Upload your results in .jsonl format using this form.
+3. Once validated, your model will appear on the leaderboard.
 
+### Requirements:
+- Results must include all required metrics: LLM-based multimetric scores and exact-match metrics
+- Submissions should cover multiple programming languages where applicable
+- Both Russian and English comment languages are supported
+
+### ✉️✨ Ready? Upload your results below!
+"""
+
+CITATION_BUTTON_LABEL = "Cite CodeReview Bench"
+
+CITATION_BUTTON_TEXT = """
+@misc{codereviewbench2025,
+    author = {CodeReview Bench Team},
+    title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
+    year = {2025},
+    publisher = {GitHub},
+    journal = {GitHub repository},
+    howpublished = {\\url{https://github.com/your-org/codereview-bench}}
+}
+"""
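
LLM_BENCHMARKS_TEXT above lists pass@1/5/10 among the exact-match metrics. For reference, the standard unbiased pass@k estimator (Chen et al., 2021) is shown below — whether CodeReview Bench computes pass@k in exactly this form is not visible in this diff, so treat it as background rather than the repo's implementation:

# Standard unbiased pass@k estimator (Chen et al., 2021); assumption:
# the benchmark's own computation is not part of this diff.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """n = samples generated, c = samples that pass, k = attempt budget."""
    if n - c < k:
        return 1.0  # a passing sample is guaranteed in any k-subset
    return 1.0 - comb(n - c, k) / comb(n, k)

print(pass_at_k(n=10, c=3, k=1))  # 0.30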
src/display/css_html_js.py
CHANGED
@@ -1,306 +1,97 @@
 """
-
 """
 
-
-
-
-:
-
-
-
-    --text-primary: #e6edf3;
-    --text-secondary: #7d8590;
-    --border-color: #30363d;
-    --accent-color: #ffffff;
-    --accent-hover: #f0f0f0;
-    --danger-color: #da3633;
-    --warning-color: #d29922;
-    --info-color: #1f6feb;
 }
 
-
-
-    background:
-    color:
 }
 
-
-
-    color: var(--text-primary) !important;
 }
 
-.
-
 }
 
-
-
-    background:
-
 }
 
-.
-
-    color:
-    border: none !important;
-    padding: 12px 24px !important;
-    transition: all 0.2s ease !important;
 }
 
-.
-
-    background: var(--bg-tertiary) !important;
 }
 
-.
-
-
-    border-bottom: 2px solid var(--accent-color) !important;
 }
 
-
-
-
-    border:
-    border-radius: 8px !important;
-    overflow: hidden !important;
 }
 
-.
-
 }
 
-.
-
-
-    border-bottom: 2px solid var(--border-color) !important;
-    padding: 12px !important;
-    font-weight: 600 !important;
 }
 
-
-
-
-    border-bottom: 1px solid var(--border-color) !important;
-    padding: 10px 12px !important;
 }
 
-.
-
 }
 
-/*
-.gradio-
-
-
-
-
 }
 
-
-
-
 }
 
-
-
-
-    color: var(--bg-primary) !important;
-    border: 1px solid var(--border-color) !important;
-    border-radius: 6px !important;
-    padding: 8px 16px !important;
-    font-weight: 500 !important;
-    transition: all 0.2s ease !important;
-}
-
-.gradio-container button:hover {
-    background: var(--accent-hover) !important;
-    transform: translateY(-1px) !important;
-    color: var(--bg-primary) !important;
-}
-
-.gradio-container button:active {
-    transform: translateY(0) !important;
-}
-
-/* Dropdowns */
-.gradio-container .dropdown {
-    background: var(--bg-tertiary) !important;
-    border: 1px solid var(--border-color) !important;
-    border-radius: 6px !important;
-}
-
-.gradio-container .dropdown-menu {
-    background: var(--bg-secondary) !important;
-    border: 1px solid var(--border-color) !important;
-    border-radius: 6px !important;
-    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
-}
-
-.gradio-container .dropdown-menu .dropdown-item {
-    color: var(--text-primary) !important;
-    padding: 8px 12px !important;
-}
-
-.gradio-container .dropdown-menu .dropdown-item:hover {
-    background: var(--bg-tertiary) !important;
-}
-
-/* Sliders */
-.gradio-container .slider {
-    background: var(--bg-tertiary) !important;
-}
-
-.gradio-container .slider input[type="range"] {
-    background: var(--bg-tertiary) !important;
-}
-
-.gradio-container .slider input[type="range"]::-webkit-slider-thumb {
-    background: var(--accent-color) !important;
-    border: 2px solid var(--bg-primary) !important;
-    border-radius: 50% !important;
-    width: 18px !important;
-    height: 18px !important;
-}
-
-.gradio-container .slider input[type="range"]::-webkit-slider-track {
-    background: var(--border-color) !important;
-    border-radius: 4px !important;
-    height: 6px !important;
-}
-
-/* Accordions */
-.gradio-container .accordion {
-    background: var(--bg-secondary) !important;
-    border: 1px solid var(--border-color) !important;
-    border-radius: 8px !important;
-    margin: 16px 0 !important;
-}
-
-.gradio-container .accordion-header {
-    background: var(--bg-tertiary) !important;
-    color: var(--text-primary) !important;
-    padding: 16px !important;
-    border-bottom: 1px solid var(--border-color) !important;
-    cursor: pointer !important;
-    font-weight: 500 !important;
-}
-
-.gradio-container .accordion-header:hover {
-    background: var(--bg-primary) !important;
-}
-
-/* Status messages */
-.gradio-container .success {
-    background: rgba(255, 255, 255, 0.1) !important;
-    color: var(--text-primary) !important;
-    border: 1px solid var(--accent-color) !important;
-    border-radius: 6px !important;
-    padding: 12px 16px !important;
-    margin: 8px 0 !important;
-}
-
-.gradio-container .error {
-    background: rgba(218, 54, 51, 0.1) !important;
-    color: var(--danger-color) !important;
-    border: 1px solid var(--danger-color) !important;
-    border-radius: 6px !important;
-    padding: 12px 16px !important;
-    margin: 8px 0 !important;
-}
-
-/* Responsive design */
-@media (max-width: 768px) {
-    .gradio-container {
-        padding: 16px !important;
-    }
-
-    .gradio-container .tab-nav button {
-        padding: 8px 16px !important;
-        font-size: 14px !important;
-    }
-
-    .gradio-container .dataframe {
-        font-size: 14px !important;
-    }
 }
 """
-
-# Custom JavaScript for enhanced functionality
-CUSTOM_JS = """
-// Enhanced table sorting and filtering
-function enhanceTable() {
-    const tables = document.querySelectorAll('.dataframe table');
-    tables.forEach(table => {
-        // Add sorting functionality
-        const headers = table.querySelectorAll('th');
-        headers.forEach((header, index) => {
-            header.style.cursor = 'pointer';
-            header.addEventListener('click', () => sortTable(table, index));
-        });
-    });
-}
-
-function sortTable(table, columnIndex) {
-    const tbody = table.querySelector('tbody');
-    const rows = Array.from(tbody.querySelectorAll('tr'));
-
-    rows.sort((a, b) => {
-        const aText = a.cells[columnIndex].textContent.trim();
-        const bText = b.cells[columnIndex].textContent.trim();
-
-        // Try to parse as numbers first
-        const aNum = parseFloat(aText);
-        const bNum = parseFloat(bText);
-
-        if (!isNaN(aNum) && !isNaN(bNum)) {
-            return bNum - aNum; // Descending for numbers
-        }
-
-        return aText.localeCompare(bText); // Ascending for text
-    });
-
-    rows.forEach(row => tbody.appendChild(row));
-}
-
-// Auto-refresh functionality
-function autoRefresh() {
-    setInterval(() => {
-        const refreshBtn = document.querySelector('button[aria-label="Refresh"]');
-        if (refreshBtn) {
-            refreshBtn.click();
-        }
-    }, 30000); // Refresh every 30 seconds
-}
-
-// Initialize enhancements
-document.addEventListener('DOMContentLoaded', function() {
-    enhanceTable();
-    autoRefresh();
-});
-"""
-
-# HTML components
-HEADER_HTML = """
-<div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-bottom: 20px;">
-    <h1 style="color: var(--text-primary); margin: 0; font-size: 2.5em; font-weight: 700;">
-        🏆 CodeReview Leaderboard
-    </h1>
-    <p style="color: var(--text-secondary); margin: 10px 0 0 0; font-size: 1.2em;">
-        Benchmarking code review generation models across languages and categories
-    </p>
-</div>
-"""
-
-FOOTER_HTML = """
-<div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-top: 20px;">
-    <p style="color: var(--text-secondary); margin: 0; font-size: 0.9em;">
-        Built with ❤️ for the code review community |
-        <a href="https://github.com/your-repo" style="color: var(--accent-color); text-decoration: none;">
-            GitHub
-        </a>
-    </p>
-</div>
-"""
 """
+CSS and styling for the CodeReview Bench Leaderboard.
 """
 
+custom_css = """
+.markdown-text {
+    font-size: 16px !important;
+    text-align: justify !important;
+    line-height: 1.0 !important;
+    margin-top: 10px !important;
+    margin-bottom: 10px !important;
 }
 
+.tab-buttons button.selected {
+    border-color: #f4f4f5 !important;
+    background: #3f3f46 !important;
+    color: #f4f4f5 !important;
 }
 
+#citation-button textarea {
+    font-family: monospace !important;
 }
 
+.leaderboard-container {
+    margin-top: 20px;
 }
 
+.category-header {
+    font-weight: bold;
+    background-color: #f5f5f5;
+    padding: 10px;
+    margin-top: 15px;
+    border-radius: 5px;
 }
 
+.metric-name {
+    font-weight: bold;
+    color: #a1a1aa !important;
 }
 
+.model-name {
+    font-weight: bold;
 }
 
+.model-link:hover {
+    text-decoration: underline;
+    color: #ffffff !important;
 }
 
+.version-selector {
+    margin: 0 !important;
+    padding: 5px;
+    border-radius: 5px;
 }
 
+.version-selector label {
+    font-weight: bold;
+    color: #f4f4f5 !important;
 }
 
+.version-selector select {
+    border-color: #3f3f46 !important;
+    border-radius: 5px;
 }
 
+/* Make sure the version selector is properly aligned with refresh button */
+.version-selector > .block {
+    padding: 0 !important;
 }
 
+.version-selector > .block > .wrap {
+    position: relative;
+    top: -5px;
 }
 
+/* Force background/border for common layout containers */
+.gradio-row > .block,
+.gradio-column > .block,
+.form,
+.panel {
+    /* background: #18181b !important; */ /* Removed background override */
+    border-color: #27272a80 !important; /* Made border color semi-transparent */
+    border-width: 1px !important; /* Ensure border is visible */
+    border-style: solid !important;
 }
 
+/* Target the specific file upload component area */
+.gradio-file .wrap {
+    /* background: #18181b !important; */ /* Removed background override */
+    border-color: #27272a !important;
 }
 
+#refresh-button {
+    margin-top: 5px !important;
+    margin-bottom: 5px !important;
 }
 """
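
custom_css is a plain CSS string; Gradio attaches such strings through the css= argument of gr.Blocks. The app.py call site is outside the hunks shown here, so the following is a sketch of the usual hookup rather than the repo's exact code:

# Sketch: how a CSS string like custom_css is typically attached in Gradio.
import gradio as gr
from src.display.css_html_js import custom_css

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("CodeReview Bench Leaderboard", elem_classes="markdown-text")

demo.launch()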
src/display/formatting.py
CHANGED
@@ -1,182 +1,71 @@
 """
-Formatting utilities for
 """
 
-import
-
-from datetime import datetime, timezone
 
-def format_score(score: float, precision: int = 3) -> str:
-    """Format a score with specified precision"""
-    if isinstance(score, (int, float)):
-        return f"{score:.{precision}f}"
-    return str(score)
 
-def
-    """
-
-
-    return
 
-def format_model_name(name: str) -> str:
-    """Format model name for display"""
-    # Remove common prefixes and make more readable
-    name = name.strip()
-    if "/" in name:
-        org, model = name.split("/", 1)
-        return f"<span style='color: var(--text-secondary); font-size: 0.9em;'>{org}/</span><strong>{model}</strong>"
-    return f"<strong>{name}</strong>"
 
-def
-    """
-
-
-
-    except:
-        return timestamp
 
-def format_ip_address(ip: str) -> str:
-    """Format IP address for display (partial masking)"""
-    if not ip:
-        return "Unknown"
-
-    # Mask part of IP for privacy
-    parts = ip.split(".")
-    if len(parts) == 4:
-        return f"{parts[0]}.{parts[1]}.{parts[2]}.xxx"
-    return "xxx.xxx.xxx.xxx"
 
-def
-    """
-
-
-
-
-
-        color = "#ffffff"  # White
-    elif score >= 6:
-        color = "#d0d0d0"  # Light gray
-    elif score >= 4:
-        color = "#a0a0a0"  # Gray
-    else:
-        color = "#707070"  # Dark gray
-
-    return f"<span style='color: {color}; font-weight: 600;'>{score}</span>"
 
-def format_language_badge(language: str) -> str:
-    """Format programming language as a badge"""
-    if not language or language == "All":
-        return language
-
-    # Language-specific colors
-    colors = {
-        "Python": "#3776ab",
-        "JavaScript": "#f7df1e",
-        "Java": "#ed8b00",
-        "C++": "#00599c",
-        "C#": "#239120",
-        "Go": "#00add8",
-        "Rust": "#ce422b",
-        "TypeScript": "#3178c6",
-        "PHP": "#777bb4",
-        "Ruby": "#cc342d",
-        "Swift": "#fa7343",
-        "Kotlin": "#7f52ff",
-        "Scala": "#dc322f",
-        "R": "#276dc3",
-        "MATLAB": "#e16737"
-    }
-
-    color = colors.get(language, "#6c757d")
-    return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{language}</span>"
 
-def
-    """
-
-
-
-
-
-        "Bug Detection": "#dc3545",
-        "Code Style": "#6f42c1",
-        "Performance": "#fd7e14",
-        "Security": "#e83e8c",
-        "Maintainability": "#ffffff",
-        "Documentation": "#17a2b8",
-        "Testing": "#ffffff",
-        "Architecture": "#6c757d",
-        "Best Practices": "#007bff",
-        "Refactoring": "#ffc107"
-    }
-
-    color = colors.get(category, "#6c757d")
-    return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{category}</span>"
 
-def format_comment_language_flag(language: str) -> str:
-    """Format comment language with flag emoji"""
-    if not language or language == "All":
-        return language
-
-    # Language-specific flags
-    flags = {
-        "English": "🇺🇸",
-        "Chinese": "🇨🇳",
-        "Spanish": "🇪🇸",
-        "French": "🇫🇷",
-        "German": "🇩🇪",
-        "Japanese": "🇯🇵",
-        "Korean": "🇰🇷",
-        "Russian": "🇷🇺",
-        "Portuguese": "🇵🇹",
-        "Italian": "🇮🇹",
-        "Dutch": "🇳🇱"
-    }
-
-    flag = flags.get(language, "🌐")
-    return f"{flag} {language}"
 
-def
-    """
-
-
-
-    #
-
-
-
-    text = re.sub(r'on\w+=\'[^\']*\'', '', text, flags=re.IGNORECASE)
-
-    return text
 
-def truncate_text(text: str, max_length: int = 50) -> str:
-    """Truncate text with ellipsis"""
-    if not isinstance(text, str):
-        text = str(text)
-
-    if len(text) <= max_length:
-        return text
-
-    return text[:max_length-3] + "..."
 
-def
-    """
-
-
-
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-    else:
-        return sanitize_html(str(value))
 """
+Formatting utilities for the GuardBench Leaderboard.
 """
 
+import pandas as pd
+import numpy as np
 
 
+def make_clickable_model(model_name: str) -> str:
+    """
+    Create a clickable link for a model name.
+    """
+    return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
 
 
+def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
+    """
+    Check if a row has no NaN values in the specified columns.
+    """
+    return ~df[columns].isna().any(axis=1)
 
 
+def format_percentage(value: float) -> str:
+    """
+    Format a value as a percentage.
+    """
+    if pd.isna(value):
+        return "N/A"
+    return f"{value * 100:.2f}%"
 
 
+def format_number(value: float, precision: int = 2) -> str:
+    """
+    Format a number with specified precision.
+    """
+    if pd.isna(value):
+        return "N/A"
+    return f"{value:.{precision}f}"
 
 
+def styled_message(message: str) -> str:
+    """
+    Format a success message with styling.
+    """
+    return f"""
+    <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
+        ✅ {message}
+    </div>
+    """
+
+
+def styled_warning(message: str) -> str:
+    """
+    Format a warning message with styling.
+    """
+    return f"""
+    <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
+        ⚠️ {message}
+    </div>
+    """
+
+
+def styled_error(message: str) -> str:
+    """
+    Format an error message with styling.
+    """
+    return f"""
+    <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
+        ❌ {message}
+    </div>
+    """
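
For a quick feel of the new helpers, here is an illustrative usage (the values are made up):

# Illustrative usage of the formatting helpers defined above.
from src.display.formatting import (
    make_clickable_model, format_percentage, format_number, styled_error
)

print(make_clickable_model("openai/gpt-4"))   # HTML link to the Hub page
print(format_percentage(0.9234))              # "92.34%"
print(format_number(8.456, precision=2))      # "8.46"
print(styled_error("Submission file is not valid JSONL."))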
src/display/utils.py
CHANGED
@@ -1,292 +1,417 @@
 """
-
 """
 
-from
-import
-from
-from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
-from src.display.formatting import format_table_cell, format_timestamp
-
-def filter_leaderboard_data(
-    data: List[Dict],
-    programming_language: str = "All",
-    comment_language: str = "All",
-    taxonomy_category: str = "All",
-    sort_by: str = "llm_pass_1",
-    sort_order: str = "desc"
-) -> List[Dict]:
-    """Filter and sort leaderboard data based on criteria"""
-
-    if not data:
-        return []
-
-    # Apply filters
-    filtered_data = data.copy()
-
-    if programming_language != "All":
-        filtered_data = [
-            entry for entry in filtered_data
-            if entry.get("programming_language", "").lower() == programming_language.lower()
-        ]
-
-    if comment_language != "All":
-        filtered_data = [
-            entry for entry in filtered_data
-            if entry.get("comment_language", "").lower() == comment_language.lower()
-        ]
-
-    if taxonomy_category != "All":
-        filtered_data = [
-            entry for entry in filtered_data
-            if entry.get("taxonomy_category", "").lower() == taxonomy_category.lower()
-        ]
-
-    # Sort data
-    reverse = sort_order.lower() == "desc"
-
-    try:
-        if sort_by in ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]:
-            filtered_data.sort(key=lambda x: x.get(sort_by, 0), reverse=reverse)
-        elif sort_by in QUALITY_METRICS:
-            filtered_data.sort(key=lambda x: x.get("metrics", {}).get(sort_by, 0), reverse=reverse)
-        else:
-            filtered_data.sort(key=lambda x: str(x.get(sort_by, "")), reverse=reverse)
-    except Exception as e:
-        print(f"Error sorting data: {e}")
-        # Default sort by pass@1
-        filtered_data.sort(key=lambda x: x.get("llm_pass_1", 0), reverse=True)
-
-    return filtered_data
-
-def get_main_leaderboard_data(
-    data: List[Dict],
-    programming_language: str = "All",
-    comment_language: str = "All",
-    taxonomy_category: str = "All",
-    sort_by: str = "llm_pass_1"
-) -> List[List[str]]:
-    """Get formatted main leaderboard table data"""
-
-    filtered_data = filter_leaderboard_data(
-        data, programming_language, comment_language, taxonomy_category, sort_by
-    )
-
-    table_rows = []
-    for entry in filtered_data:
-        row = [
-            format_table_cell(entry.get("model_name", ""), "model"),
-            format_table_cell(entry.get("programming_language", ""), "programming language"),
-            format_table_cell(entry.get("comment_language", ""), "comment language"),
-            format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
-            format_table_cell(entry.get("bleu", 0), "bleu"),
-            format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
-            format_table_cell(entry.get("llm_pass_5", 0), "pass@5"),
-            format_table_cell(entry.get("llm_pass_10", 0), "pass@10"),
-        ]
-        table_rows.append(row)
-
-    return table_rows
-
-def get_quality_metrics_data(
-    data: List[Dict],
-    programming_language: str = "All",
-    comment_language: str = "All",
-    taxonomy_category: str = "All",
-    sort_by: str = "llm_pass_1"
-) -> List[List[str]]:
-    """Get formatted quality metrics table data"""
-
-    filtered_data = filter_leaderboard_data(
-        data, programming_language, comment_language, taxonomy_category, sort_by
-    )
-
-    table_rows = []
-    for entry in filtered_data:
-        metrics = entry.get("metrics", {})
-        row = [format_table_cell(entry.get("model_name", ""), "model")]
-
-        for metric in QUALITY_METRICS:
-            formatted_value = format_table_cell(metrics.get(metric, 0), metric.replace("_", " "))
-            row.append(formatted_value)
-
-        table_rows.append(row)
-
-    return table_rows
-
-def get_submission_history_data(
-    data: List[Dict],
-    programming_language: str = "All",
-    comment_language: str = "All",
-    taxonomy_category: str = "All",
-    limit: int = 50
-) -> List[List[str]]:
-    """Get formatted submission history data"""
-
-    filtered_data = filter_leaderboard_data(
-        data, programming_language, comment_language, taxonomy_category, "submission_date", "desc"
-    )
-
-    # Limit results
-    filtered_data = filtered_data[:limit]
-
-    table_rows = []
-    for entry in filtered_data:
-        row = [
-            format_table_cell(entry.get("model_name", ""), "model"),
-            format_table_cell(entry.get("programming_language", ""), "programming language"),
-            format_table_cell(entry.get("comment_language", ""), "comment language"),
-            format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
-            format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
-            format_timestamp(entry.get("submission_date", "")),
-            entry.get("submission_ip", "").split(".")[0] + ".xxx.xxx.xxx" if entry.get("submission_ip") else "Unknown"
-        ]
-        table_rows.append(row)
-
-    return table_rows
 
-def get_statistics_summary(data: List[Dict]) -> Dict[str, Any]:
-    """Get summary statistics for the leaderboard"""
-
-    if not data:
-        return {
-            "total_models": 0,
-            "total_submissions": 0,
-            "avg_pass_1": 0,
-            "best_model": "None",
-            "languages_covered": 0,
-            "categories_covered": 0
-        }
-
-    # Calculate statistics
-    total_models = len(set(entry.get("model_name", "") for entry in data))
-    total_submissions = len(data)
-
-    pass_1_scores = [entry.get("llm_pass_1", 0) for entry in data if entry.get("llm_pass_1") is not None]
-    avg_pass_1 = sum(pass_1_scores) / len(pass_1_scores) if pass_1_scores else 0
-
-    best_entry = max(data, key=lambda x: x.get("llm_pass_1", 0)) if data else None
-    best_model = best_entry.get("model_name", "None") if best_entry else "None"
-
-    languages_covered = len(set(entry.get("programming_language", "") for entry in data if entry.get("programming_language")))
-    categories_covered = len(set(entry.get("taxonomy_category", "") for entry in data if entry.get("taxonomy_category")))
-
-    return {
-        "total_models": total_models,
-        "total_submissions": total_submissions,
-        "avg_pass_1": avg_pass_1,
-        "best_model": best_model,
-        "languages_covered": languages_covered,
-        "categories_covered": categories_covered
-    }
-
-def validate_submission_data(data: Dict[str, Any]) -> Tuple[bool, str]:
-    """Validate submission data"""
-
-    required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
-
-    # Check required fields
-    for field in required_fields:
-        if not data.get(field):
-            return False, f"Missing required field: {field}"
-
-    # Validate scores
-    score_fields = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
-    for field in score_fields:
-        value = data.get(field)
-        if value is None:
-            return False, f"Missing score: {field}"
-        if not isinstance(value, (int, float)):
-            return False, f"Invalid score format: {field}"
-        if not 0 <= value <= 1:
-            return False, f"Score out of range (0-1): {field}"
-
-    # Validate metrics
-    metrics = data.get("metrics", {})
-    for metric in QUALITY_METRICS:
-        value = metrics.get(metric)
-        if value is None:
-            return False, f"Missing metric: {metric}"
-        if not isinstance(value, (int, float)):
-            return False, f"Invalid metric format: {metric}"
-        if not 0 <= value <= 10:
-            return False, f"Metric out of range (0-10): {metric}"
-
-    # Validate language and category choices
-    if data.get("programming_language") not in PROGRAMMING_LANGUAGES:
-        return False, "Invalid programming language"
-
-    if data.get("comment_language") not in COMMENT_LANGUAGES:
-        return False, "Invalid comment language"
-
-    if data.get("taxonomy_category") not in TAXONOMY_CATEGORIES:
-        return False, "Invalid taxonomy category"
-
-    return True, "Valid submission"
 
-
-"""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
 """
+Utility classes and functions for the CodeReview Bench Leaderboard display.
 """
 
+from dataclasses import dataclass, field, fields
+from enum import Enum, auto
+from typing import List, Optional
 
 
+class Mode(Enum):
+    """Inference mode for the review model."""
+    CoT = auto()  # Chain of Thought
+    Strict = auto()
+
+    def __str__(self):
+        """String representation of the mode."""
+        return self.name
+
+
+class ModelType(Enum):
+    """Model types for the leaderboard."""
+    Unknown = auto()
+    OpenSource = auto()
+    ClosedSource = auto()
+    API = auto()
+
+    def to_str(self, separator: str = "-") -> str:
+        """Convert enum to string with separator."""
+        if self == ModelType.Unknown:
+            return "Unknown"
+        elif self == ModelType.OpenSource:
+            return f"Open{separator}Source"
+        elif self == ModelType.ClosedSource:
+            return f"Closed{separator}Source"
+        elif self == ModelType.API:
+            return "API"
+        return "Unknown"
+
+
+class ReviewModelType(str, Enum):
+    """Review model types for the leaderboard."""
+    GPT_4 = "gpt-4"
+    GPT_3_5 = "gpt-3.5-turbo"
+    CLAUDE = "claude"
+    LLAMA = "llama"
+    GEMINI = "gemini"
+    CUSTOM = "custom"
+
+    def __str__(self):
+        """String representation of the review model type."""
+        return self.value
+
+
+class Precision(Enum):
+    """Model precision types."""
+    Unknown = auto()
+    float16 = auto()
+    bfloat16 = auto()
+    float32 = auto()
+    int8 = auto()
+    int4 = auto()
+    NA = auto()
+
+    def __str__(self):
+        """String representation of the precision type."""
+        return self.name
+
+
+class WeightType(Enum):
+    """Model weight types."""
+    Original = auto()
+    Delta = auto()
+    Adapter = auto()
+
+    def __str__(self):
+        """String representation of the weight type."""
+        return self.name
+
+
+@dataclass
+class ColumnInfo:
+    """Information about a column in the leaderboard."""
+    name: str
+    display_name: str
+    type: str = "text"
+    hidden: bool = False
+    never_hidden: bool = False
+    displayed_by_default: bool = True
+
+
+@dataclass
+class CodeReviewBenchColumn:
+    """Columns for the CodeReview Bench leaderboard."""
+    # Core metadata
+    model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="model_name",
+        display_name="Model",
+        never_hidden=True,
+        displayed_by_default=True
+    ))
+    mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="mode",
+        display_name="Mode",
+        displayed_by_default=True
+    ))
+    model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="model_type",
+        display_name="Access_Type",
+        displayed_by_default=True
+    ))
+    submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="submission_date",
+        display_name="Submission_Date",
+        displayed_by_default=False
+    ))
+    version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="version",
+        display_name="Version",
+        displayed_by_default=False
+    ))
+    review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="review_model_type",
+        display_name="Type",
+        displayed_by_default=False
+    ))
+    base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="base_model",
+        display_name="Base Model",
+        displayed_by_default=False
+    ))
+    revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="revision",
+        display_name="Revision",
+        displayed_by_default=False
+    ))
+    precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="precision",
+        display_name="Precision",
+        displayed_by_default=False
+    ))
+    weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="weight_type",
+        display_name="Weight Type",
+        displayed_by_default=False
+    ))
+    topic: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="topic",
+        display_name="Topic",
+        displayed_by_default=True
+    ))
+
+    # LLM-based multimetric scores
+    readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="readability",
+        display_name="Readability",
+        type="number",
+        displayed_by_default=True
+    ))
+    relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="relevance",
+        display_name="Relevance",
+        type="number",
+        displayed_by_default=True
+    ))
+    explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="explanation_clarity",
+        display_name="Explanation_Clarity",
+        type="number",
+        displayed_by_default=True
+    ))
+    problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="problem_identification",
+        display_name="Problem_Identification",
+        type="number",
+        displayed_by_default=True
+    ))
+    actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="actionability",
+        display_name="Actionability",
+        type="number",
+        displayed_by_default=True
+    ))
+    completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="completeness",
+        display_name="Completeness",
+        type="number",
+        displayed_by_default=True
+    ))
+    specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="specificity",
+        display_name="Specificity",
+        type="number",
+        displayed_by_default=True
+    ))
+    contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="contextual_adequacy",
+        display_name="Contextual_Adequacy",
+        type="number",
+        displayed_by_default=True
+    ))
+    consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="consistency",
+        display_name="Consistency",
+        type="number",
+        displayed_by_default=True
+    ))
+    brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="brevity",
+        display_name="Brevity",
+        type="number",
+        displayed_by_default=True
+    ))
+
+    # LLM-based-exact-match metrics
+    pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="pass_at_1",
+        display_name="Pass@1",
+        type="number",
+        displayed_by_default=True
+    ))
+    pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="pass_at_5",
+        display_name="Pass@5",
+        type="number",
+        displayed_by_default=True
+    ))
+    pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="pass_at_10",
+        display_name="Pass@10",
+        type="number",
+        displayed_by_default=True
+    ))
+    bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="bleu_at_10",
+        display_name="BLEU@10",
+        type="number",
+        displayed_by_default=True
+    ))
+
+    # Overall aggregated metrics
+    overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="overall_score",
+        display_name="Overall_Score",
+        type="number",
+        displayed_by_default=True
+    ))
+    multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
+        name="multimetric_average",
+        display_name="Multimetric_Average",
+        type="number",
+        displayed_by_default=True
+    ))
+
exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
254 |
+
name="exact_match_average",
|
255 |
+
display_name="Exact_Match_Average",
|
256 |
+
type="number",
|
257 |
+
displayed_by_default=True
|
258 |
+
))
|
259 |
+
total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
260 |
+
name="total_evaluations",
|
261 |
+
display_name="Total_Evaluations",
|
262 |
+
type="number",
|
263 |
+
displayed_by_default=True
|
264 |
+
))
|
265 |
+
|
266 |
+
# Language-specific metrics (Russian)
|
267 |
+
ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
268 |
+
name="ru_readability",
|
269 |
+
display_name="RU_Readability",
|
270 |
+
type="number",
|
271 |
+
displayed_by_default=False
|
272 |
+
))
|
273 |
+
ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
274 |
+
name="ru_relevance",
|
275 |
+
display_name="RU_Relevance",
|
276 |
+
type="number",
|
277 |
+
displayed_by_default=False
|
278 |
+
))
|
279 |
+
ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
280 |
+
name="ru_overall_score",
|
281 |
+
display_name="RU_Overall_Score",
|
282 |
+
type="number",
|
283 |
+
displayed_by_default=False
|
284 |
+
))
|
285 |
+
|
286 |
+
# Language-specific metrics (English)
|
287 |
+
en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
288 |
+
name="en_readability",
|
289 |
+
display_name="EN_Readability",
|
290 |
+
type="number",
|
291 |
+
displayed_by_default=False
|
292 |
+
))
|
293 |
+
en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
294 |
+
name="en_relevance",
|
295 |
+
display_name="EN_Relevance",
|
296 |
+
type="number",
|
297 |
+
displayed_by_default=False
|
298 |
+
))
|
299 |
+
en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
|
300 |
+
name="en_overall_score",
|
301 |
+
display_name="EN_Overall_Score",
|
302 |
+
type="number",
|
303 |
+
displayed_by_default=False
|
304 |
+
))
|
305 |
+
|
306 |
+
|
307 |
+
# Create instances for easy access
|
308 |
+
CODEREVIEW_COLUMN = CodeReviewBenchColumn()
|
309 |
+
|
310 |
+
# Extract column lists for different views
|
311 |
+
COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
|
312 |
+
DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
|
313 |
+
if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
|
314 |
+
|
315 |
+
# Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
|
316 |
+
def reorder_display_cols():
|
317 |
+
cols = DISPLAY_COLS
|
318 |
+
if 'model_name' in cols and 'mode' in cols:
|
319 |
+
cols.remove('mode')
|
320 |
+
model_name_index = cols.index('model_name')
|
321 |
+
cols.insert(model_name_index + 1, 'mode')
|
322 |
+
return cols
|
323 |
+
DISPLAY_COLS = reorder_display_cols()
|
324 |
+
|
325 |
+
METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
|
326 |
+
if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
|
327 |
+
HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
|
328 |
+
if getattr(CODEREVIEW_COLUMN, f.name).hidden]
|
329 |
+
NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
|
330 |
+
if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
|
331 |
+
|
332 |
+
# Categories for CodeReview Bench (Programming Languages)
|
333 |
+
CATEGORIES = [
|
334 |
+
'Python',
|
335 |
+
'Java',
|
336 |
+
'Scala',
|
337 |
+
'Go'
|
338 |
+
]
|
339 |
+
|
340 |
+
# Language taxonomies for CodeReview Bench
|
341 |
+
COMMENT_LANGUAGES = [
|
342 |
+
'ru', # Russian
|
343 |
+
'en' # English
|
344 |
+
]
|
345 |
+
|
346 |
+
# Topics for CodeReview Bench
|
347 |
+
TOPICS = [
|
348 |
+
'Code Reliability',
|
349 |
+
'Coding Standards',
|
350 |
+
'Code Organization',
|
351 |
+
'Performance Issues',
|
352 |
+
'Validation',
|
353 |
+
'Variables'
|
354 |
+
]
|
355 |
+
|
356 |
+
# Example categories
|
357 |
+
EXAMPLE_CATEGORIES = [
|
358 |
+
'Bug_Fix',
|
359 |
+
'Code_Style',
|
360 |
+
'Performance',
|
361 |
+
'Security',
|
362 |
+
'Refactoring',
|
363 |
+
'Documentation',
|
364 |
+
'Testing',
|
365 |
+
'Architecture',
|
366 |
+
'Other'
|
367 |
+
]
|
368 |
+
|
369 |
+
# Metrics for CodeReview Bench
|
370 |
+
MULTIMETRIC_METRICS = [
|
371 |
+
"readability",
|
372 |
+
"relevance",
|
373 |
+
"explanation_clarity",
|
374 |
+
"problem_identification",
|
375 |
+
"actionability",
|
376 |
+
"completeness",
|
377 |
+
"specificity",
|
378 |
+
"contextual_adequacy",
|
379 |
+
"consistency",
|
380 |
+
"brevity"
|
381 |
+
]
|
382 |
+
|
383 |
+
EXACT_MATCH_METRICS = [
|
384 |
+
"pass_at_1",
|
385 |
+
"pass_at_5",
|
386 |
+
"pass_at_10",
|
387 |
+
"bleu_at_10"
|
388 |
+
]
|
389 |
+
|
390 |
+
def get_all_column_choices():
|
391 |
+
"""
|
392 |
+
Get all available column choices for the multiselect dropdown.
|
393 |
+
|
394 |
+
Returns:
|
395 |
+
List of tuples with (column_name, display_name) for all columns.
|
396 |
+
"""
|
397 |
+
column_choices = []
|
398 |
+
|
399 |
+
default_visible_columns = get_default_visible_columns()
|
400 |
+
|
401 |
+
for f in fields(CODEREVIEW_COLUMN):
|
402 |
+
column_info = getattr(CODEREVIEW_COLUMN, f.name)
|
403 |
+
# Create a tuple with both the internal name and display name
|
404 |
+
if column_info.name not in default_visible_columns:
|
405 |
+
column_choices.append((column_info.name, column_info.display_name))
|
406 |
+
|
407 |
+
return column_choices
|
408 |
+
|
409 |
+
def get_default_visible_columns():
|
410 |
+
"""
|
411 |
+
Get the list of column names that should be visible by default.
|
412 |
+
|
413 |
+
Returns:
|
414 |
+
List of column names that are displayed by default.
|
415 |
+
"""
|
416 |
+
return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
|
417 |
+
if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
|
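A minimal usage sketch of the column registry above (not part of the commit): only get_all_column_choices and get_default_visible_columns come from this module; the Gradio wiring is an assumption about how app.py might consume them.

    import gradio as gr

    from src.display.utils import get_all_column_choices, get_default_visible_columns

    visible = get_default_visible_columns()   # columns shown on load
    optional = get_all_column_choices()       # [(internal_name, display_name), ...] hidden by default

    # Offer only the optional columns as extras in a multiselect picker
    extra_columns = gr.Dropdown(
        choices=[internal for internal, _ in optional],
        multiselect=True,
        label="Additional columns",
    )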
src/envs.py
CHANGED
@@ -1,106 +1,27 @@
-"""
-Environment configuration and constants
-"""
-
 import os
-from pathlib import Path
-
-# Data paths
-DATA_DIR = Path("data")
-LEADERBOARD_PATH = DATA_DIR / "leaderboard_data.json"
-SUBMISSIONS_PATH = DATA_DIR / "submissions.json"
-
-# Create data directory if it doesn't exist
-DATA_DIR.mkdir(exist_ok=True)
-
-# Programming languages supported
-PROGRAMMING_LANGUAGES = [
-    "All",
-    "Python",
-    "JavaScript",
-    "Java",
-    "C++",
-    "C#",
-    "Go",
-    "Rust",
-    "TypeScript",
-    "PHP",
-    "Ruby",
-    "Swift",
-    "Kotlin",
-    "Scala",
-    "R",
-    "MATLAB",
-    "Other"
-]

-#
⋮
-    "All",
-    "English",
-    "Chinese",
-    "Spanish",
-    "French",
-    "German",
-    "Japanese",
-    "Korean",
-    "Russian",
-    "Portuguese",
-    "Italian",
-    "Dutch",
-    "Other"
-]

-#
⋮
-    "Security",
-    "Maintainability",
-    "Documentation",
-    "Testing",
-    "Architecture",
-    "Best Practices",
-    "Refactoring",
-    "Other"
-]

-#
⋮
-    "relevance",
-    "explanation_clarity",
-    "problem_identification",
-    "actionability",
-    "completeness",
-    "specificity",
-    "contextual_adequacy",
-    "consistency",
-    "brevity"
-]

-#
⋮

-#
⋮
-    "model_name": "example/model",
-    "programming_language": "Python",
-    "comment_language": "English",
-    "taxonomy_category": "Bug Detection",
-    "bleu": 0.5,
-    "llm_pass_1": 0.5,
-    "llm_pass_5": 0.5,
-    "llm_pass_10": 0.5,
-    "metrics": {
-        "readability": 5, "relevance": 5, "explanation_clarity": 5,
-        "problem_identification": 5, "actionability": 5, "completeness": 5,
-        "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
-    },
-    "submission_ip": "127.0.0.1",
-    "submission_date": "2024-01-01T00:00:00Z"
-}]
+from huggingface_hub import HfApi
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Hugging Face configuration
+TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org
+OWNER = os.environ.get("OWNER", "codereview-bench")  # Change to your org
+SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
+ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
+ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
+
+# Repository IDs
+REPO_ID = f"{OWNER}/codereview-bench"
+RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results")
+
+# Cache paths
+CACHE_PATH = os.getenv("HF_HOME", ".")
+DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
+
+# Local data paths
+LEADERBOARD_FILE = os.path.join(DATA_PATH, "leaderboard.json")
+
+# HF API instance
+API = HfApi(token=TOKEN)
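Because the token values above default to None when unset, a misconfigured Space only fails once the Hub is actually hit. A fail-fast sketch (an assumption, not a file in this commit) using the same variable names:

    import os

    # Variable names as read by src/envs.py above
    REQUIRED_VARS = ["HF_TOKEN", "OWNER", "RESULTS_DATASET_ID", "SUBMITTER_TOKEN"]

    missing = [name for name in REQUIRED_VARS if not os.environ.get(name)]
    if missing:
        raise SystemExit(f"Missing required environment variables: {', '.join(missing)}")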
src/leaderboard/processor.py
CHANGED
@@ -1,306 +1,271 @@
 """
-
+Process CodeReview Bench leaderboard data and submissions.
 """

 import json
-import
-
-from datetime import datetime
-from
⋮
-        if not self.leaderboard_path.exists():
-            self.save_leaderboard_data(DEFAULT_DATA)
-
-        """Load leaderboard data from storage"""
-        try:
-            with open(self.leaderboard_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-            return data.get("leaderboard", [])
-        except Exception as e:
-            print(f"Error loading leaderboard: {e}")
-            return DEFAULT_DATA.copy()
⋮
-                "last_updated": datetime.now(timezone.utc).isoformat(),
-                "total_entries": len(data)
-            }
-            with open(self.leaderboard_path, 'w', encoding='utf-8') as f:
-                json.dump(to_store, f, indent=2, ensure_ascii=False)
-            return True
-        except Exception as e:
-            print(f"Error saving leaderboard: {e}")
-            return False
-
-        """Load submission log from storage"""
-        try:
-            with open(self.submissions_path, 'r', encoding='utf-8') as f:
-                data = json.load(f)
-            return data.get("submissions", [])
-        except Exception as e:
-            print(f"Error loading submission log: {e}")
-            return []
-
-        try:
-            to_store = {
-                "submissions": submissions,
-                "last_updated": datetime.now(timezone.utc).isoformat(),
-                "total_submissions": len(submissions)
-            }
-            with open(self.submissions_path, 'w', encoding='utf-8') as f:
-                json.dump(to_store, f, indent=2, ensure_ascii=False)
-            return True
-        except Exception as e:
-            print(f"Error saving submission log: {e}")
-            return False
⋮
 else:
⋮
-            ])
-            return {
-                **basic_stats,
-                "recent_submissions_7d": recent_submissions,
-                "total_logged_submissions": len(submissions),
-                "last_updated": datetime.now(timezone.utc).isoformat()
-            }
-        except Exception as e:
-            print(f"Error getting leaderboard stats: {e}")
-            return {}
-
-    def backup_data(self) -> bool:
-        """Create backup of current data"""
-        try:
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-            backup_dir = Path("backups")
-            backup_dir.mkdir(exist_ok=True)
-
-            # Backup leaderboard
-            if self.leaderboard_path.exists():
-                backup_path = backup_dir / f"leaderboard_{timestamp}.json"
-                with open(self.leaderboard_path, 'r') as src, open(backup_path, 'w') as dst:
-                    dst.write(src.read())
-
-            # Backup submissions
-            if self.submissions_path.exists():
-                backup_path = backup_dir / f"submissions_{timestamp}.json"
-                with open(self.submissions_path, 'r') as src, open(backup_path, 'w') as dst:
-                    dst.write(src.read())
-
-            return True
-
-        except Exception as e:
-            print(f"Error creating backup: {e}")
-            return False
-
-    def export_data(self, format_type: str = "json") -> str:
-        """Export leaderboard data in specified format"""
-        try:
-            from src.display.utils import export_leaderboard_data
-
-            data = self.load_leaderboard_data()
-            return export_leaderboard_data(data, format_type)
-
-        except Exception as e:
-            print(f"Error exporting data: {e}")
-            return f"Export failed: {str(e)}"
-
-    def validate_data_integrity(self) -> Dict[str, Any]:
-        """Validate data integrity and return report"""
-        try:
-            data = self.load_leaderboard_data()
-            submissions = self.load_submission_log()
-
-            issues = []
-
-            # Check for duplicate models
-            model_names = [entry.get("model_name") for entry in data]
-            duplicates = [name for name in model_names if model_names.count(name) > 1]
-            if duplicates:
-                issues.append(f"Duplicate models found: {set(duplicates)}")
-
-            # Check for missing required fields
-            required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
-            for i, entry in enumerate(data):
-                missing = [field for field in required_fields if not entry.get(field)]
-                if missing:
-                    issues.append(f"Entry {i}: Missing fields {missing}")
-
-            # Check score ranges
-            for i, entry in enumerate(data):
-                scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
-                for score in scores:
-                    value = entry.get(score)
-                    if value is not None and (value < 0 or value > 1):
-                        issues.append(f"Entry {i}: {score} out of range: {value}")
-
-            return {
-                "is_valid": len(issues) == 0,
-                "issues": issues,
-                "total_entries": len(data),
-                "total_submissions": len(submissions),
-                "check_date": datetime.now(timezone.utc).isoformat()
-            }
-
-        except Exception as e:
-            return {
-                "is_valid": False,
-                "issues": [f"Validation failed: {str(e)}"],
-                "total_entries": 0,
-                "total_submissions": 0,
-                "check_date": datetime.now(timezone.utc).isoformat()
-            }
+import os
+import pandas as pd
+from datetime import datetime
+from typing import Dict, List, Tuple, Optional
+import numpy as np
+
+from src.display.utils import (
+    CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
+    MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
+)
+
+
+def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
+    """
+    Process a JSONL submission file for CodeReview Bench.
+
+    Args:
+        file_path: Path to the JSONL submission file
+
+    Returns:
+        Tuple of (entries_list, message)
+    """
+    try:
+        entries = []
+        with open(file_path, 'r', encoding='utf-8') as f:
+            for line_num, line in enumerate(f, 1):
+                line = line.strip()
+                if not line:
+                    continue
+
+                try:
+                    entry = json.loads(line)
+
+                    # Validate required fields
+                    required_fields = ['model_name', 'programming_language', 'comment_language']
+                    missing_fields = [field for field in required_fields if field not in entry]
+                    if missing_fields:
+                        return [], f"Missing required fields {missing_fields} in line {line_num}"
+
+                    # Validate metrics exist
+                    has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
+                    has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
+
+                    if not has_multimetric and not has_exact_match:
+                        return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
+
+                    entries.append(entry)
+
+                except json.JSONDecodeError as e:
+                    return [], f"Invalid JSON in line {line_num}: {e}"
+
+        if not entries:
+            return [], "No valid entries found in submission file"
+
+        return entries, f"Successfully processed {len(entries)} entries"
+
+    except Exception as e:
+        return [], f"Error processing submission: {e}"
+
+
+def calculate_overall_score(entry: Dict) -> float:
+    """
+    Calculate overall score for a CodeReview Bench entry.
+
+    Args:
+        entry: Dictionary containing model evaluation results
+
+    Returns:
+        Overall score as float
+    """
+    # Calculate multimetric average
+    multimetric_scores = []
+    for metric in MULTIMETRIC_METRICS:
+        if metric in entry and isinstance(entry[metric], (int, float)):
+            multimetric_scores.append(entry[metric])
+
+    multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
+
+    # Calculate exact match average
+    exact_match_scores = []
+    for metric in EXACT_MATCH_METRICS:
+        if metric in entry and isinstance(entry[metric], (int, float)):
+            exact_match_scores.append(entry[metric])
+
+    exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
+
+    # Weighted combination (can be adjusted based on requirements)
+    overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
+
+    return overall_score
+
+
+def load_leaderboard_data(file_path: str) -> Dict:
+    """
+    Load the leaderboard data from a JSON file.
+    """
+    if not os.path.exists(file_path):
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
+
+    with open(file_path, 'r') as f:
+        data = json.load(f)
+
+    # Ensure version field exists
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
+    return data
+
+
+def save_leaderboard_data(data: Dict, file_path: str) -> None:
+    """
+    Save the leaderboard data to a JSON file.
+    """
+    # Ensure the directory exists
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    # Update the last_updated timestamp
+    data["last_updated"] = datetime.now().isoformat()
+
+    # Ensure version is set
+    if "version" not in data:
+        version = "v0"
+        if "_v" in file_path:
+            version = file_path.split("_")[-1].split(".")[0]
+        data["version"] = version
+
+    with open(file_path, 'w') as f:
+        json.dump(data, f, indent=2)
+
+
+def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
+    """
+    Convert leaderboard data to a pandas DataFrame for display.
+    """
+    rows = []
+
+    for entry in leaderboard_data.get("entries", []):
+        model_name = entry.get("model_name", "Unknown Model")
+
+        # Extract basic metadata
+        row = {
+            "model_name": model_name,
+            "model_type": entry.get("model_type", "Unknown"),
+            "mode": entry.get("mode", "Strict"),
+            "submission_date": entry.get("submission_date", ""),
+            "version": entry.get("version", "v0"),
+            "review_model_type": entry.get("review_model_type", "custom").lower()
+        }
+
+        # Add additional metadata fields if present
+        for key in ["base_model", "revision", "precision", "weight_type", "topic", "programming_language", "comment_language"]:
+            if key in entry:
+                row[key] = entry[key]
+
+        # Add multimetric scores
+        for metric in MULTIMETRIC_METRICS:
+            if metric in entry:
+                row[metric] = entry[metric]
+            else:
+                row[metric] = pd.NA
+
+        # Add exact match metrics
+        for metric in EXACT_MATCH_METRICS:
+            if metric in entry:
+                row[metric] = entry[metric]
+            else:
+                row[metric] = pd.NA
+
+        # Calculate aggregated metrics
+        multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
+        exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
+
+        if multimetric_scores:
+            row["multimetric_average"] = np.mean(multimetric_scores)
+        else:
+            row["multimetric_average"] = pd.NA
+
+        if exact_match_scores:
+            row["exact_match_average"] = np.mean(exact_match_scores)
+        else:
+            row["exact_match_average"] = pd.NA
+
+        # Calculate overall score
+        row["overall_score"] = calculate_overall_score(entry)
+
+        # Add language-specific metrics if available
+        for lang in COMMENT_LANGUAGES:
+            for metric in ["readability", "relevance", "overall_score"]:
+                lang_key = f"{lang}_{metric}"
+                if lang_key in entry:
+                    row[lang_key] = entry[lang_key]
+                else:
+                    row[lang_key] = pd.NA
+
+        # Add evaluation count
+        row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
+
+        rows.append(row)
+
+    # Create DataFrame and sort by overall score
+    df = pd.DataFrame(rows)
+
+    # Ensure all expected columns exist
+    for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
+        if metric not in df.columns:
+            df[metric] = pd.NA
+
+    # Sort by overall score (descending)
+    if not df.empty:
+        df = df.sort_values(by="overall_score", ascending=False, na_position='last')
+
+    # Ensure summary columns exist
+    summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
+    for col in summary_cols:
+        if col not in df.columns:
+            df[col] = pd.NA
+
+    return df
+
+
+def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
+    """
+    Add new entries to the leaderboard, replacing any with the same model name and version.
+    """
+    # Create a mapping of existing entries by model name and version
+    existing_entries = {
+        (entry["model_name"], entry.get("version", "v0")): i
+        for i, entry in enumerate(leaderboard_data.get("entries", []))
+    }
+
+    # Process each new entry
+    for new_entry in new_entries:
+        model_name = new_entry.get("model_name")
+        version = new_entry.get("version", "v0")
+
+        # Add calculated metrics
+        new_entry["overall_score"] = calculate_overall_score(new_entry)
+
+        # Calculate averages
+        multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
+        exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
+
+        if multimetric_scores:
+            new_entry["multimetric_average"] = np.mean(multimetric_scores)
+        if exact_match_scores:
+            new_entry["exact_match_average"] = np.mean(exact_match_scores)
+
+        if (model_name, version) in existing_entries:
+            # Replace existing entry
+            leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
+        else:
+            # Add new entry
+            if "entries" not in leaderboard_data:
+                leaderboard_data["entries"] = []
+            leaderboard_data["entries"].append(new_entry)
+
+    # Update the last_updated timestamp
+    leaderboard_data["last_updated"] = datetime.now().isoformat()
+
+    return leaderboard_data
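An end-to-end sketch of the helpers above (file name and numbers invented): write one submission line, validate it, fold it into a board, and persist. With a single rubric score of 6.0 and pass_at_1 = 0.5, calculate_overall_score gives 6.0 * 0.7 + 0.5 * 0.3, roughly 4.35; note that the 0-10 rubric scores and 0-1 pass rates enter the weighted sum unnormalized, as the code is written.

    import json
    import tempfile

    from src.leaderboard.processor import (
        process_jsonl_submission,
        calculate_overall_score,
        load_leaderboard_data,
        add_entries_to_leaderboard,
        save_leaderboard_data,
    )

    # One illustrative entry: the three required fields plus a metric from each family
    record = {
        "model_name": "example/model",
        "programming_language": "Python",
        "comment_language": "en",
        "readability": 6.0,   # MULTIMETRIC_METRICS, 0-10 rubric scale
        "pass_at_1": 0.5,     # EXACT_MATCH_METRICS, 0-1 pass rate
    }

    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
        f.write(json.dumps(record) + "\n")
        path = f.name

    entries, message = process_jsonl_submission(path)
    print(message)                               # Successfully processed 1 entries
    print(calculate_overall_score(entries[0]))   # 6.0 * 0.7 + 0.5 * 0.3 ≈ 4.35

    board = load_leaderboard_data("data/leaderboard_v0.json")  # empty skeleton if missing
    board = add_entries_to_leaderboard(board, entries)         # replaces same (model_name, version)
    save_leaderboard_data(board, "data/leaderboard_v0.json")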
src/populate.py
ADDED
@@ -0,0 +1,188 @@
+"""
+Populate the CodeReview Bench leaderboard from HuggingFace datasets.
+"""
+
+import json
+import os
+import pandas as pd
+import tempfile
+from typing import Dict, List, Optional
+from datetime import datetime
+import numpy as np
+
+from huggingface_hub import hf_hub_download, HfApi
+from datasets import load_dataset
+
+from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
+from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
+from src.leaderboard.processor import leaderboard_to_dataframe
+
+
+def get_latest_leaderboard(version="v0") -> Optional[Dict]:
+    """
+    Get the latest leaderboard data from the HuggingFace dataset.
+    Fall back to a local JSON file if the HF download fails or is unavailable.
+    """
+    # First try to fetch from HuggingFace Hub
+    try:
+        leaderboard_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_ID,
+            filename=f"leaderboards/leaderboard_{version}.json",
+            repo_type="dataset",
+            token=TOKEN
+        )
+        with open(leaderboard_path, 'r') as f:
+            return json.load(f)
+    except Exception as hf_err:
+        print(f"HF download failed or unavailable: {hf_err}. Trying local fallback...")
+
+    # Fallback: attempt to load a local leaderboard_data.json located at the project root
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    local_path_candidates = [
+        os.path.join(project_root, "leaderboard_data.json"),   # legacy path in root
+        os.path.join(project_root, "data", "leaderboard.json"),  # path defined in envs.py
+    ]
+
+    for local_path in local_path_candidates:
+        if os.path.exists(local_path):
+            try:
+                with open(local_path, 'r') as f:
+                    return json.load(f)
+            except Exception as local_err:
+                print(f"Error loading local leaderboard file {local_path}: {local_err}")
+
+    # If nothing found, return None
+    return None
+
+
+def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
+    """
+    Get a specific model's entry from the entries folder, uniquely identified
+    by model_name, mode, and version.
+    """
+    try:
+        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
+        entry_path = hf_hub_download(
+            repo_id=RESULTS_DATASET_ID,
+            filename=f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json",
+            repo_type="dataset",
+            token=TOKEN
+        )
+        with open(entry_path, 'r') as f:
+            return json.load(f)
+    except Exception as e:
+        print(f"Error downloading model entry: {e}")
+        return None
+
+
+def get_all_entries(version="v0") -> List[Dict]:
+    """
+    Get all entries from the HuggingFace dataset.
+    """
+    try:
+        api = HfApi(token=TOKEN)
+        files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
+        entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
+
+        all_entries = []
+        for entry_file in entry_files:
+            try:
+                entry_path = hf_hub_download(
+                    repo_id=RESULTS_DATASET_ID,
+                    filename=entry_file,
+                    repo_type="dataset",
+                    token=TOKEN
+                )
+                with open(entry_path, 'r') as f:
+                    entry_data = json.load(f)
+                all_entries.append(entry_data)
+            except Exception as e:
+                print(f"Error loading entry {entry_file}: {e}")
+
+        return all_entries
+    except Exception as e:
+        print(f"Error getting all entries: {e}")
+        return []
+
+
+def get_leaderboard_df(version="v0") -> pd.DataFrame:
+    """
+    Get the leaderboard data as a DataFrame.
+    """
+    # Get latest leaderboard data
+    leaderboard_data = get_latest_leaderboard(version)
+
+    if not leaderboard_data:
+        # If no leaderboard exists, try to build it from entries
+        entries = get_all_entries(version)
+        if entries:
+            leaderboard_data = {
+                "entries": entries,
+                "last_updated": datetime.now().isoformat(),
+                "version": version
+            }
+        else:
+            # Return empty DataFrame if no data available
+            return pd.DataFrame(columns=DISPLAY_COLS)
+
+    # Convert to DataFrame
+    return leaderboard_to_dataframe(leaderboard_data)
+
+
+def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
+    """
+    Get the leaderboard data filtered by a specific programming language category.
+    """
+    # Get latest leaderboard data
+    leaderboard_data = get_latest_leaderboard(version)
+
+    if not leaderboard_data:
+        # If no leaderboard exists, try to build it from entries
+        entries = get_all_entries(version)
+        if entries:
+            leaderboard_data = {
+                "entries": entries,
+                "last_updated": datetime.now().isoformat(),
+                "version": version
+            }
+        else:
+            # Return empty DataFrame if no data available
+            return pd.DataFrame(columns=DISPLAY_COLS)
+
+    # Filter entries to those with data for the requested programming language
+    filtered_entries = []
+    for entry in leaderboard_data.get("entries", []):
+        programming_language = entry.get("programming_language", "").lower()
+        if category.lower() == "other":
+            # "Other" collects entries whose language matches none of the named
+            # categories (CATEGORIES has no "Other" sentinel, so compare to the full list)
+            if programming_language not in [cat.lower() for cat in CATEGORIES]:
+                filtered_entries.append(entry)
+        elif programming_language == category.lower():
+            filtered_entries.append(entry)
+
+    # Create a new leaderboard data structure with the filtered entries
+    filtered_leaderboard = {
+        "entries": filtered_entries,
+        "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
+        "version": version
+    }
+
+    # Convert to DataFrame
+    return leaderboard_to_dataframe(filtered_leaderboard)
+
+
+def get_detailed_model_data(model_name: str, mode: str, version="v0") -> Dict:
+    """
+    Get detailed data for a specific model and mode.
+    """
+    entry = get_model_entry(model_name, mode, version)
+    if entry:
+        return entry
+    leaderboard_data = get_latest_leaderboard(version)
+    if leaderboard_data:
+        for entry in leaderboard_data.get("entries", []):
+            if entry.get("model_name") == model_name and str(entry.get("mode")).lower() == str(mode).lower():
+                return entry
+    return {}
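A sketch of how the app layer might consume this module (the surrounding UI wiring is assumed; only the two functions are from this file):

    from src.populate import get_leaderboard_df, get_category_leaderboard_df

    df_all = get_leaderboard_df(version="v0")           # full board, sorted by overall_score
    df_python = get_category_leaderboard_df("Python")   # one call per entry in CATEGORIES
    print(df_all[["model_name", "overall_score"]].head())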
src/submission/submit.py
CHANGED
@@ -1,386 +1,184 @@
 """
-
+Handle submissions to the CodeReview Bench leaderboard.
 """

-import
-import
-
-from datetime import datetime
-from
-from src.leaderboard.processor import LeaderboardProcessor
-from src.display.utils import get_main_leaderboard_data, get_quality_metrics_data
⋮
-        taxonomy_category: str,
-        bleu: float,
-        llm_pass_1: float,
-        llm_pass_5: float,
-        llm_pass_10: float,
-        readability: int,
-        relevance: int,
-        explanation_clarity: int,
-        problem_identification: int,
-        actionability: int,
-        completeness: int,
-        specificity: int,
-        contextual_adequacy: int,
-        consistency: int,
-        brevity: int,
-    ) -> Tuple[List[Dict], List[List[str]], List[List[str]], str]:
-        """Handle model submission with full validation"""
-
-        try:
-            # Get client IP
-            client_ip = self.get_client_ip(request)
-
-            # Check rate limiting
-            rate_ok, rate_msg = self.processor.check_rate_limit(client_ip)
-            if not rate_ok:
-                return current_data, [], [], f"❌ {rate_msg}"
-
-            # Validate model name
-            name_valid, name_msg = self.validate_model_name(model_name)
-            if not name_valid:
-                return current_data, [], [], f"❌ {name_msg}"
-
-            # Validate scores
-            scores = {
-                "bleu": bleu,
-                "llm_pass_1": llm_pass_1,
-                "llm_pass_5": llm_pass_5,
-                "llm_pass_10": llm_pass_10
-            }
-            scores_valid, scores_msg = self.validate_scores(scores)
-            if not scores_valid:
-                return current_data, [], [], f"❌ {scores_msg}"
-
-            # Validate metrics
-            metrics = {
-                "readability": readability,
-                "relevance": relevance,
-                "explanation_clarity": explanation_clarity,
-                "problem_identification": problem_identification,
-                "actionability": actionability,
-                "completeness": completeness,
-                "specificity": specificity,
-                "contextual_adequacy": contextual_adequacy,
-                "consistency": consistency,
-                "brevity": brevity,
-            }
-            metrics_valid, metrics_msg = self.validate_metrics(metrics)
-            if not metrics_valid:
-                return current_data, [], [], f"❌ {metrics_msg}"
-
-            # Create submission data
-            submission_data = {
-                "model_name": model_name.strip(),
-                "programming_language": programming_language,
-                "comment_language": comment_language,
-                "taxonomy_category": taxonomy_category,
-                "bleu": bleu,
-                "llm_pass_1": llm_pass_1,
-                "llm_pass_5": llm_pass_5,
-                "llm_pass_10": llm_pass_10,
-                "metrics": metrics
-            }
-
-            # Submit to processor
-            success, message = self.processor.add_submission(submission_data, client_ip)
-
-            if success:
-                # Load updated data
-                updated_data = self.processor.load_leaderboard_data()
-
-                # Format tables
-                main_table = get_main_leaderboard_data(updated_data)
-                quality_table = get_quality_metrics_data(updated_data)
-
-                return updated_data, main_table, quality_table, message
-            else:
-                return current_data, [], [], message
-
-        except Exception as e:
-            print(f"Error in submission: {e}")
-            return current_data, [], [], f"❌ Submission failed: {str(e)}"
-
-    def get_submission_form_components(self):
-        """Create gradio components for submission form"""
⋮
-        ""
⋮
-        )
⋮
-        )
⋮
-            minimum=0.0,
-            maximum=1.0,
-            step=0.001,
-            info="Success rate in 10 attempts"
-        )
-
-        gr.Markdown("### 📋 Quality Metrics (0 - 10)")
-        with gr.Row():
-            readability = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Readability",
-                info="How readable are the generated reviews?"
-            )
-            relevance = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Relevance",
-                info="How relevant to the code changes?"
-            )
-            explanation_clarity = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Explanation Clarity",
-                info="How clear are the explanations?"
-            )
-            problem_identification = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Problem Identification",
-                info="How well does it identify issues?"
-            )
-            actionability = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Actionability",
-                info="How actionable are the suggestions?"
-            )
-
-        with gr.Row():
-            completeness = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Completeness",
-                info="How complete are the reviews?"
-            )
-            specificity = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Specificity",
-                info="How specific are the comments?"
-            )
-            contextual_adequacy = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Contextual Adequacy",
-                info="How well does it understand context?"
-            )
-            consistency = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Consistency",
-                info="How consistent across reviews?"
-            )
-            brevity = gr.Slider(
-                minimum=0, maximum=10, value=5, step=1,
-                label="Brevity",
-                info="How concise are the reviews?"
-            )
-
-        submit_btn = gr.Button("🚀 Submit Model", variant="primary")
-        status_msg = gr.Markdown("")
-
-        # Return all components for use in the main app
-        return {
-            "model_name": model_name,
-            "programming_language": programming_language,
-            "comment_language": comment_language,
-            "taxonomy_category": taxonomy_category,
-            "bleu": bleu,
-            "pass1": pass1,
-            "pass5": pass5,
-            "pass10": pass10,
-            "readability": readability,
-            "relevance": relevance,
-            "explanation_clarity": explanation_clarity,
-            "problem_identification": problem_identification,
-            "actionability": actionability,
-            "completeness": completeness,
-            "specificity": specificity,
-            "contextual_adequacy": contextual_adequacy,
-            "consistency": consistency,
-            "brevity": brevity,
-            "submit_btn": submit_btn,
-            "status_msg": status_msg,
-        }
-
-    def get_submission_history(self, ip_address: str) -> List[List[str]]:
-        """Get submission history for display"""
         try:
⋮
-                row = [
-                    sub.get("model_name", ""),
-                    sub.get("programming_language", ""),
-                    sub.get("comment_language", ""),
-                    sub.get("taxonomy_category", ""),
-                    f"{sub.get('scores', {}).get('llm_pass_1', 0):.3f}",
-                    sub.get("submission_date", "").split("T")[0] if sub.get("submission_date") else "",
-                    sub.get("status", "")
-                ]
-                table_data.append(row)
-
-            return table_data
-
-        except Exception as e:
-            print(f"Error getting submission history: {e}")
-            return []
+import json
+import os
+import tempfile
+from datetime import datetime
+from typing import Dict, List, Tuple
+
+from huggingface_hub import HfApi
+from datasets import load_dataset
+
+from src.display.formatting import styled_error, styled_message
+from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
+from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard
+
+
+def validate_submission(file_path: str) -> Tuple[bool, str]:
+    """
+    Validate a submission file.
+    """
+    try:
+        entries, message = process_jsonl_submission(file_path)
+        if not entries:
+            return False, message
+        return True, "Submission is valid"
+    except Exception as e:
+        return False, f"Error validating submission: {e}"
+
+
+def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
+    """
+    Submit a model's evaluation entry to the HuggingFace dataset.
+    The entry is uniquely identified by model_name, mode, and version.
+    """
+    try:
+        # Create safe model name for file path
+        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+        mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
+
+        # Create entry path in entries folder
+        entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
+
+        # Save entry to temporary file
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
+            json.dump(entry, temp_file, indent=2)
+            temp_path = temp_file.name
+
+        # Upload file
+        api = HfApi(token=TOKEN)
+        api.upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo=entry_path,
+            repo_id=RESULTS_DATASET_ID,
+            repo_type="dataset",
+            commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
+        )
+
+        os.unlink(temp_path)
+        return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
+    except Exception as e:
+        return False, f"Error submitting entry to dataset: {e}"
+
+
+def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
+    """
+    Submit updated leaderboard to the HuggingFace dataset.
+    """
+    try:
+        # Create leaderboard data
+        leaderboard_data = {
+            "entries": entries,
+            "last_updated": datetime.now().isoformat(),
+            "version": version
+        }
+
+        # Save to temporary file
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
+            json.dump(leaderboard_data, temp_file, indent=2)
+            temp_path = temp_file.name
+
+        # Upload file
+        api = HfApi(token=TOKEN)
+        api.upload_file(
+            path_or_fileobj=temp_path,
+            path_in_repo=f"leaderboards/leaderboard_{version}.json",
+            repo_id=RESULTS_DATASET_ID,
+            repo_type="dataset",
+            commit_message=f"Update leaderboard for version {version}"
+        )
+
+        os.unlink(temp_path)
+        return True, "Leaderboard updated successfully"
+    except Exception as e:
+        return False, f"Error updating leaderboard: {e}"
+
+
+def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
+    """
+    Process a submission to the CodeReview Bench leaderboard.
+    """
+    try:
+        # Validate submission
+        is_valid, validation_message = validate_submission(file_path)
+        if not is_valid:
+            return styled_error(validation_message)
+
+        # Process the submission entries
+        entries, message = process_jsonl_submission(file_path)
+        if not entries:
+            return styled_error(f"Failed to process submission: {message}")
+
+        # Upload raw submission file
+        model_name = metadata.get("model_name", "unknown")
+        model_name_safe = model_name.replace("/", "_").replace(" ", "_")
+
+        api = HfApi(token=TOKEN)
+        submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
+        api.upload_file(
+            path_or_fileobj=file_path,
+            path_in_repo=submission_path,
+            repo_id=RESULTS_DATASET_ID,
+            repo_type="dataset",
+            commit_message=f"Add raw submission for {model_name}"
+        )
+
+        # Process entries and add metadata
+        processed_entries = []
+        for entry in entries:
+            # Add metadata to entry
+            entry.update({
+                "model_name": metadata.get("model_name"),
+                "model_type": metadata.get("model_type"),
+                "review_model_type": str(metadata.get("review_model_type", "custom")).lower(),
+                "mode": metadata.get("mode"),
+                "base_model": metadata.get("base_model"),
+                "revision": metadata.get("revision"),
+                "precision": metadata.get("precision"),
+                "weight_type": metadata.get("weight_type"),
+                "version": version,
+                "submission_date": datetime.now().isoformat()
+            })
+            processed_entries.append(entry)
+
+        # Submit entries to entries folder
+        for entry in processed_entries:
+            success, message = submit_entry_to_hub(entry, model_name, metadata.get("mode"), version)
+            if not success:
+                return styled_error(message)
+
+        # Get all entries from HF dataset and update leaderboard
+        files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
+        entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
+
+        all_entries = []
+        for entry_file in entry_files:
+            try:
+                entry_path = api.hf_hub_download(
+                    repo_id=RESULTS_DATASET_ID,
+                    filename=entry_file,
+                    repo_type="dataset",
+                )
+                with open(entry_path, 'r') as f:
+                    entry_data = json.load(f)
+                all_entries.append(entry_data)
+            except Exception as e:
+                print(f"Error loading entry {entry_file}: {e}")
+
+        # Update leaderboard with all entries
+        success, message = submit_leaderboard_to_hub(all_entries, version)
+        if not success:
+            return styled_error(message)
+
+        return styled_message("Submission successful! Model evaluated and leaderboard updated.")
+
+    except Exception as e:
+        return styled_error(f"Error processing submission: {e}")
+    finally:
+        # Clean up temporary files if they exist
         try:
+            if os.path.exists(file_path):
+                os.remove(file_path)
+        except OSError:
+            pass
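End to end, a caller such as the app's submit handler would pass the uploaded file plus the form metadata to process_submission; a hedged sketch (all field values invented, and note that the input file is deleted afterwards by the finally block). For a model "example/model" in mode "Strict", the helpers above write entries/entry_example_model_strict_v0.json and regenerate leaderboards/leaderboard_v0.json.

    from src.submission.submit import process_submission

    metadata = {
        "model_name": "example/model",
        "model_type": "Open Source",
        "review_model_type": "custom",
        "mode": "Strict",            # lowercased to "strict" in the entry file name
        "base_model": None,
        "revision": "main",
        "precision": "bfloat16",
        "weight_type": "Original",
    }

    html = process_submission("example_submission.jsonl", metadata, version="v0")
    print(html)  # styled success or error markup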