kenkaneki committed
Commit 94789e6 · 2 Parent(s): b31be61 346c3c5

force merge remake into main

.env.template ADDED
@@ -0,0 +1,6 @@
+ HF_TOKEN="your_huggingface_write_token"
+ OWNER="your_huggingface_username_or_org"
+ RESULTS_DATASET_ID="your_username/guardbench-results"
+ SUBMITTER_TOKEN="your_secret_submission_token"
+ ADMIN_USERNAME="admin"
+ ADMIN_PASSWORD="password" # Change this!
.gitignore CHANGED
@@ -1,13 +1,52 @@
- auto_evals/
- venv/
  __pycache__/
  .env
- .ipynb_checkpoints
- *ipynb
  .vscode/

  eval-queue/
  eval-results/
  eval-queue-bk/
  eval-results-bk/
- logs/

+ # Python
  __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ .venv/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .gradio/
+
+ # Environment variables
  .env
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
  .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db

+ # Hugging Face cache
  eval-queue/
  eval-results/
  eval-queue-bk/
  eval-results-bk/
+
+ # Data files
+ data/
+
+ # Versioned leaderboard files
+ data/leaderboard_v*.json
.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "guard-bench-submodule"]
+ path = guard-bench-submodule
+ url = https://github.com/whitecircle-ai/circle-guard-bench.git
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,222 +1,165 @@
1
  ---
2
- title: CodeReview Leaderboard
3
- emoji: 🥇
4
- colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: true
9
- license: mit
10
- short_description: CodeReview Leaderboard for evaluating code review models
11
- sdk_version: 5.19.0
12
- storage: persistent
13
- ---
14
-
15
- # 🏆 CodeReview Leaderboard
16
-
17
- A comprehensive benchmark and leaderboard for code review generation models, inspired by [circle-guard-bench](https://huggingface.co/spaces/whitecircle-ai/circle-guard-bench).
18
-
19
- ## ✨ Features
20
-
21
- ### 🎯 Core Functionality
22
-
23
- - **Multi-dimensional Evaluation**: Track models across BLEU scores, Pass@1/5/10 metrics, and 10 quality dimensions
24
- - **Advanced Filtering**: Filter results by programming language, comment language, and taxonomy category
25
- - **Real-time Updates**: Dynamic leaderboard updates with instant filtering
26
- - **Dark Theme**: Modern, eye-friendly interface with GitHub-inspired dark theme
27
-
28
- ### 🔍 Advanced Analytics
29
-
30
- - **Language Performance**: Compare model performance across programming languages
31
- - **Category Analysis**: Analyze performance by review type (bug detection, security, etc.)
32
- - **Submission History**: Track all submissions with IP-based logging
33
- - **Statistical Insights**: Comprehensive statistics and trend analysis
34
 
35
- ### 🛡️ Security & Quality
36
-
37
- - **IP-based Rate Limiting**: Prevent spam submissions (5 per 24 hours per IP)
38
- - **Comprehensive Validation**: Multi-layer validation for all submissions
39
- - **Audit Trail**: Complete submission logging for transparency
40
- - **Data Integrity**: Automatic data validation and backup systems
41
-
42
- ### 🌐 Multi-Language Support
43
-
44
- - **Programming Languages**: Python, JavaScript, Java, C++, Go, Rust, and more
45
- - **Comment Languages**: English, Chinese, Spanish, French, German, Japanese, and more
46
- - **Taxonomy Categories**: Bug Detection, Security, Performance, Style, and more
47
-
48
- ## 🚀 Quick Start
49
 
50
- ### Installation
51
 
52
  ```bash
53
  pip install -r requirements.txt
54
  ```
55
 
56
- ### Run Locally
57
 
58
  ```bash
59
  python app.py
60
  ```
61
 
62
- ### Access the Interface
63
-
64
- Open your browser to `http://localhost:7860`
65
-
66
- ## 📊 Usage Guide
67
-
68
- ### 1. Viewing the Leaderboard
69
-
70
- - Navigate to the **🏆 Leaderboard** tab
71
- - Use the filter dropdowns to narrow results:
72
- - **Programming Language**: Filter by specific programming languages
73
- - **Comment Language**: Filter by natural language of comments
74
- - **Taxonomy Category**: Filter by review category type
75
- - Click **🔄 Refresh** to update data
76
-
77
- ### 2. Submitting Models
78
-
79
- - Go to the **📝 Submit Model** tab
80
- - Fill in the submission form:
81
- - **Model Name**: Use `organization/model` format
82
- - **Languages & Category**: Select appropriate filters
83
- - **Performance Scores**: Provide BLEU and Pass@k scores (0.0-1.0)
84
- - **Quality Metrics**: Rate across 10 dimensions (0-10)
85
- - Click **🚀 Submit Model** to add your results
86
-
87
- ### 3. Analytics & Insights
88
-
89
- - Visit the **📈 Analytics** tab to see:
90
- - Recent submission history
91
- - Language performance comparisons
92
- - Category performance analysis
93
- - Trends and patterns
94
-
95
- ### 4. Data Export
96
-
97
- - Use the **ℹ️ About** tab to export data in JSON or CSV format
98
- - Full leaderboard data available for research and analysis
99
-
100
- ## 🏗️ Architecture
101
-
102
- ### Directory Structure
103
-
104
  ```
105
- ├── src/
106
- │ ├── about.py # About page content
107
- │ ├── envs.py # Environment configuration
108
- │ ├── display/ # Display utilities
109
- │ │ ├── css_html_js.py # Styling and themes
110
- │ │ ├── formatting.py # Data formatting
111
- │ │ └── utils.py # Display utilities
112
- │ ├── leaderboard/ # Leaderboard processing
113
- │ │ └── processor.py # Data operations
114
- │ └── submission/ # Submission handling
115
- │ └── submit.py # Submission validation
116
- ├── data/ # Data storage
117
- │ ├── leaderboard_data.json # Main leaderboard
118
- │ └── submissions.json # Submission log
119
- ├── app.py # Main application
120
- └── requirements.txt # Dependencies
121
- ```
122
-
123
- ### Key Components
124
-
125
- - **LeaderboardProcessor**: Handles all data operations, validation, and persistence
126
- - **SubmissionHandler**: Manages model submissions with IP tracking and validation
127
- - **Display Utils**: Provides filtering, formatting, and table generation
128
- - **Dark Theme**: Custom CSS for modern, accessible interface
129
 
130
- ## 🎨 Features Inspired by circle-guard-bench
131
 
132
- ### Implemented Features
133
 
134
- - **Multi-tab Interface**: Organized navigation with dedicated sections
135
- - **Advanced Filtering**: Real-time filtering by multiple criteria
136
- - **Dark Theme**: Modern, GitHub-inspired dark interface
137
- - **IP-based Submissions**: Secure submission tracking
138
- - **Comprehensive Analytics**: Detailed performance insights
139
- - **Data Export**: Multiple export formats
140
- - **Rate Limiting**: Anti-spam protection
141
-
142
- ### 🔧 Technical Improvements
143
-
144
- - **Modular Architecture**: Clean separation of concerns
145
- - **Type Safety**: Full type annotations throughout
146
- - **Error Handling**: Comprehensive error handling and logging
147
- - **Data Validation**: Multi-layer validation with Pydantic
148
- - **Performance**: Optimized data processing and display
149
-
150
- ## 📈 Metrics & Evaluation
151
-
152
- ### Performance Metrics
153
-
154
- - **BLEU**: Text similarity score (0.0-1.0)
155
- - **Pass@1**: Success rate in single attempt (0.0-1.0)
156
- - **Pass@5**: Success rate in 5 attempts (0.0-1.0)
157
- - **Pass@10**: Success rate in 10 attempts (0.0-1.0)
158
-
159
- ### Quality Dimensions
160
-
161
- 1. **Readability**: How clear and readable are the reviews?
162
- 2. **Relevance**: How relevant to the code changes?
163
- 3. **Explanation Clarity**: How well does it explain issues?
164
- 4. **Problem Identification**: How effectively does it identify problems?
165
- 5. **Actionability**: How actionable are the suggestions?
166
- 6. **Completeness**: How thorough are the reviews?
167
- 7. **Specificity**: How specific are the comments?
168
- 8. **Contextual Adequacy**: How well does it understand context?
169
- 9. **Consistency**: How consistent across different reviews?
170
- 10. **Brevity**: How concise without losing important information?
171
-
172
- ## 🔒 Security Features
173
-
174
- ### Rate Limiting
175
-
176
- - **5 submissions per IP per 24 hours**
177
- - **Automatic IP tracking and logging**
178
- - **Graceful error handling for rate limits**
179
-
180
- ### Data Validation
181
-
182
- - **Model name format validation**
183
- - **Score range validation (0.0-1.0 for performance, 0-10 for quality)**
184
- - **Logical consistency checks (Pass@1 ≤ Pass@5 ≤ Pass@10)**
185
- - **Required field validation**
186
-
187
- ### Audit Trail
188
-
189
- - **Complete submission logging**
190
- - **IP address tracking (partially masked for privacy)**
191
- - **Timestamp recording**
192
- - **Data integrity checks**
193
-
194
- ## 🤝 Contributing
195
-
196
- 1. Fork the repository
197
- 2. Create a feature branch
198
- 3. Make your changes
199
- 4. Add tests if applicable
200
- 5. Submit a pull request
201
-
202
- ## 📄 License
203
-
204
- This project is licensed under the MIT License - see the LICENSE file for details.
205
-
206
- ## 🙏 Acknowledgments
207
-
208
- - Inspired by [circle-guard-bench](https://huggingface.co/spaces/whitecircle-ai/circle-guard-bench)
209
- - Built with [Gradio](https://gradio.app/) for the web interface
210
- - Thanks to the open-source community for tools and inspiration
211
-
212
- ## 📞 Support
213
-
214
- For questions, issues, or contributions:
215
-
216
- - Open an issue on GitHub
217
- - Check the documentation
218
- - Contact the maintainers
219
-
220
- ---
221
 
222
- **Built with ❤️ for the code review research community**
1
  ---
2
+ title: CircleGuardBench
3
+ emoji:
4
+ colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: true
10
+ short_description: First benchmark testing LLM guards on safety and accuracy.
11
+ models:
12
+ - AtlaAI/Selene-1-Mini-Llama-3.1-8B
13
+ - google/gemma-3-12b-it
14
+ - google/gemma-3-4b-it
15
+ - meta-llama/Llama-3.1-8B-Instruct
16
+ - meta-llama/Llama-3.2-3B-Instruct
17
+ - meta-llama/Llama-4-Maverick-17B-128E-Instruct
18
+ - meta-llama/Llama-4-Scout-17B-16E-Instruct
19
+ - meta-llama/Llama-Guard-3-1B
20
+ - meta-llama/Llama-Guard-3-8B
21
+ - meta-llama/Llama-Guard-4-12B
22
+ - mistralai/Ministral-8B-Instruct-2410
23
+ - mistralai/Mistral-Small-3.1-24B-Instruct-2503
24
+ - Qwen/Qwen2.5-7B-Instruct
25
+ - Qwen/Qwen3-0.6B
26
+ - Qwen/Qwen3-1.7B
27
+ - Qwen/Qwen3-4B
28
+ - Qwen/Qwen3-8B
29
 
30
+ ---
31
 
32
+ # CodeReview Bench Leaderboard
33
+
34
+ A comprehensive leaderboard for evaluating automated code review systems across programming languages and review quality dimensions.
35
+
36
+ ## Features
37
+
38
+ - **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
39
+ - **Dual Language Comments**: Supports both Russian and English comment languages
40
+ - **Comprehensive Metrics**:
41
+ - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
42
+ - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
43
+ - **Interactive Visualization**: Compare model performance across categories with radar plots
44
+ - **Easy Submission**: Submit your model results via web interface
45
+
46
+ ## Metrics
47
+
48
+ ### LLM-based Multimetric
49
+
50
+ - **Readability**: How easy the review is to understand
51
+ - **Relevance**: How relevant the review is to the code
52
+ - **Explanation Clarity**: How clear the explanations are
53
+ - **Problem Identification**: How well problems are identified
54
+ - **Actionability**: How actionable the suggestions are
55
+ - **Completeness**: How complete the review is
56
+ - **Specificity**: How specific the feedback is
57
+ - **Contextual Adequacy**: How well the review fits the context
58
+ - **Consistency**: How consistent the review style is
59
+ - **Brevity**: How concise the review is
60
+
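Each of these dimensions appears as a 0-10 numeric field in the submission format shown later in this README (e.g. `"readability": 8.5`). As an illustration only, and not necessarily the aggregation the leaderboard itself applies, the ten dimensions of one record can be averaged into a single LLM-based score:

```python
# Illustrative sketch: unweighted mean of the ten LLM-judged dimensions
# for one submission record. The leaderboard's own aggregation may differ.
LLM_DIMENSIONS = [
    "readability", "relevance", "explanation_clarity", "problem_identification",
    "actionability", "completeness", "specificity", "contextual_adequacy",
    "consistency", "brevity",
]

def mean_llm_score(record: dict) -> float:
    """Average the ten 0-10 dimension scores of a single record."""
    return sum(record[dim] for dim in LLM_DIMENSIONS) / len(LLM_DIMENSIONS)
```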
61
+ ### Exact-Match Metrics
62
+
63
+ - **Pass@1**: Percentage of correct reviews on first attempt
64
+ - **Pass@5**: Percentage of correct reviews in top 5 attempts
65
+ - **Pass@10**: Percentage of correct reviews in top 10 attempts
66
+ - **BLEU@10**: BLEU score for top 10 review candidates
67
+
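Pass@k values like these are commonly estimated per sample from n generated candidate reviews, of which c are judged correct, and then averaged across samples. A minimal sketch of the standard unbiased estimator (an illustration of the usual formula, not necessarily the exact scoring code used by this benchmark):

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: chance that at least one of k candidates drawn
    (without replacement) from n generations with c correct ones is correct."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Example: 10 candidate reviews per sample, 3 judged correct.
print(round(pass_at_k(10, 3, 1), 3))   # 0.3
print(round(pass_at_k(10, 3, 5), 3))   # 0.917
print(round(pass_at_k(10, 3, 10), 3))  # 1.0
```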
68
+ ## Programming Languages Supported
69
+
70
+ - Python
71
+ - JavaScript
72
+ - Java
73
+ - C++
74
+ - C#
75
+ - TypeScript
76
+ - Go
77
+ - Rust
78
+ - Swift
79
+ - Kotlin
80
+ - Ruby
81
+ - PHP
82
+ - C
83
+ - Scala
84
+ - R
85
+ - Dart
86
+ - Other
87
+
88
+ ## Comment Languages
89
+
90
+ - Russian (ru)
91
+ - English (en)
92
+
93
+ ## Example Categories
94
+
95
+ - Bug Fix
96
+ - Code Style
97
+ - Performance
98
+ - Security
99
+ - Refactoring
100
+ - Documentation
101
+ - Testing
102
+ - Architecture
103
+ - Other
104
+
105
+ ## Installation
106
 
107
  ```bash
108
  pip install -r requirements.txt
109
  ```
110
 
111
+ ## Usage
112
 
113
  ```bash
114
  python app.py
115
  ```
116
 
117
+ ## Submission Format
118
+
119
+ Submit your results as a JSONL file where each line contains:
120
+
121
+ ```json
122
+ {
123
+ "model_name": "your-model-name",
124
+ "programming_language": "python",
125
+ "comment_language": "en",
126
+ "readability": 8.5,
127
+ "relevance": 9.0,
128
+ "explanation_clarity": 7.8,
129
+ "problem_identification": 8.2,
130
+ "actionability": 8.7,
131
+ "completeness": 8.0,
132
+ "specificity": 7.5,
133
+ "contextual_adequacy": 8.3,
134
+ "consistency": 8.8,
135
+ "brevity": 7.2,
136
+ "pass_at_1": 0.75,
137
+ "pass_at_5": 0.88,
138
+ "pass_at_10": 0.92,
139
+ "bleu_at_10": 0.65,
140
+ "total_evaluations": 100
141
+ }
142
  ```
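A small helper can produce such a file by writing one JSON object per line. The record below reuses a subset of the fields from the example above; the helper itself is only a sketch and not part of the benchmark's tooling:

```python
import json

def write_jsonl(records: list[dict], path: str) -> None:
    """Write one JSON object per line, matching the submission format."""
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Only some of the fields from the example above are shown here.
records = [{
    "model_name": "your-model-name",
    "programming_language": "python",
    "comment_language": "en",
    "pass_at_1": 0.75,
    "pass_at_5": 0.88,
    "pass_at_10": 0.92,
    "bleu_at_10": 0.65,
    "total_evaluations": 100,
}]
write_jsonl(records, "submission.jsonl")
```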
143
 
144
+ ## Environment Variables
145
 
146
+ Set the following environment variables:
147
 
148
+ ```bash
149
+ HF_TOKEN=your_huggingface_token
150
+ OWNER=your-organization
151
+ RESULTS_DATASET_ID=your-org/codereview-bench-results
152
+ ```
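The application reads these values through `src/envs.py` (see the imports in `app.py`). Below is a minimal sketch of how such a module typically surfaces them with `os.getenv`; the actual contents and defaults of `src/envs.py` may differ:

```python
# Sketch only: how settings like these are commonly exposed to the app.
# The repository's real src/envs.py may name or default them differently.
import os

TOKEN = os.getenv("HF_TOKEN")                    # write token for the results dataset
OWNER = os.getenv("OWNER", "your-organization")  # user or org owning the dataset
RESULTS_DATASET_ID = os.getenv(
    "RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results"
)

if TOKEN is None:
    raise RuntimeError("HF_TOKEN must be set before launching the app")
```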
153
 
154
+ ## Citation
155
+
156
+ ```bibtex
157
+ @misc{codereviewbench2025,
158
+ author = {CodeReview Bench Team},
159
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
160
+ year = {2025},
161
+ publisher = {GitHub},
162
+ journal = {GitHub repository},
163
+ howpublished = {\url{https://github.com/your-org/codereview-bench}}
164
+ }
165
+ ```
app.py CHANGED
@@ -1,365 +1,1254 @@
1
  """
2
- CodeReview Leaderboard - Inspired by circle-guard-bench
3
- A comprehensive leaderboard for code review generation models
4
  """
5
 
 
 
 
 
6
  import gradio as gr
7
- from typing import List, Dict, Any
8
- from datetime import datetime, timezone
 
 
 
 
 
9
 
10
- # Import our modules
11
- from src.envs import (
12
- PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES,
13
- MAIN_HEADERS, QUALITY_HEADERS
 
 
 
14
  )
15
- from src.about import TITLE, INTRODUCTION_TEXT
16
- from src.display.css_html_js import DARK_THEME_CSS, CUSTOM_JS, HEADER_HTML, FOOTER_HTML
17
  from src.display.utils import (
18
- get_main_leaderboard_data, get_quality_metrics_data,
19
- get_submission_history_data, get_statistics_summary
20
  )
21
- from src.leaderboard.processor import LeaderboardProcessor
22
- from src.submission.submit import SubmissionHandler
23
-
24
- # Initialize processors
25
- processor = LeaderboardProcessor()
26
- submission_handler = SubmissionHandler()
27
-
28
- # Global state
29
- current_filters = {
30
- "programming_language": "All",
31
- "comment_language": "All",
32
- "taxonomy_category": "All"
33
- }
34
 
35
- def update_leaderboard_tables(
36
- programming_language: str = "All",
37
- comment_language: str = "All",
38
- taxonomy_category: str = "All"
39
- ):
40
- """Update leaderboard tables with filters"""
41
- global current_filters
42
- current_filters = {
43
- "programming_language": programming_language,
44
- "comment_language": comment_language,
45
- "taxonomy_category": taxonomy_category
46
  }
47
-
48
- # Load current data
49
- data = processor.load_leaderboard_data()
50
-
51
- # Get filtered tables
52
- main_table = get_main_leaderboard_data(
53
- data, programming_language, comment_language, taxonomy_category
 
 
 
 
54
  )
55
-
56
- quality_table = get_quality_metrics_data(
57
- data, programming_language, comment_language, taxonomy_category
 
 
 
 
 
 
58
  )
59
-
60
- # Get statistics
61
- stats = get_statistics_summary(data)
62
-
63
- # Format statistics display
64
- stats_text = f"""
65
- ## 📊 Current Statistics
66
- - **Total Models**: {stats['total_models']}
67
- - **Total Submissions**: {stats['total_submissions']}
68
- - **Average Pass@1**: {stats['avg_pass_1']:.3f}
69
- - **Best Model**: {stats['best_model']}
70
- - **Languages Covered**: {stats['languages_covered']}
71
- - **Categories Covered**: {stats['categories_covered']}
72
  """
73
-
74
- return main_table, quality_table, stats_text
75
-
76
- def refresh_data():
77
- """Refresh all data from storage"""
78
- return update_leaderboard_tables(
79
- current_filters["programming_language"],
80
- current_filters["comment_language"],
81
- current_filters["taxonomy_category"]
82
- )
83
 
84
- def handle_submission(
85
- request: gr.Request,
86
- *args
87
  ):
88
- """Handle model submission"""
89
- # Get current data
90
- current_data = processor.load_leaderboard_data()
91
-
92
- # Call submission handler
93
- result = submission_handler.submit_model(request, current_data, *args)
94
-
95
- # If submission was successful, refresh tables
96
- if result[0] != current_data: # Data was updated
97
- main_table, quality_table, stats_text = update_leaderboard_tables(
98
- current_filters["programming_language"],
99
- current_filters["comment_language"],
100
- current_filters["taxonomy_category"]
101
  )
102
- return result[0], main_table, quality_table, result[3], stats_text
103
  else:
104
- return result[0], result[1], result[2], result[3], None
105
-
106
- # Create the Gradio interface
107
- with gr.Blocks(
108
- theme=gr.themes.Base(),
109
- css=DARK_THEME_CSS,
110
- js=CUSTOM_JS,
111
- title=TITLE,
112
- head="<meta name='viewport' content='width=device-width, initial-scale=1'>"
113
- ) as demo:
114
-
115
- # Header
116
- gr.HTML(HEADER_HTML)
117
-
118
- # State to store leaderboard data
119
- leaderboard_state = gr.State(value=processor.load_leaderboard_data())
120
-
121
- # Main content tabs
122
- with gr.Tabs():
123
-
124
- # Leaderboard Tab
125
- with gr.Tab("🏆 Leaderboard"):
126
-
127
- # Filters
128
- with gr.Row():
129
- prog_lang_filter = gr.Dropdown(
130
- choices=PROGRAMMING_LANGUAGES,
131
- value="All",
132
- label="🔍 Programming Language",
133
- info="Filter by programming language"
134
- )
135
- comment_lang_filter = gr.Dropdown(
136
- choices=COMMENT_LANGUAGES,
137
- value="All",
138
- label="🌍 Comment Language",
139
- info="Filter by comment language"
140
- )
141
- taxonomy_filter = gr.Dropdown(
142
- choices=TAXONOMY_CATEGORIES,
143
- value="All",
144
- label="🏷️ Taxonomy Category",
145
- info="Filter by review category"
146
  )
147
- refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
148
-
149
- # Statistics
150
- stats_display = gr.Markdown("")
151
-
152
- # Main leaderboard table
153
- with gr.Row():
154
- main_leaderboard = gr.Dataframe(
155
- headers=MAIN_HEADERS,
156
- label="🏅 Main Leaderboard",
157
- interactive=False,
158
- wrap=True,
159
- max_height=600
160
  )
161
-
162
- # Quality metrics table
163
- with gr.Row():
164
- quality_metrics = gr.Dataframe(
165
- headers=QUALITY_HEADERS,
166
- label="📊 Quality Metrics",
167
- interactive=False,
168
- wrap=True,
169
- max_height=600
 
 
 
 
 
170
  )
171
-
172
- # Submission Tab
173
- with gr.Tab("📝 Submit Model"):
174
-
175
- # Create submission form
176
- form_components = submission_handler.get_submission_form_components()
177
-
178
- # Connect submission handler
179
- form_components["submit_btn"].click(
180
- fn=handle_submission,
181
- inputs=[
182
- leaderboard_state,
183
- form_components["model_name"],
184
- form_components["programming_language"],
185
- form_components["comment_language"],
186
- form_components["taxonomy_category"],
187
- form_components["bleu"],
188
- form_components["pass1"],
189
- form_components["pass5"],
190
- form_components["pass10"],
191
- form_components["readability"],
192
- form_components["relevance"],
193
- form_components["explanation_clarity"],
194
- form_components["problem_identification"],
195
- form_components["actionability"],
196
- form_components["completeness"],
197
- form_components["specificity"],
198
- form_components["contextual_adequacy"],
199
- form_components["consistency"],
200
- form_components["brevity"],
201
- ],
202
- outputs=[
203
- leaderboard_state,
204
- main_leaderboard,
205
- quality_metrics,
206
- form_components["status_msg"],
207
- stats_display
208
- ]
209
- )
210
-
211
- # Analytics Tab
212
- with gr.Tab("📈 Analytics"):
213
-
214
- with gr.Row():
215
- analytics_prog_lang = gr.Dropdown(
216
- choices=PROGRAMMING_LANGUAGES,
217
- value="All",
218
- label="Programming Language"
219
  )
220
- analytics_comment_lang = gr.Dropdown(
221
- choices=COMMENT_LANGUAGES,
222
- value="All",
223
- label="Comment Language"
224
  )
225
- analytics_taxonomy = gr.Dropdown(
226
- choices=TAXONOMY_CATEGORIES,
227
- value="All",
228
- label="Taxonomy Category"
229
  )
230
-
231
- # Submission history
232
- submission_history = gr.Dataframe(
233
- headers=["Model", "Programming Language", "Comment Language", "Taxonomy", "Pass@1", "Date", "IP"],
234
- label="📋 Recent Submissions",
235
- interactive=False,
236
- max_height=400
237
- )
238
-
239
- # Language performance analysis
240
- with gr.Row():
241
- with gr.Column():
242
- gr.Markdown("### 🗣️ Language Performance Analysis")
243
- language_analysis = gr.Dataframe(
244
- headers=["Language", "Avg Pass@1", "Model Count", "Best Model"],
245
- label="Programming Language Performance",
246
- interactive=False
247
  )
248
-
249
- with gr.Column():
250
- gr.Markdown("### 🏷️ Category Performance Analysis")
251
- category_analysis = gr.Dataframe(
252
- headers=["Category", "Avg Pass@1", "Model Count", "Best Model"],
253
- label="Taxonomy Category Performance",
254
- interactive=False
 
 
 
 
 
255
  )
256
-
257
- # About Tab
258
- with gr.Tab("ℹ️ About"):
259
- gr.Markdown(INTRODUCTION_TEXT)
260
-
261
- # Export functionality
262
- with gr.Row():
263
- export_format = gr.Dropdown(
264
- choices=["JSON", "CSV"],
265
- value="JSON",
266
- label="Export Format"
267
  )
268
- export_btn = gr.Button("📥 Export Data")
269
-
270
- export_output = gr.Textbox(
271
- label="Export Output",
272
- lines=10,
273
- max_lines=20,
274
- show_copy_button=True
275
- )
276
-
277
- # Footer
278
- gr.HTML(FOOTER_HTML)
279
-
280
- # Initialize with data
281
- initial_main, initial_quality, initial_stats = update_leaderboard_tables()
282
-
283
- # Update tables when filters change
284
- filter_inputs = [prog_lang_filter, comment_lang_filter, taxonomy_filter]
285
- filter_outputs = [main_leaderboard, quality_metrics, stats_display]
286
-
287
- for filter_input in filter_inputs:
288
- filter_input.change(
289
- fn=update_leaderboard_tables,
290
- inputs=filter_inputs,
291
- outputs=filter_outputs
292
- )
293
-
294
- # Refresh button
295
- refresh_btn.click(
296
- fn=refresh_data,
297
- outputs=filter_outputs
298
- )
299
-
300
- # Analytics updates
301
- analytics_inputs = [analytics_prog_lang, analytics_comment_lang, analytics_taxonomy]
302
-
303
- def update_analytics(prog_lang, comment_lang, taxonomy):
304
- """Update analytics tables"""
305
- data = processor.load_leaderboard_data()
306
-
307
- # Get submission history
308
- history = get_submission_history_data(data, prog_lang, comment_lang, taxonomy)
309
-
310
- # Get language performance
311
- lang_perf = []
312
- for lang in PROGRAMMING_LANGUAGES[1:]:
313
- lang_data = [d for d in data if d.get("programming_language") == lang]
314
- if lang_data:
315
- avg_score = sum(d.get("llm_pass_1", 0) for d in lang_data) / len(lang_data)
316
- best_model = max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
317
- lang_perf.append([lang, f"{avg_score:.3f}", len(lang_data), best_model])
318
-
319
- # Get category performance
320
- cat_perf = []
321
- for cat in TAXONOMY_CATEGORIES[1:]:
322
- cat_data = [d for d in data if d.get("taxonomy_category") == cat]
323
- if cat_data:
324
- avg_score = sum(d.get("llm_pass_1", 0) for d in cat_data) / len(cat_data)
325
- best_model = max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
326
- cat_perf.append([cat, f"{avg_score:.3f}", len(cat_data), best_model])
327
-
328
- return history, lang_perf, cat_perf
329
-
330
- for analytics_input in analytics_inputs:
331
- analytics_input.change(
332
- fn=update_analytics,
333
- inputs=analytics_inputs,
334
- outputs=[submission_history, language_analysis, category_analysis]
335
- )
336
-
337
- # Export functionality
338
- def export_data(format_type):
339
- """Export leaderboard data"""
340
- return processor.export_data(format_type.lower())
341
-
342
- export_btn.click(
343
- fn=export_data,
344
- inputs=[export_format],
345
- outputs=[export_output]
346
- )
347
-
348
- # Set initial values
349
- demo.load(
350
- fn=lambda: (initial_main, initial_quality, initial_stats),
351
- outputs=[main_leaderboard, quality_metrics, stats_display]
352
- )
353
 
354
- # Launch configuration
355
- if __name__ == "__main__":
356
- demo.queue(max_size=20).launch(
357
- server_name="0.0.0.0",
358
- server_port=7860,
359
- share=False,
360
- show_error=True,
361
- debug=True
362
  )
363
 
364
- # For deployment (HuggingFace Spaces, etc.)
365
- app = demo
1
  """
2
+ CodeReview Bench Leaderboard Application
 
3
  """
4
 
5
+ import os
6
+ import json
7
+ import tempfile
8
+ import logging
9
  import gradio as gr
10
+ import pandas as pd
11
+ import plotly.express as px
12
+ import plotly.graph_objects as go
13
+ from apscheduler.schedulers.background import BackgroundScheduler
14
+ import numpy as np
15
+ from gradio.themes.utils import fonts, colors
16
+ from dataclasses import fields, dataclass
17
 
18
+ from src.about import (
19
+ CITATION_BUTTON_LABEL,
20
+ CITATION_BUTTON_TEXT,
21
+ EVALUATION_QUEUE_TEXT,
22
+ INTRODUCTION_TEXT,
23
+ LLM_BENCHMARKS_TEXT,
24
+ TITLE,
25
  )
26
+ from src.display.css_html_js import custom_css
 
27
  from src.display.utils import (
28
+ CODEREVIEW_COLUMN,
29
+ DISPLAY_COLS,
30
+ METRIC_COLS,
31
+ HIDDEN_COLS,
32
+ NEVER_HIDDEN_COLS,
33
+ CATEGORIES,
34
+ COMMENT_LANGUAGES,
35
+ EXAMPLE_CATEGORIES,
36
+ ModelType,
37
+ Mode,
38
+ Precision,
39
+ WeightType,
40
+ ReviewModelType,
41
+ get_all_column_choices,
42
+ get_default_visible_columns,
43
  )
44
+ from src.display.formatting import styled_message, styled_error, styled_warning
45
+ from src.envs import (
46
+ ADMIN_USERNAME,
47
+ ADMIN_PASSWORD,
48
+ RESULTS_DATASET_ID,
49
+ SUBMITTER_TOKEN,
50
+ TOKEN,
51
+ DATA_PATH,
52
+ )
53
+ from src.populate import get_leaderboard_df, get_category_leaderboard_df
54
+ from src.submission.submit import process_submission
 
 
55
 
56
+ # Configure logging
57
+ logging.basicConfig(
58
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
59
+ )
60
+ logger = logging.getLogger(__name__)
61
+
62
+ # Ensure data directory exists
63
+ os.makedirs(DATA_PATH, exist_ok=True)
64
+
65
+ # Available benchmark versions
66
+ BENCHMARK_VERSIONS = ["v0"]
67
+ CURRENT_VERSION = "v0"
68
+
69
+ # Initialize leaderboard data
70
+ try:
71
+ logger.info("Initializing leaderboard data...")
72
+ LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
73
+ logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
74
+ except Exception as e:
75
+ logger.error(f"Error loading leaderboard data: {e}")
76
+ LEADERBOARD_DF = pd.DataFrame()
77
+
78
+ custom_theme = gr.themes.Default(
79
+ primary_hue=colors.slate,
80
+ secondary_hue=colors.slate,
81
+ neutral_hue=colors.neutral,
82
+ font=(fonts.GoogleFont("Inter"), "sans-serif"),
83
+ ).set(
84
+ # font_size="16px",
85
+ body_background_fill="#0f0f10",
86
+ body_background_fill_dark="#0f0f10",
87
+ body_text_color="#f4f4f5",
88
+ body_text_color_subdued="#a1a1aa",
89
+ block_background_fill="#1e1e1e", # Cooler Grey
90
+ block_border_color="#333333", # Cooler Grey
91
+ block_shadow="none",
92
+ # Swapped primary and secondary button styles
93
+ button_primary_background_fill="#121212", # Changed to specific color for Refresh button
94
+ button_primary_text_color="#f4f4f5",
95
+ button_primary_border_color="#333333", # Keep border grey or change to #121212?
96
+ button_secondary_background_fill="#f4f4f5",
97
+ button_secondary_text_color="#0f0f10",
98
+ button_secondary_border_color="#f4f4f5",
99
+ input_background_fill="#1e1e1e", # Cooler Grey
100
+ input_border_color="#333333", # Cooler Grey
101
+ input_placeholder_color="#71717a",
102
+ table_border_color="#333333", # Cooler Grey
103
+ table_even_background_fill="#2d2d2d", # Cooler Grey (Slightly lighter)
104
+ table_odd_background_fill="#1e1e1e", # Cooler Grey
105
+ table_text_color="#f4f4f5",
106
+ link_text_color="#ffffff",
107
+ border_color_primary="#333333", # Cooler Grey
108
+ background_fill_secondary="#333333", # Cooler Grey
109
+ color_accent="#f4f4f5",
110
+ border_color_accent="#333333", # Cooler Grey
111
+ button_primary_background_fill_hover="#424242", # Cooler Grey
112
+ block_title_text_color="#f4f4f5",
113
+ accordion_text_color="#f4f4f5",
114
+ panel_background_fill="#1e1e1e", # Cooler Grey
115
+ panel_border_color="#333333", # Cooler Grey
116
+ # Explicitly setting primary/secondary/accent colors/borders
117
+ background_fill_primary="#0f0f10",
118
+ background_fill_primary_dark="#0f0f10",
119
+ background_fill_secondary_dark="#333333", # Cooler Grey
120
+ border_color_primary_dark="#333333", # Cooler Grey
121
+ border_color_accent_dark="#333333", # Cooler Grey
122
+ border_color_accent_subdued="#424242", # Cooler Grey
123
+ border_color_accent_subdued_dark="#424242", # Cooler Grey
124
+ color_accent_soft="#a1a1aa",
125
+ color_accent_soft_dark="#a1a1aa",
126
+ # Explicitly setting input hover/focus states
127
+ input_background_fill_dark="#1e1e1e", # Cooler Grey
128
+ input_background_fill_focus="#424242", # Cooler Grey
129
+ input_background_fill_focus_dark="#424242", # Cooler Grey
130
+ input_background_fill_hover="#2d2d2d", # Cooler Grey
131
+ input_background_fill_hover_dark="#2d2d2d", # Cooler Grey
132
+ input_border_color_dark="#333333", # Cooler Grey
133
+ input_border_color_focus="#f4f4f5",
134
+ input_border_color_focus_dark="#f4f4f5",
135
+ input_border_color_hover="#424242", # Cooler Grey
136
+ input_border_color_hover_dark="#424242", # Cooler Grey
137
+ input_placeholder_color_dark="#71717a",
138
+ # Explicitly set dark variants for table backgrounds
139
+ table_even_background_fill_dark="#2d2d2d", # Cooler Grey
140
+ table_odd_background_fill_dark="#1e1e1e", # Cooler Grey
141
+ # Explicitly set dark text variants
142
+ body_text_color_dark="#f4f4f5",
143
+ body_text_color_subdued_dark="#a1a1aa",
144
+ block_title_text_color_dark="#f4f4f5",
145
+ accordion_text_color_dark="#f4f4f5",
146
+ table_text_color_dark="#f4f4f5",
147
+ # Explicitly set dark panel/block variants
148
+ panel_background_fill_dark="#1e1e1e", # Cooler Grey
149
+ panel_border_color_dark="#333333", # Cooler Grey
150
+ block_background_fill_dark="#1e1e1e", # Cooler Grey
151
+ block_border_color_dark="#333333", # Cooler Grey
152
+ )
153
+
154
+
155
+ @dataclass
156
+ class ColumnInfo:
157
+ """Information about a column in the leaderboard."""
158
+
159
+ name: str
160
+ display_name: str
161
+ type: str = "text"
162
+ hidden: bool = False
163
+ never_hidden: bool = False
164
+ displayed_by_default: bool = True
165
+
166
+
167
+ def update_column_choices(df):
168
+ """Update column choices based on what's actually in the dataframe"""
169
+ if df is None or df.empty:
170
+ return get_all_column_choices()
171
+
172
+ # Get columns that actually exist in the dataframe
173
+ existing_columns = list(df.columns)
174
+
175
+ # Get all possible columns with their display names
176
+ all_columns = get_all_column_choices()
177
+
178
+ # Filter to only include columns that exist in the dataframe
179
+ valid_columns = [
180
+ (col_name, display_name)
181
+ for col_name, display_name in all_columns
182
+ if col_name in existing_columns
183
+ ]
184
+
185
+ # Return default if there are no valid columns
186
+ if not valid_columns:
187
+ return get_all_column_choices()
188
+
189
+ return valid_columns
190
+
191
+
192
+ # Update the column_selector initialization
193
+ def get_initial_columns():
194
+ """Get initial columns to show in the dropdown"""
195
+ try:
196
+ # Get available columns in the main dataframe
197
+ available_cols = list(LEADERBOARD_DF.columns)
198
+ logger.info(f"Available columns in LEADERBOARD_DF: {available_cols}")
199
+
200
+ # If dataframe is empty, use default visible columns
201
+ if not available_cols:
202
+ return get_default_visible_columns()
203
+
204
+ # Get default visible columns that actually exist in the dataframe
205
+ valid_defaults = [
206
+ col for col in get_default_visible_columns() if col in available_cols
207
+ ]
208
+
209
+ # If none of the defaults exist, return all available columns
210
+ if not valid_defaults:
211
+ return available_cols
212
+
213
+ return valid_defaults
214
+ except Exception as e:
215
+ logger.error(f"Error getting initial columns: {e}")
216
+ return get_default_visible_columns()
217
+
218
+
219
+ def init_leaderboard(dataframe, visible_columns=None):
220
+ """
221
+ Initialize a standard Gradio Dataframe component for the leaderboard.
222
+ """
223
+ if dataframe is None or dataframe.empty:
224
+ # Create an empty dataframe with the right columns
225
+ columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
226
+ dataframe = pd.DataFrame(columns=columns)
227
+ logger.warning("Initializing empty leaderboard")
228
+
229
+ # Lowercase model_name for display
230
+ if "model_name" in dataframe.columns:
231
+ dataframe = dataframe.copy()
232
+ dataframe["model_name"] = dataframe["model_name"].str.lower()
233
+
234
+ if "model_type" in dataframe.columns:
235
+ dataframe = dataframe.copy()
236
+ dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
237
+
238
+ if "review_model_type" in dataframe.columns:
239
+ dataframe = dataframe.copy()
240
+ dataframe["review_model_type"] = dataframe["review_model_type"].str.replace("custom", "custom")
241
+
242
+ # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
243
+
244
+ # Determine which columns to display
245
+ display_column_names = [
246
+ getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS
247
+ ]
248
+ hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
249
+
250
+ # Columns that should always be shown
251
+ always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
252
+
253
+ # Use provided visible columns if specified, otherwise use default
254
+ if visible_columns is None:
255
+ # Determine which columns to show initially
256
+ visible_columns = [
257
+ col for col in display_column_names if col not in hidden_column_names
258
+ ]
259
+
260
+ # Always include the never-hidden columns
261
+ for col in always_visible:
262
+ if col not in visible_columns and col in dataframe.columns:
263
+ visible_columns.append(col)
264
+
265
+ # Make sure we only include columns that actually exist in the dataframe
266
+ visible_columns = [col for col in visible_columns if col in dataframe.columns]
267
+
268
+ # Map CodeReview column types to Gradio's expected datatype strings
269
+ # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
270
+ type_mapping = {
271
+ "text": "str",
272
+ "number": "number",
273
+ "bool": "bool",
274
+ "date": "date",
275
+ "markdown": "markdown",
276
+ "html": "html",
277
+ "image": "image",
278
+ }
279
+
280
+ # Create a list of datatypes in the format Gradio expects
281
+ datatypes = []
282
+ for col in visible_columns:
283
+ # Find the corresponding CODEREVIEW_COLUMN entry
284
+ col_type = None
285
+ for display_col in DISPLAY_COLS:
286
+ if getattr(CODEREVIEW_COLUMN, display_col).name == col:
287
+ orig_type = getattr(CODEREVIEW_COLUMN, display_col).type
288
+ # Map to Gradio's expected types
289
+ col_type = type_mapping.get(orig_type, "str")
290
+ break
291
+
292
+ # Default to 'str' if type not found or not mappable
293
+ if col_type is None:
294
+ col_type = "str"
295
+
296
+ datatypes.append(col_type)
297
+
298
+ # Create a dummy column for search functionality if it doesn't exist
299
+ if "search_dummy" not in dataframe.columns:
300
+ dataframe["search_dummy"] = dataframe.apply(
301
+ lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
302
+ axis=1,
303
+ )
304
+
305
+ # Select only the visible columns for display
306
+ visible_columns.remove("model_name")
307
+
308
+ visible_columns = ["model_name"] + visible_columns
309
+ display_df = dataframe[visible_columns].copy()
310
+
311
+ # print(f"--- DataFrame inside init_leaderboard (before rounding) ---")
312
+ # print(display_df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head() if all(c in display_df.columns for c in ['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']) else "Relevant columns not present")
313
+ # print(f"-------------------------------------------------------------")
314
+
315
+ # Round numeric columns to 3 decimal places for display
316
+ numeric_cols = display_df.select_dtypes(include=np.number).columns
317
+ for col in numeric_cols:
318
+ # Avoid rounding integer columns like counts
319
+ if not pd.api.types.is_integer_dtype(display_df[col]):
320
+ # Format floats to exactly 3 decimal places, preserving trailing zeros
321
+ display_df[col] = display_df[col].apply(
322
+ lambda x: f"{x:.3f}" if pd.notna(x) else None
323
+ )
324
+
325
+ column_info_map = {
326
+ f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
327
  }
328
+ column_mapping = {
329
+ col: column_info_map.get(col, ColumnInfo(col, col)).display_name
330
+ for col in visible_columns
331
+ }
332
+
333
+ # Rename columns in the DataFrame
334
+ display_df.rename(columns=column_mapping, inplace=True)
335
+
336
+ # Apply styling - note: styling might need adjustment if it relies on column names
337
+ styler = display_df.style.set_properties(**{"text-align": "right"}).set_properties(
338
+ subset=["Model"], **{"width": "200px"}
339
  )
340
+
341
+ return gr.Dataframe(
342
+ value=styler,
343
+ datatype=datatypes,
344
+ interactive=False,
345
+ wrap=True,
346
+ height=2500,
347
+ elem_id="leaderboard-table",
348
+ row_count=len(display_df),
349
  )
350
+
351
+
352
+ def search_filter_leaderboard(
353
+ df, search_query="", model_types=None, version=CURRENT_VERSION
354
+ ):
 
 
 
 
 
 
 
 
355
  """
356
+ Filter the leaderboard based on search query and model types.
357
+ """
358
+ if df is None or df.empty:
359
+ return df
 
 
 
 
 
 
360
 
361
+ filtered_df = df.copy()
362
+
363
+ # Add search dummy column if it doesn't exist
364
+ if "search_dummy" not in filtered_df.columns:
365
+ filtered_df["search_dummy"] = filtered_df.apply(
366
+ lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
367
+ axis=1,
368
+ )
369
+
370
+ # Apply model type filter
371
+ if model_types and len(model_types) > 0:
372
+ filtered_df = filtered_df[
373
+ filtered_df[CODEREVIEW_COLUMN.model_type.name].isin(model_types)
374
+ ]
375
+
376
+ # Apply search query
377
+ if search_query:
378
+ search_terms = [
379
+ term.strip() for term in search_query.split(";") if term.strip()
380
+ ]
381
+ if search_terms:
382
+ combined_mask = None
383
+ for term in search_terms:
384
+ mask = filtered_df["search_dummy"].str.contains(
385
+ term, case=False, na=False
386
+ )
387
+ if combined_mask is None:
388
+ combined_mask = mask
389
+ else:
390
+ combined_mask = combined_mask | mask
391
+
392
+ if combined_mask is not None:
393
+ filtered_df = filtered_df[combined_mask]
394
+
395
+ # Drop the search dummy column before returning
396
+ visible_columns = [col for col in filtered_df.columns if col != "search_dummy"]
397
+ return filtered_df[visible_columns]
398
+
399
+
400
+ def refresh_data_with_filters(
401
+ version=CURRENT_VERSION, search_query="", model_types=None, selected_columns=None
402
  ):
403
+ """
404
+ Refresh the leaderboard data and update all components with filtering.
405
+ Ensures we handle cases where dataframes might have limited columns.
406
+ """
407
+ global LEADERBOARD_DF
408
+ try:
409
+ logger.info(f"Performing refresh of leaderboard data with filters...")
410
+ # Get new data
411
+ main_df = get_leaderboard_df(version=version)
412
+ LEADERBOARD_DF = main_df
413
+ category_dfs = [
414
+ get_category_leaderboard_df(category, version=version)
415
+ for category in CATEGORIES
416
+ ]
417
+ selected_columns = [
418
+ x.lower()
419
+ .replace(" ", "_")
420
+ .replace("(", "")
421
+ .replace(")", "")
422
+ .replace("_recall", "_recall_binary")
423
+ .replace("_precision", "_precision_binary")
424
+ for x in selected_columns
425
+ ]
426
+
427
+ # Log the actual columns we have
428
+ logger.info(f"Main dataframe columns: {list(main_df.columns)}")
429
+
430
+ # Apply filters to each dataframe
431
+ filtered_main_df = search_filter_leaderboard(
432
+ main_df, search_query, model_types, version
433
+ )
434
+ filtered_category_dfs = [
435
+ search_filter_leaderboard(df, search_query, model_types, version)
436
+ for df in category_dfs
437
+ ]
438
+
439
+ # Get available columns from the dataframe
440
+ available_columns = list(filtered_main_df.columns)
441
+
442
+ # Filter selected columns to only those available in the data
443
+ if selected_columns:
444
+ # Convert display names to internal names first
445
+ internal_selected_columns = [
446
+ x.lower()
447
+ .replace(" ", "_")
448
+ .replace("(", "")
449
+ .replace(")", "")
450
+ .replace("_recall", "_recall_binary")
451
+ .replace("_precision", "_precision_binary")
452
+ for x in selected_columns
453
+ ]
454
+ valid_selected_columns = [
455
+ col for col in internal_selected_columns if col in available_columns
456
+ ]
457
+ if not valid_selected_columns and "model_name" in available_columns:
458
+ # Fallback if conversion/filtering leads to empty selection
459
+ valid_selected_columns = ["model_name"] + [
460
+ col
461
+ for col in get_default_visible_columns()
462
+ if col in available_columns
463
+ ]
464
+ else:
465
+ # If no columns were selected in the dropdown, use default visible columns that exist
466
+ valid_selected_columns = [
467
+ col for col in get_default_visible_columns() if col in available_columns
468
+ ]
469
+
470
+ # Initialize dataframes for display with valid selected columns
471
+ main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns)
472
+
473
+ # For category dataframes, get columns that actually exist in each one
474
+ category_dataframes = []
475
+ for df in filtered_category_dfs:
476
+ df_columns = list(df.columns)
477
+ df_valid_columns = [
478
+ col for col in valid_selected_columns if col in df_columns
479
+ ]
480
+ if not df_valid_columns and "model_name" in df_columns:
481
+ df_valid_columns = ["model_name"] + get_default_visible_columns()
482
+ category_dataframes.append(init_leaderboard(df, df_valid_columns))
483
+
484
+ return main_dataframe, *category_dataframes
485
+
486
+ except Exception as e:
487
+ logger.error(f"Error in refresh with filters: {e}")
488
+ # Return the current leaderboards on error
489
+ return leaderboard, *[
490
+ tab.children[0] for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
491
+ ]
492
+
493
+
494
+ def submit_results(
495
+ model_name: str,
496
+ base_model: str,
497
+ revision: str,
498
+ precision: str,
499
+ weight_type: str,
500
+ model_type: str,
501
+ mode: str,
502
+ submission_file: tempfile._TemporaryFileWrapper,
503
+ version: str,
504
+ review_model_type: ReviewModelType,
505
+ ):
506
+ """
507
+ Handle submission of results with model metadata.
508
+ """
509
+ if submission_file is None:
510
+ return styled_error("No submission file provided")
511
+
512
+ if not model_name:
513
+ return styled_error("Model name is required")
514
+
515
+ if not model_type:
516
+ return styled_error("Please select a model type")
517
+
518
+ if not mode:
519
+ return styled_error("Please select an inference mode")
520
+
521
+ file_path = submission_file.name
522
+ logger.info(f"Received submission for model {model_name}: {file_path}")
523
+
524
+ # Add metadata to the submission
525
+ metadata = {
526
+ "model_name": model_name,
527
+ "base_model": base_model,
528
+ "revision": revision if revision else "main",
529
+ "precision": precision,
530
+ "weight_type": weight_type,
531
+ "model_type": model_type,
532
+ "mode": mode,
533
+ "version": version,
534
+ "review_model_type": review_model_type,
535
+ }
536
+
537
+ # Process the submission
538
+ result = process_submission(file_path, metadata, version=version)
539
+
540
+ # Refresh the leaderboard data
541
+ global LEADERBOARD_DF
542
+ try:
543
+ logger.info(
544
+ f"Refreshing leaderboard data after submission for version {version}..."
545
  )
546
+ LEADERBOARD_DF = get_leaderboard_df(version=version)
547
+ logger.info("Refreshed leaderboard data after submission")
548
+ except Exception as e:
549
+ logger.error(f"Error refreshing leaderboard data: {e}")
550
+
551
+ return result
552
+
553
+
554
+ def refresh_data(version=CURRENT_VERSION):
555
+ """
556
+ Refresh the leaderboard data and update all components.
557
+ """
558
+ try:
559
+ logger.info(f"Performing scheduled refresh of leaderboard data...")
560
+ # Get new data
561
+ main_df = get_leaderboard_df(version=version)
562
+ category_dfs = [
563
+ get_category_leaderboard_df(category, version=version)
564
+ for category in CATEGORIES
565
+ ]
566
+
567
+ # For gr.Dataframe, we return the actual dataframes
568
+ return main_df, *category_dfs
569
+
570
+ except Exception as e:
571
+ logger.error(f"Error in scheduled refresh: {e}")
572
+ return None, *[None for _ in CATEGORIES]
573
+
574
+
575
+ def update_leaderboards(version):
576
+ """
577
+ Update all leaderboard components with data for the selected version.
578
+ """
579
+ try:
580
+ new_df = get_leaderboard_df(version=version)
581
+ category_dfs = [
582
+ get_category_leaderboard_df(category, version=version)
583
+ for category in CATEGORIES
584
+ ]
585
+ return new_df, *category_dfs
586
+ except Exception as e:
587
+ logger.error(f"Error updating leaderboards for version {version}: {e}")
588
+ return None, *[None for _ in CATEGORIES]
589
+
590
+
591
+ def create_performance_plot(
592
+ selected_models, category, metric="f1_binary", version=CURRENT_VERSION
593
+ ):
594
+ """
595
+ Create a radar plot comparing model performance for selected models.
596
+ """
597
+ if category == "All Results":
598
+ df = get_leaderboard_df(version=version)
599
  else:
600
+ df = get_category_leaderboard_df(category, version=version)
601
+
602
+ if df.empty:
603
+ return go.Figure()
604
+
605
+ # Lowercase model_name in df and selected_models
606
+ df = df.copy()
607
+ df["model_name"] = df["model_name"].str.lower()
608
+ selected_models = [m.lower() for m in selected_models]
609
+ df = df[df["model_name"].isin(selected_models)]
610
+ metric_cols = [col for col in df.columns if metric in col]
611
+ fig = go.Figure()
612
+ colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
613
+ for idx, model in enumerate(selected_models):
614
+ model_data = df[df["model_name"] == model]
615
+ if not model_data.empty:
616
+ values = model_data[metric_cols].values[0].tolist()
617
+ values = values + [values[0]]
618
+ categories = [col.replace(f"_{metric}", "") for col in metric_cols]
619
+ # Replace 'jailbreaked' with 'jailbroken' in categories
620
+ categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
621
+ categories = categories + [categories[0]]
622
+ fig.add_trace(
623
+ go.Scatterpolar(
624
+ r=values,
625
+ theta=categories,
626
+ name=model,
627
+ line_color=colors[idx % len(colors)],
628
+ fill="toself",
629
  )
630
+ )
631
+ fig.update_layout(
632
+ paper_bgcolor="#000000",
633
+ plot_bgcolor="#000000",
634
+ font={"color": "#ffffff"},
635
+ title={
636
+ "text": f"{category} - {metric.upper()} Score Comparison",
637
+ "font": {"color": "#ffffff", "size": 24},
638
+ },
639
+ polar=dict(
640
+ bgcolor="#000000",
641
+ radialaxis=dict(
642
+ visible=True,
643
+ range=[0, 1],
644
+ gridcolor="#333333",
645
+ linecolor="#333333",
646
+ tickfont={"color": "#ffffff"},
647
+ ),
648
+ angularaxis=dict(
649
+ gridcolor="#333333",
650
+ linecolor="#333333",
651
+ tickfont={"color": "#ffffff"},
652
+ ),
653
+ ),
654
+ height=600,
655
+ showlegend=True,
656
+ legend=dict(
657
+ yanchor="top",
658
+ y=0.99,
659
+ xanchor="right",
660
+ x=0.99,
661
+ bgcolor="rgba(0,0,0,0.5)",
662
+ font={"color": "#ffffff"},
663
+ ),
664
+ )
665
+ return fig
666
+
667
+
668
+ def update_model_choices(version):
669
+ """
670
+ Update the list of available models for the given version.
671
+ """
672
+ df = get_leaderboard_df(version=version)
673
+ if df.empty:
674
+ return []
675
+ return sorted(df["model_name"].str.lower().unique().tolist())
676
+
677
+
678
+ def update_visualization(selected_models, selected_category, selected_metric, version):
679
+ """
680
+ Update the visualization based on user selections.
681
+ """
682
+ if not selected_models:
683
+ return go.Figure()
684
+ return create_performance_plot(
685
+ selected_models, selected_category, selected_metric, version
686
+ )
687
+
688
+
689
+ # Create Gradio app
690
+ demo = gr.Blocks(css=custom_css, theme=custom_theme)
691
+
692
+ CATEGORY_DISPLAY_MAP = {
693
+ "Python": "Python",
694
+ "JavaScript": "JavaScript",
695
+ "Java": "Java",
696
+ "C++": "C++",
697
+ "C#": "C#",
698
+ "TypeScript": "TypeScript",
699
+ "Go": "Go",
700
+ "Rust": "Rust",
701
+ "Swift": "Swift",
702
+ "Kotlin": "Kotlin",
703
+ "Ruby": "Ruby",
704
+ "PHP": "PHP",
705
+ "C": "C",
706
+ "Scala": "Scala",
707
+ "R": "R",
708
+ "Dart": "Dart",
709
+ "Other": "Other"
710
+ }
711
+ # Create reverse mapping for lookups
712
+ CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
713
+
714
+ with demo:
715
+ gr.HTML(TITLE)
716
+ # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
717
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
718
+
719
+ with gr.Row():
720
+ tabs = gr.Tabs(elem_classes="tab-buttons")
721
+
722
+ with tabs:
723
+ with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
724
+ with gr.Row():
725
+ version_selector = gr.Dropdown(
726
+ choices=BENCHMARK_VERSIONS,
727
+ label="Benchmark Version",
728
+ value=CURRENT_VERSION,
729
+ interactive=True,
730
+ elem_classes="version-selector",
731
+ scale=1,
732
+ visible=False,
733
+ )
734
+
735
+ with gr.Row():
736
+ search_input = gr.Textbox(
737
+ placeholder="Search by models (use ; to split)",
738
+ label="Search",
739
+ elem_id="search-bar",
740
+ scale=2,
741
+ )
742
+ model_type_filter = gr.Dropdown(
743
+ choices=[
744
+ t.to_str("-") for t in ModelType if t != ModelType.Unknown and t != ModelType.ClosedSource
745
+ ],
746
+ label="Access Type",
747
+ multiselect=True,
748
+ value=[],
749
+ interactive=True,
750
+ scale=1,
751
+ )
752
+ column_selector = gr.Dropdown(
753
+ choices=get_all_column_choices(),
754
+ label="Columns",
755
+ multiselect=True,
756
+ value=get_initial_columns(),
757
+ interactive=True,
758
+ visible=False,
759
+ scale=1,
760
+ )
761
+ with gr.Row():
762
+ refresh_button = gr.Button(
763
+ "Refresh", scale=0, elem_id="refresh-button"
764
+ )
765
+
766
+ # Create tabs for each category
767
+ with gr.Tabs(elem_classes="category-tabs") as category_tabs:
768
+ # First tab for average metrics across all categories
769
+ with gr.TabItem("All Results", elem_id="overall-tab"):
770
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
771
+
772
+ # Create a tab for each category using display names
773
+ for category in CATEGORIES:
774
+ display_name = CATEGORY_DISPLAY_MAP.get(category, category)
775
+ elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
776
+ with gr.TabItem(display_name, elem_id=elem_id):
777
+ category_df = get_category_leaderboard_df(
778
+ category, version=CURRENT_VERSION
779
+ )
780
+ category_leaderboard = init_leaderboard(category_df)
781
+
782
+ # Connect search and filter inputs to update function
783
+ def update_with_search_filters(
784
+ version=CURRENT_VERSION,
785
+ search_query="",
786
+ model_types=None,
787
+ selected_columns=None,
788
+ ):
789
+ """
790
+ Update the leaderboards with search and filter settings.
791
+ """
792
+ return refresh_data_with_filters(
793
+ version, search_query, model_types, selected_columns
794
+ )
795
+
796
+ # Refresh button functionality
797
+ def refresh_and_update(
798
+ version, search_query, model_types, selected_columns
799
+ ):
800
+ """
801
+ Refresh data, update LEADERBOARD_DF, and return updated components.
802
+ """
803
+ global LEADERBOARD_DF
804
+ main_df = get_leaderboard_df(version=version)
805
+ LEADERBOARD_DF = main_df # Update the global DataFrame
806
+ return refresh_data_with_filters(
807
+ version, search_query, model_types, selected_columns
808
+ )
809
+
810
+ refresh_button.click(
811
+ fn=refresh_and_update,
812
+ inputs=[
813
+ version_selector,
814
+ search_input,
815
+ model_type_filter,
816
+ column_selector,
817
+ ],
818
+ outputs=[leaderboard]
819
+ + [
820
+ category_tabs.children[i].children[0]
821
+ for i in range(1, len(CATEGORIES) + 1)
822
+ ],
823
  )
824
+ # Search input functionality
825
+ search_input.change(
826
+ fn=refresh_data_with_filters,
827
+ inputs=[
828
+ version_selector,
829
+ search_input,
830
+ model_type_filter,
831
+ column_selector,
832
+ ],
833
+ outputs=[leaderboard]
834
+ + [
835
+ category_tabs.children[i].children[0]
836
+ for i in range(1, len(CATEGORIES) + 1)
837
+ ],
838
  )
839
+
840
+ # Model type filter functionality
841
+ model_type_filter.change(
842
+ fn=refresh_data_with_filters,
843
+ inputs=[
844
+ version_selector,
845
+ search_input,
846
+ model_type_filter,
847
+ column_selector,
848
+ ],
849
+ outputs=[leaderboard]
850
+ + [
851
+ category_tabs.children[i].children[0]
852
+ for i in range(1, len(CATEGORIES) + 1)
853
+ ],
 
 
 
 
854
  )
855
+
856
+ # Version selector functionality
857
+ version_selector.change(
858
+ fn=refresh_data_with_filters,
859
+ inputs=[
860
+ version_selector,
861
+ search_input,
862
+ model_type_filter,
863
+ column_selector,
864
+ ],
865
+ outputs=[leaderboard]
866
+ + [
867
+ category_tabs.children[i].children[0]
868
+ for i in range(1, len(CATEGORIES) + 1)
869
+ ],
870
  )
871
+
872
+ # Update the update_columns function to handle updating all tabs at once
873
+ def update_columns(selected_columns):
874
+ """
875
+ Update all leaderboards to show the selected columns.
876
+ Ensures all selected columns are preserved in the update.
877
+
878
+ """
879
+
880
+ try:
881
+ logger.info(f"Updating columns to show: {selected_columns}")
882
+
883
+ # If no columns are selected, use default visible columns
884
+ if not selected_columns or len(selected_columns) == 0:
885
+ selected_columns = get_default_visible_columns()
886
+ logger.info(
887
+ f"No columns selected, using defaults: {selected_columns}"
888
+ )
889
+
890
+ # Convert display names to internal names
891
+ internal_selected_columns = [
892
+ x.lower()
893
+ .replace(" ", "_")
894
+ .replace("(", "")
895
+ .replace(")", "")
896
+ .replace("_recall", "_recall_binary")
897
+ .replace("_precision", "_precision_binary")
898
+ for x in selected_columns
899
+ ]
900
+
901
+ # Get the current data with ALL columns preserved
902
+ main_df = get_leaderboard_df(version=version_selector.value)
903
+
904
+ # Get category dataframes with ALL columns preserved
905
+ category_dfs = [
906
+ get_category_leaderboard_df(
907
+ category, version=version_selector.value
908
+ )
909
+ for category in CATEGORIES
910
+ ]
911
+
912
+ # Log columns for debugging
913
+ logger.info(f"Main dataframe columns: {list(main_df.columns)}")
914
+ logger.info(
915
+ f"Selected columns (internal): {internal_selected_columns}"
916
+ )
917
+
918
+ # IMPORTANT: Make sure model_name is always included
919
+ if (
920
+ "model_name" in main_df.columns
921
+ and "model_name" not in internal_selected_columns
922
+ ):
923
+ internal_selected_columns = [
924
+ "model_name"
925
+ ] + internal_selected_columns
926
+
927
+ # Initialize the main leaderboard with the selected columns
928
+ # We're passing the internal_selected_columns directly to preserve the selection
929
+ main_leaderboard = init_leaderboard(
930
+ main_df, internal_selected_columns
931
+ )
932
+
933
+ # Initialize category dataframes with the same selected columns
934
+ # This ensures consistency across all tabs
935
+ category_leaderboards = []
936
+ for df in category_dfs:
937
+ # Use the same selected columns for each category
938
+ # init_leaderboard will automatically handle filtering to columns that exist
939
+ category_leaderboards.append(
940
+ init_leaderboard(df, internal_selected_columns)
941
+ )
942
+
943
+ return main_leaderboard, *category_leaderboards
944
+
945
+ except Exception as e:
946
+ logger.error(f"Error updating columns: {e}")
947
+ import traceback
948
+
949
+ logger.error(traceback.format_exc())
950
+ return leaderboard, *[
951
+ tab.children[0]
952
+ for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
953
+ ]
954
+
955
+ # Connect column selector to update function
956
+ column_selector.change(
957
+ fn=update_columns,
958
+ inputs=[column_selector],
959
+ outputs=[leaderboard]
960
+ + [
961
+ category_tabs.children[i].children[0]
962
+ for i in range(1, len(CATEGORIES) + 1)
963
+ ],
964
  )
965
+
966
+ with gr.TabItem("Visualize", elem_id="codereview-viz-tab", id=1):
967
+ with gr.Row():
968
+ with gr.Column():
969
+ viz_version_selector = gr.Dropdown(
970
+ choices=BENCHMARK_VERSIONS,
971
+ label="Benchmark Version",
972
+ value=CURRENT_VERSION,
973
+ interactive=True,
974
+ visible=False,
975
+ )
976
+
977
+ # New: Mode selector
978
+ def get_model_mode_choices(version):
979
+ df = get_leaderboard_df(version=version)
980
+ if df.empty:
981
+ return []
982
+ return sorted([
983
+ f"{str(row['model_name']).lower()} [{row['mode']}]"
984
+ for _, row in df.drop_duplicates(subset=["model_name", "mode"]).iterrows()
985
+ ])
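+ # The "model [mode]" labels built above are split back apart with
+ # rsplit(" [", 1) in update_visualization_with_mode, so keep the two formats in sync.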
986
+
987
+ model_mode_selector = gr.Dropdown(
988
+ choices=get_model_mode_choices(CURRENT_VERSION),
989
+ label="Select Model(s) [Mode] to Compare",
990
+ multiselect=True,
991
+ interactive=True,
992
+ )
993
+ with gr.Column():
994
+ # Add Overall Performance to categories, use display names
995
+ viz_categories_display = ["All Results"] + [
996
+ CATEGORY_DISPLAY_MAP.get(cat, cat) for cat in CATEGORIES
997
+ ]
998
+ category_selector = gr.Dropdown(
999
+ choices=viz_categories_display,
1000
+ label="Select Category",
1001
+ value=viz_categories_display[0],
1002
+ interactive=True,
1003
+ )
1004
+ metric_selector = gr.Dropdown(
1005
+ choices=[
1006
+ "accuracy",
1007
+ "f1_binary",
1008
+ "precision_binary",
1009
+ "recall_binary",
1010
+ "error_ratio",
1011
+ ],
1012
+ label="Select Metric",
1013
+ value="accuracy",
1014
+ interactive=True,
1015
+ )
1016
+
1017
+ plot_output = gr.Plot()
1018
+
1019
+ # Update visualization when any selector changes
1020
+ def update_visualization_with_mode(
1021
+ selected_model_modes, selected_category, selected_metric, version
1022
+ ):
1023
+ if not selected_model_modes:
1024
+ return go.Figure()
1025
+ df = (
1026
+ get_leaderboard_df(version=version)
1027
+ if selected_category == "All Results"
1028
+ else get_category_leaderboard_df(selected_category, version=version)
1029
  )
1030
+ if df.empty:
1031
+ return go.Figure()
1032
+ df = df.copy()
1033
+ df["model_name"] = df["model_name"].str.lower()
1034
+ selected_pairs = [s.rsplit(" [", 1) for s in selected_model_modes]
1035
+ selected_pairs = [
1036
+ (name.strip().lower(), mode.strip("] "))
1037
+ for name, mode in selected_pairs
1038
+ ]
1039
+ mask = df.apply(
1040
+ lambda row: (row["model_name"], str(row["mode"])) in selected_pairs,
1041
+ axis=1,
1042
  )
1043
+ filtered_df = df[mask]
1044
+ metric_cols = [col for col in filtered_df.columns if selected_metric in col]
1045
+ fig = go.Figure()
1046
+ colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
1047
+ for idx, (model_name, mode) in enumerate(selected_pairs):
1048
+ model_data = filtered_df[
1049
+ (filtered_df["model_name"] == model_name)
1050
+ & (filtered_df["mode"] == mode)
1051
+ ]
1052
+ if not model_data.empty:
1053
+ values = model_data[metric_cols].values[0].tolist()
1054
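+ # Repeat the first value (and, below, the first category) so the radar trace closes its polygon.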
+ values = values + [values[0]]
1055
+ categories = [col.replace(f"_{selected_metric}", "") for col in metric_cols]
1056
+ # Replace 'jailbreaked' with 'jailbroken' in categories
1057
+ categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
1058
+ categories = categories + [categories[0]]
1059
+ fig.add_trace(
1060
+ go.Scatterpolar(
1061
+ r=values,
1062
+ theta=categories,
1063
+ name=f"{model_name} [{mode}]",
1064
+ line_color=colors[idx % len(colors)],
1065
+ fill="toself",
1066
+ )
1067
+ )
1068
+ fig.update_layout(
1069
+ paper_bgcolor="#000000",
1070
+ plot_bgcolor="#000000",
1071
+ font={"color": "#ffffff"},
1072
+ title={
1073
+ "text": f"{selected_category} - {selected_metric.upper()} Score Comparison",
1074
+ "font": {"color": "#ffffff", "size": 24},
1075
+ },
1076
+ polar=dict(
1077
+ bgcolor="#000000",
1078
+ radialaxis=dict(
1079
+ visible=True,
1080
+ range=[0, 1],
1081
+ gridcolor="#333333",
1082
+ linecolor="#333333",
1083
+ tickfont={"color": "#ffffff"},
1084
+ ),
1085
+ angularaxis=dict(
1086
+ gridcolor="#333333",
1087
+ linecolor="#333333",
1088
+ tickfont={"color": "#ffffff"},
1089
+ ),
1090
+ ),
1091
+ height=600,
1092
+ showlegend=True,
1093
+ legend=dict(
1094
+ yanchor="top",
1095
+ y=0.99,
1096
+ xanchor="right",
1097
+ x=0.99,
1098
+ bgcolor="rgba(0,0,0,0.5)",
1099
+ font={"color": "#ffffff"},
1100
+ ),
1101
+ )
1102
+ return fig
1103
+
1104
+ # Connect selectors to update function
1105
+ for control in [
1106
+ viz_version_selector,
1107
+ model_mode_selector,
1108
+ category_selector,
1109
+ metric_selector,
1110
+ ]:
1111
+ control.change(
1112
+ fn=lambda smm, sc, s_metric, v: update_visualization_with_mode(
1113
+ smm, CATEGORY_REVERSE_MAP.get(sc, sc), s_metric, v
1114
+ ),
1115
+ inputs=[
1116
+ model_mode_selector,
1117
+ category_selector,
1118
+ metric_selector,
1119
+ viz_version_selector,
1120
+ ],
1121
+ outputs=plot_output,
1122
+ )
1123
+
1124
+ # Update model_mode_selector choices when version changes
1125
+ viz_version_selector.change(
1126
+ fn=get_model_mode_choices,
1127
+ inputs=[viz_version_selector],
1128
+ outputs=[model_mode_selector],
1129
  )
 
 
 
 
1130
 
1131
+ # with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
1132
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
1133
+
1134
+ with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=3):
1135
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1136
+
1137
+ with gr.Row():
1138
+ # with gr.Column(scale=3):
1139
+ # gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
1140
+ with gr.Column(scale=1):
1141
+ # Add version selector specifically for the submission tab
1142
+ submission_version_selector = gr.Dropdown(
1143
+ choices=BENCHMARK_VERSIONS,
1144
+ label="Benchmark Version",
1145
+ value=CURRENT_VERSION,
1146
+ interactive=True,
1147
+ elem_classes="version-selector",
1148
+ visible=False,
1149
+ )
1150
+
1151
+ with gr.Row():
1152
+ with gr.Column():
1153
+ model_name_textbox = gr.Textbox(label="Model name")
1154
+ mode_selector = gr.Dropdown(
1155
+ choices=[m.name for m in Mode],
1156
+ label="Mode",
1157
+ multiselect=False,
1158
+ value=None,
1159
+ interactive=True,
1160
+ )
1161
+ revision_name_textbox = gr.Textbox(
1162
+ label="Revision commit", placeholder="main"
1163
+ )
1164
+ model_type = gr.Dropdown(
1165
+ choices=[
1166
+ t.to_str("-")
1167
+ for t in ModelType
1168
+ if t != ModelType.Unknown and t != ModelType.ClosedSource
1169
+ ],
1170
+ label="Model type",
1171
+ multiselect=False,
1172
+ value=None,
1173
+ interactive=True,
1174
+ )
1175
+ review_model_type = gr.Dropdown(
1176
+ choices=[t.name for t in ReviewModelType],
1177
+ label="Review model type",
1178
+ multiselect=False,
1179
+ value=ReviewModelType.CUSTOM.name,
1180
+ interactive=True,
1181
+ )
1182
+
1183
+ with gr.Column():
1184
+ precision = gr.Dropdown(
1185
+ choices=[
1186
+ i.name for i in Precision if i != Precision.Unknown
1187
+ ],
1188
+ label="Precision",
1189
+ multiselect=False,
1190
+ value="float16",
1191
+ interactive=True,
1192
+ )
1193
+ weight_type = gr.Dropdown(
1194
+ choices=[i.name for i in WeightType],
1195
+ label="Weights type",
1196
+ multiselect=False,
1197
+ value="Original",
1198
+ interactive=True,
1199
+ )
1200
+ base_model_name_textbox = gr.Textbox(
1201
+ label="Base model (for delta or adapter weights)"
1202
+ )
1203
+
1204
+ with gr.Row():
1205
+ file_input = gr.File(
1206
+ label="Upload JSONL Results File", file_types=[".jsonl"]
1207
+ )
1208
+
1209
+ submit_button = gr.Button("Submit Results")
1210
+ result_output = gr.Markdown()
1211
+
1212
+ submit_button.click(
1213
+ fn=submit_results,
1214
+ inputs=[
1215
+ model_name_textbox,
1216
+ base_model_name_textbox,
1217
+ revision_name_textbox,
1218
+ precision,
1219
+ weight_type,
1220
+ model_type,
1221
+ mode_selector,
1222
+ file_input,
1223
+ submission_version_selector,
1224
+ review_model_type,
1225
+ ],
1226
+ outputs=result_output,
1227
+ )
1228
+
1229
+ # Version selector functionality
1230
+ version_selector.change(
1231
+ fn=update_leaderboards,
1232
+ inputs=[version_selector],
1233
+ outputs=[leaderboard]
1234
+ + [
1235
+ category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1236
+ ],
1237
+ ).then(
1238
+ lambda version: refresh_data_with_filters(version),
1239
+ inputs=[version_selector],
1240
+ outputs=[leaderboard]
1241
+ + [
1242
+ category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1243
+ ],
1244
  )
1245
 
1246
+
1247
+ # Set up the scheduler to refresh data periodically
1248
+ scheduler = BackgroundScheduler()
1249
+ scheduler.add_job(refresh_data, "interval", minutes=30)
1250
+ scheduler.start()
1251
+
1252
+ # Launch the app
1253
+ if __name__ == "__main__":
1254
+ demo.launch()
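+ # launch() starts the Gradio server; on a Hugging Face Space the app module is
+ # typically run as __main__, so this guard brings the UI up there as well.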
example_submission.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"model_name": "GPT-4-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
2
+ {"model_name": "GPT-4-CodeReview", "programming_language": "javascript", "comment_language": "en", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
3
+ {"model_name": "Claude-3-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 8.0, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
4
+ {"model_name": "Llama-CodeReview", "programming_language": "java", "comment_language": "en", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
requirements.txt CHANGED
@@ -1,19 +1,8 @@
1
- APScheduler
2
- black
3
- datasets
4
- gradio>=4.0.0
5
- gradio[oauth]
6
- gradio_leaderboard==0.0.13
7
- gradio_client
8
- huggingface-hub>=0.18.0
9
- matplotlib
10
- numpy
11
- pandas>=1.3.0
12
- python-dateutil
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- sentencepiece
17
- fastapi
18
- uvicorn
19
- pydantic>=2.0.0
 
1
+ gradio==4.44.1
2
+ pandas>=2.0.0
3
+ huggingface_hub>=0.20.0
4
+ datasets>=2.0.0
5
+ apscheduler>=3.10.0
6
+ python-dotenv>=1.0.0
7
+ plotly>=5.18.0
8
+ pydantic==2.10.6
 
 
 
src/about.py CHANGED
@@ -1,48 +1,60 @@
1
  """
2
- About page content for CodeReview Leaderboard
3
  """
4
 
5
- TITLE = "🏆 CodeReview Leaderboard"
 
 
 
 
6
 
7
  INTRODUCTION_TEXT = """
8
- # CodeReview Leaderboard
9
-
10
- A comprehensive benchmark for evaluating code review generation models across multiple programming languages and comment types.
11
 
12
- ## Overview
 
 
13
 
14
- This leaderboard tracks the performance of various models on code review tasks, providing insights into:
15
- - **Programming Language Performance**: How well models perform across different programming languages
16
- - **Comment Language Support**: Effectiveness in generating reviews in different natural languages
17
- - **Taxonomy Categories**: Performance across different types of code review feedback
18
 
19
- ## Metrics
 
20
 
21
- - **BLEU**: Measures similarity between generated and reference reviews
22
- - **Pass@1/5/10**: Percentage of reviews that pass quality checks in 1, 5, or 10 attempts
23
- - **Multi-dimensional Quality Scores**: Detailed evaluation across 10 quality dimensions
24
 
25
- ## Features
26
 
27
- **Filter by Programming Language**: View results for specific programming languages (Python, JavaScript, Java, etc.)
28
- ✨ **Comment Language Support**: Filter by the natural language of code comments
29
- ✨ **Taxonomy Categories**: Browse results by review type (bug detection, style, performance, etc.)
30
- ✨ **IP-based Submissions**: Secure submission system with IP tracking
31
- ✨ **Dark Theme**: Modern, eye-friendly interface
32
  """
33
 
34
- SUBMISSION_GUIDELINES = """
35
- ## Submission Guidelines
36
 
37
- 1. **Model Requirements**: Submit results for at least 100 test cases
38
- 2. **Format**: Provide scores in the specified format ranges
39
- 3. **Reproducibility**: Include model details and evaluation setup
40
- 4. **Quality Metrics**: Rate your model across all 10 quality dimensions
41
- 5. **Metadata**: Specify programming language, comment language, and taxonomy focus
42
- """
43
 
44
- CONTACT_INFO = """
45
- ## Contact & Support
 
46
 
47
- For questions, issues, or contributions, please reach out through our repository or contact the maintainers.
48
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Text content for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ TITLE = """
6
+ <div style="text-align: center; margin-bottom: 1rem">
7
+ <h1>CodeReview Bench Leaderboard</h1>
8
+ </div>
9
+ """
10
 
11
  INTRODUCTION_TEXT = """
12
+ ## Introduction
 
 
13
 
14
+ CodeReview Bench is a comprehensive benchmark for evaluating the quality and effectiveness of automated code review systems.
15
+ This leaderboard tracks model performance across various programming languages and review criteria,
16
+ including readability, relevance, explanation clarity, and actionability.
17
 
18
+ Models are evaluated on their ability to provide high-quality code reviews that are helpful,
19
+ accurate, and actionable across multiple programming languages and review categories.
20
+ """
 
21
 
22
+ LLM_BENCHMARKS_TEXT = """
23
+ CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
24
 
25
+ It evaluates models on their ability to provide high-quality code reviews using both LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity) and exact-match metrics (pass@1, pass@5, pass@10, BLEU@10).
 
 
26
 
27
+ The benchmark supports both Russian and English comment languages across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more.
28
 
29
+ See the Submit tab to add your own model's results to the leaderboard.
 
 
 
 
30
  """
31
 
32
+ EVALUATION_QUEUE_TEXT = """
33
+ ## Submit Your Model
34
 
35
+ To add your model to the CodeReview Bench leaderboard:
 
 
 
 
 
36
 
37
+ 1. Run your evaluation using the CodeReview Bench framework.
38
+ 2. Upload your results in .jsonl format using this form.
39
+ 3. Once validated, your model will appear on the leaderboard.
40
 
41
+ ### Requirements:
42
+ - Results must include all required metrics: the LLM-based multimetric scores and the exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
43
+ - Submissions should cover multiple programming languages where applicable
44
+ - Both Russian and English comment languages are supported
45
+
46
+ ### ✉️✨ Ready? Upload your results below!
47
+ """
48
+
49
+ CITATION_BUTTON_LABEL = "Cite CodeReview Bench"
50
+
51
+ CITATION_BUTTON_TEXT = """
52
+ @misc{codereviewbench2025,
53
+ author = {CodeReview Bench Team},
54
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
55
+ year = {2025},
56
+ publisher = {GitHub},
57
+ journal = {GitHub repository},
58
+ howpublished = {\\url{https://github.com/your-org/codereview-bench}}
59
+ }
60
+ """
src/display/css_html_js.py CHANGED
@@ -1,306 +1,97 @@
1
  """
2
- Custom CSS, HTML, and JavaScript for the CodeReview Leaderboard
3
  """
4
 
5
- # Dark theme CSS
6
- DARK_THEME_CSS = """
7
- /* Dark Theme Styling */
8
- :root {
9
- --bg-primary: #0d1117;
10
- --bg-secondary: #161b22;
11
- --bg-tertiary: #21262d;
12
- --text-primary: #e6edf3;
13
- --text-secondary: #7d8590;
14
- --border-color: #30363d;
15
- --accent-color: #ffffff;
16
- --accent-hover: #f0f0f0;
17
- --danger-color: #da3633;
18
- --warning-color: #d29922;
19
- --info-color: #1f6feb;
20
  }
21
 
22
- /* Global dark theme */
23
- .gradio-container {
24
- background: var(--bg-primary) !important;
25
- color: var(--text-primary) !important;
26
  }
27
 
28
- /* Headers and text */
29
- .gradio-container h1, .gradio-container h2, .gradio-container h3 {
30
- color: var(--text-primary) !important;
31
  }
32
 
33
- .gradio-container p, .gradio-container span {
34
- color: var(--text-secondary) !important;
35
  }
36
 
37
- /* Tabs */
38
- .gradio-container .tab-nav {
39
- background: var(--bg-secondary) !important;
40
- border-bottom: 1px solid var(--border-color) !important;
 
 
41
  }
42
 
43
- .gradio-container .tab-nav button {
44
- background: transparent !important;
45
- color: var(--text-secondary) !important;
46
- border: none !important;
47
- padding: 12px 24px !important;
48
- transition: all 0.2s ease !important;
49
  }
50
 
51
- .gradio-container .tab-nav button:hover {
52
- color: var(--text-primary) !important;
53
- background: var(--bg-tertiary) !important;
54
  }
55
 
56
- .gradio-container .tab-nav button.selected {
57
- color: var(--text-primary) !important;
58
- background: var(--bg-tertiary) !important;
59
- border-bottom: 2px solid var(--accent-color) !important;
60
  }
61
 
62
- /* Tables */
63
- .gradio-container .dataframe {
64
- background: var(--bg-secondary) !important;
65
- border: 1px solid var(--border-color) !important;
66
- border-radius: 8px !important;
67
- overflow: hidden !important;
68
  }
69
 
70
- .gradio-container .dataframe table {
71
- background: var(--bg-secondary) !important;
 
72
  }
73
 
74
- .gradio-container .dataframe th {
75
- background: var(--bg-tertiary) !important;
76
- color: var(--text-primary) !important;
77
- border-bottom: 2px solid var(--border-color) !important;
78
- padding: 12px !important;
79
- font-weight: 600 !important;
80
  }
81
 
82
- .gradio-container .dataframe td {
83
- background: var(--bg-secondary) !important;
84
- color: var(--text-primary) !important;
85
- border-bottom: 1px solid var(--border-color) !important;
86
- padding: 10px 12px !important;
87
  }
88
 
89
- .gradio-container .dataframe tr:hover td {
90
- background: var(--bg-tertiary) !important;
 
91
  }
92
 
93
- /* Form inputs */
94
- .gradio-container input, .gradio-container select, .gradio-container textarea {
95
- background: var(--bg-tertiary) !important;
96
- color: var(--text-primary) !important;
97
- border: 1px solid var(--border-color) !important;
98
- border-radius: 6px !important;
99
- padding: 8px 12px !important;
 
 
100
  }
101
 
102
- .gradio-container input:focus, .gradio-container select:focus, .gradio-container textarea:focus {
103
- border-color: var(--accent-color) !important;
104
- box-shadow: 0 0 0 2px rgba(255, 255, 255, 0.2) !important;
 
105
  }
106
 
107
- /* Buttons */
108
- .gradio-container button {
109
- background: var(--accent-color) !important;
110
- color: var(--bg-primary) !important;
111
- border: 1px solid var(--border-color) !important;
112
- border-radius: 6px !important;
113
- padding: 8px 16px !important;
114
- font-weight: 500 !important;
115
- transition: all 0.2s ease !important;
116
- }
117
-
118
- .gradio-container button:hover {
119
- background: var(--accent-hover) !important;
120
- transform: translateY(-1px) !important;
121
- color: var(--bg-primary) !important;
122
- }
123
-
124
- .gradio-container button:active {
125
- transform: translateY(0) !important;
126
- }
127
-
128
- /* Dropdowns */
129
- .gradio-container .dropdown {
130
- background: var(--bg-tertiary) !important;
131
- border: 1px solid var(--border-color) !important;
132
- border-radius: 6px !important;
133
- }
134
-
135
- .gradio-container .dropdown-menu {
136
- background: var(--bg-secondary) !important;
137
- border: 1px solid var(--border-color) !important;
138
- border-radius: 6px !important;
139
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
140
- }
141
-
142
- .gradio-container .dropdown-menu .dropdown-item {
143
- color: var(--text-primary) !important;
144
- padding: 8px 12px !important;
145
- }
146
-
147
- .gradio-container .dropdown-menu .dropdown-item:hover {
148
- background: var(--bg-tertiary) !important;
149
- }
150
-
151
- /* Sliders */
152
- .gradio-container .slider {
153
- background: var(--bg-tertiary) !important;
154
- }
155
-
156
- .gradio-container .slider input[type="range"] {
157
- background: var(--bg-tertiary) !important;
158
- }
159
-
160
- .gradio-container .slider input[type="range"]::-webkit-slider-thumb {
161
- background: var(--accent-color) !important;
162
- border: 2px solid var(--bg-primary) !important;
163
- border-radius: 50% !important;
164
- width: 18px !important;
165
- height: 18px !important;
166
- }
167
-
168
- .gradio-container .slider input[type="range"]::-webkit-slider-track {
169
- background: var(--border-color) !important;
170
- border-radius: 4px !important;
171
- height: 6px !important;
172
- }
173
-
174
- /* Accordions */
175
- .gradio-container .accordion {
176
- background: var(--bg-secondary) !important;
177
- border: 1px solid var(--border-color) !important;
178
- border-radius: 8px !important;
179
- margin: 16px 0 !important;
180
- }
181
-
182
- .gradio-container .accordion-header {
183
- background: var(--bg-tertiary) !important;
184
- color: var(--text-primary) !important;
185
- padding: 16px !important;
186
- border-bottom: 1px solid var(--border-color) !important;
187
- cursor: pointer !important;
188
- font-weight: 500 !important;
189
- }
190
-
191
- .gradio-container .accordion-header:hover {
192
- background: var(--bg-primary) !important;
193
- }
194
-
195
- /* Status messages */
196
- .gradio-container .success {
197
- background: rgba(255, 255, 255, 0.1) !important;
198
- color: var(--text-primary) !important;
199
- border: 1px solid var(--accent-color) !important;
200
- border-radius: 6px !important;
201
- padding: 12px 16px !important;
202
- margin: 8px 0 !important;
203
- }
204
-
205
- .gradio-container .error {
206
- background: rgba(218, 54, 51, 0.1) !important;
207
- color: var(--danger-color) !important;
208
- border: 1px solid var(--danger-color) !important;
209
- border-radius: 6px !important;
210
- padding: 12px 16px !important;
211
- margin: 8px 0 !important;
212
- }
213
-
214
- /* Responsive design */
215
- @media (max-width: 768px) {
216
- .gradio-container {
217
- padding: 16px !important;
218
- }
219
-
220
- .gradio-container .tab-nav button {
221
- padding: 8px 16px !important;
222
- font-size: 14px !important;
223
- }
224
-
225
- .gradio-container .dataframe {
226
- font-size: 14px !important;
227
- }
228
  }
229
  """
230
-
231
- # Custom JavaScript for enhanced functionality
232
- CUSTOM_JS = """
233
- // Enhanced table sorting and filtering
234
- function enhanceTable() {
235
- const tables = document.querySelectorAll('.dataframe table');
236
- tables.forEach(table => {
237
- // Add sorting functionality
238
- const headers = table.querySelectorAll('th');
239
- headers.forEach((header, index) => {
240
- header.style.cursor = 'pointer';
241
- header.addEventListener('click', () => sortTable(table, index));
242
- });
243
- });
244
- }
245
-
246
- function sortTable(table, columnIndex) {
247
- const tbody = table.querySelector('tbody');
248
- const rows = Array.from(tbody.querySelectorAll('tr'));
249
-
250
- rows.sort((a, b) => {
251
- const aText = a.cells[columnIndex].textContent.trim();
252
- const bText = b.cells[columnIndex].textContent.trim();
253
-
254
- // Try to parse as numbers first
255
- const aNum = parseFloat(aText);
256
- const bNum = parseFloat(bText);
257
-
258
- if (!isNaN(aNum) && !isNaN(bNum)) {
259
- return bNum - aNum; // Descending for numbers
260
- }
261
-
262
- return aText.localeCompare(bText); // Ascending for text
263
- });
264
-
265
- rows.forEach(row => tbody.appendChild(row));
266
- }
267
-
268
- // Auto-refresh functionality
269
- function autoRefresh() {
270
- setInterval(() => {
271
- const refreshBtn = document.querySelector('button[aria-label="Refresh"]');
272
- if (refreshBtn) {
273
- refreshBtn.click();
274
- }
275
- }, 30000); // Refresh every 30 seconds
276
- }
277
-
278
- // Initialize enhancements
279
- document.addEventListener('DOMContentLoaded', function() {
280
- enhanceTable();
281
- autoRefresh();
282
- });
283
- """
284
-
285
- # HTML components
286
- HEADER_HTML = """
287
- <div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-bottom: 20px;">
288
- <h1 style="color: var(--text-primary); margin: 0; font-size: 2.5em; font-weight: 700;">
289
- 🏆 CodeReview Leaderboard
290
- </h1>
291
- <p style="color: var(--text-secondary); margin: 10px 0 0 0; font-size: 1.2em;">
292
- Benchmarking code review generation models across languages and categories
293
- </p>
294
- </div>
295
- """
296
-
297
- FOOTER_HTML = """
298
- <div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-top: 20px;">
299
- <p style="color: var(--text-secondary); margin: 0; font-size: 0.9em;">
300
- Built with ❤️ for the code review community |
301
- <a href="https://github.com/your-repo" style="color: var(--accent-color); text-decoration: none;">
302
- GitHub
303
- </a>
304
- </p>
305
- </div>
306
- """
 
1
  """
2
+ CSS and styling for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ custom_css = """
6
+ .markdown-text {
7
+ font-size: 16px !important;
8
+ text-align: justify !important;
9
+ line-height: 1.0 !important;
10
+ margin-top: 10px !important;
11
+ margin-bottom: 10px !important;
 
 
 
 
 
 
 
 
12
  }
13
 
14
+ .tab-buttons button.selected {
15
+ border-color: #f4f4f5 !important;
16
+ background: #3f3f46 !important;
17
+ color: #f4f4f5 !important;
18
  }
19
 
20
+ #citation-button textarea {
21
+ font-family: monospace !important;
 
22
  }
23
 
24
+ .leaderboard-container {
25
+ margin-top: 20px;
26
  }
27
 
28
+ .category-header {
29
+ font-weight: bold;
30
+ background-color: #f5f5f5;
31
+ padding: 10px;
32
+ margin-top: 15px;
33
+ border-radius: 5px;
34
  }
35
 
36
+ .metric-name {
37
+ font-weight: bold;
38
+ color: #a1a1aa !important;
 
 
 
39
  }
40
 
41
+ .model-name {
42
+ font-weight: bold;
 
43
  }
44
 
45
+ .model-link:hover {
46
+ text-decoration: underline;
47
+ color: #ffffff !important;
 
48
  }
49
 
50
+ .version-selector {
51
+ margin: 0 !important;
52
+ padding: 5px;
53
+ border-radius: 5px;
 
 
54
  }
55
 
56
+ .version-selector label {
57
+ font-weight: bold;
58
+ color: #f4f4f5 !important;
59
  }
60
 
61
+ .version-selector select {
62
+ border-color: #3f3f46 !important;
63
+ border-radius: 5px;
 
 
 
64
  }
65
 
66
+ /* Make sure the version selector is properly aligned with refresh button */
67
+ .version-selector > .block {
68
+ padding: 0 !important;
 
 
69
  }
70
 
71
+ .version-selector > .block > .wrap {
72
+ position: relative;
73
+ top: -5px;
74
  }
75
 
76
+ /* Force background/border for common layout containers */
77
+ .gradio-row > .block,
78
+ .gradio-column > .block,
79
+ .form,
80
+ .panel {
81
+ /* background: #18181b !important; */ /* Removed background override */
82
+ border-color: #27272a80 !important; /* Made border color semi-transparent */
83
+ border-width: 1px !important; /* Ensure border is visible */
84
+ border-style: solid !important;
85
  }
86
 
87
+ /* Target the specific file upload component area */
88
+ .gradio-file .wrap {
89
+ /* background: #18181b !important; */ /* Removed background override */
90
+ border-color: #27272a !important;
91
  }
92
 
93
+ #refresh-button {
94
+ margin-top: 5px !important;
95
+ margin-bottom: 5px !important;
 
 
 
 
 
96
  }
97
  """
 
 
 
 
src/display/formatting.py CHANGED
@@ -1,182 +1,71 @@
1
  """
2
- Formatting utilities for display components
3
  """
4
 
5
- import re
6
- from typing import List, Dict, Any, Optional
7
- from datetime import datetime, timezone
8
 
9
- def format_score(score: float, precision: int = 3) -> str:
10
- """Format a score with specified precision"""
11
- if isinstance(score, (int, float)):
12
- return f"{score:.{precision}f}"
13
- return str(score)
14
 
15
- def format_percentage(score: float, precision: int = 1) -> str:
16
- """Format a score as percentage"""
17
- if isinstance(score, (int, float)):
18
- return f"{score * 100:.{precision}f}%"
19
- return str(score)
20
 
21
- def format_model_name(name: str) -> str:
22
- """Format model name for display"""
23
- # Remove common prefixes and make more readable
24
- name = name.strip()
25
- if "/" in name:
26
- org, model = name.split("/", 1)
27
- return f"<span style='color: var(--text-secondary); font-size: 0.9em;'>{org}/</span><strong>{model}</strong>"
28
- return f"<strong>{name}</strong>"
29
 
30
- def format_timestamp(timestamp: str) -> str:
31
- """Format timestamp for display"""
32
- try:
33
- dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
34
- return dt.strftime("%Y-%m-%d %H:%M UTC")
35
- except:
36
- return timestamp
37
 
38
- def format_ip_address(ip: str) -> str:
39
- """Format IP address for display (partial masking)"""
40
- if not ip:
41
- return "Unknown"
42
-
43
- # Mask part of IP for privacy
44
- parts = ip.split(".")
45
- if len(parts) == 4:
46
- return f"{parts[0]}.{parts[1]}.{parts[2]}.xxx"
47
- return "xxx.xxx.xxx.xxx"
48
 
49
- def format_metric_score(score: int, metric_name: str) -> str:
50
- """Format metric score with color coding"""
51
- if not isinstance(score, (int, float)):
52
- return str(score)
53
-
54
- # Color coding based on score
55
- if score >= 8:
56
- color = "#ffffff" # White
57
- elif score >= 6:
58
- color = "#d0d0d0" # Light gray
59
- elif score >= 4:
60
- color = "#a0a0a0" # Gray
61
- else:
62
- color = "#707070" # Dark gray
63
-
64
- return f"<span style='color: {color}; font-weight: 600;'>{score}</span>"
65
 
66
- def format_language_badge(language: str) -> str:
67
- """Format programming language as a badge"""
68
- if not language or language == "All":
69
- return language
70
-
71
- # Language-specific colors
72
- colors = {
73
- "Python": "#3776ab",
74
- "JavaScript": "#f7df1e",
75
- "Java": "#ed8b00",
76
- "C++": "#00599c",
77
- "C#": "#239120",
78
- "Go": "#00add8",
79
- "Rust": "#ce422b",
80
- "TypeScript": "#3178c6",
81
- "PHP": "#777bb4",
82
- "Ruby": "#cc342d",
83
- "Swift": "#fa7343",
84
- "Kotlin": "#7f52ff",
85
- "Scala": "#dc322f",
86
- "R": "#276dc3",
87
- "MATLAB": "#e16737"
88
- }
89
-
90
- color = colors.get(language, "#6c757d")
91
- return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{language}</span>"
92
 
93
- def format_taxonomy_badge(category: str) -> str:
94
- """Format taxonomy category as a badge"""
95
- if not category or category == "All":
96
- return category
97
-
98
- # Category-specific colors
99
- colors = {
100
- "Bug Detection": "#dc3545",
101
- "Code Style": "#6f42c1",
102
- "Performance": "#fd7e14",
103
- "Security": "#e83e8c",
104
- "Maintainability": "#ffffff",
105
- "Documentation": "#17a2b8",
106
- "Testing": "#ffffff",
107
- "Architecture": "#6c757d",
108
- "Best Practices": "#007bff",
109
- "Refactoring": "#ffc107"
110
- }
111
-
112
- color = colors.get(category, "#6c757d")
113
- return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{category}</span>"
114
 
115
- def format_comment_language_flag(language: str) -> str:
116
- """Format comment language with flag emoji"""
117
- if not language or language == "All":
118
- return language
119
-
120
- # Language-specific flags
121
- flags = {
122
- "English": "🇺🇸",
123
- "Chinese": "🇨🇳",
124
- "Spanish": "🇪🇸",
125
- "French": "🇫🇷",
126
- "German": "🇩🇪",
127
- "Japanese": "🇯🇵",
128
- "Korean": "🇰🇷",
129
- "Russian": "🇷🇺",
130
- "Portuguese": "🇵🇹",
131
- "Italian": "🇮🇹",
132
- "Dutch": "🇳🇱"
133
- }
134
-
135
- flag = flags.get(language, "🌐")
136
- return f"{flag} {language}"
137
 
138
- def sanitize_html(text: str) -> str:
139
- """Sanitize HTML content to prevent XSS"""
140
- if not isinstance(text, str):
141
- return str(text)
142
-
143
- # Remove potentially dangerous HTML tags
144
- text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
145
- text = re.sub(r'<iframe[^>]*>.*?</iframe>', '', text, flags=re.DOTALL | re.IGNORECASE)
146
- text = re.sub(r'on\w+="[^"]*"', '', text, flags=re.IGNORECASE)
147
- text = re.sub(r'on\w+=\'[^\']*\'', '', text, flags=re.IGNORECASE)
148
-
149
- return text
150
 
151
- def truncate_text(text: str, max_length: int = 50) -> str:
152
- """Truncate text with ellipsis"""
153
- if not isinstance(text, str):
154
- text = str(text)
155
-
156
- if len(text) <= max_length:
157
- return text
158
-
159
- return text[:max_length-3] + "..."
160
 
161
- def format_table_cell(value: Any, column_name: str) -> str:
162
- """Format table cell based on column type"""
163
- if value is None:
164
- return "N/A"
165
-
166
- # Handle different column types
167
- if column_name.lower() in ["bleu", "pass@1", "pass@5", "pass@10"]:
168
- return format_percentage(value)
169
- elif column_name.lower() == "model":
170
- return format_model_name(str(value))
171
- elif column_name.lower() == "programming language":
172
- return format_language_badge(str(value))
173
- elif column_name.lower() == "comment language":
174
- return format_comment_language_flag(str(value))
175
- elif column_name.lower() == "taxonomy":
176
- return format_taxonomy_badge(str(value))
177
- elif column_name.lower() in ["readability", "relevance", "explanation clarity",
178
- "problem identification", "actionability", "completeness",
179
- "specificity", "contextual adequacy", "consistency", "brevity"]:
180
- return format_metric_score(value, column_name.lower())
181
- else:
182
- return sanitize_html(str(value))
 
1
  """
2
+ Formatting utilities for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ import pandas as pd
6
+ import numpy as np
 
7
 
 
 
 
 
 
8
 
9
+ def make_clickable_model(model_name: str) -> str:
10
+ """
11
+ Create a clickable link for a model name.
12
+ """
13
+ return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
14
 
 
 
 
 
 
 
 
 
15
 
16
+ def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
17
+ """
18
+ Check if a row has no NaN values in the specified columns.
19
+ """
20
+ return ~df[columns].isna().any(axis=1)
 
 
21
 
 
 
 
 
 
 
 
 
 
 
22
 
23
+ def format_percentage(value: float) -> str:
24
+ """
25
+ Format a value as a percentage.
26
+ """
27
+ if pd.isna(value):
28
+ return "N/A"
29
+ return f"{value * 100:.2f}%"
 
 
 
 
 
 
 
 
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ def format_number(value: float, precision: int = 2) -> str:
33
+ """
34
+ Format a number with specified precision.
35
+ """
36
+ if pd.isna(value):
37
+ return "N/A"
38
+ return f"{value:.{precision}f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ def styled_message(message: str) -> str:
42
+ """
43
+ Format a success message with styling.
44
+ """
45
+ return f"""
46
+ <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
47
+ {message}
48
+ </div>
49
+ """
 
 
 
50
 
 
 
 
 
 
 
 
 
 
51
 
52
+ def styled_warning(message: str) -> str:
53
+ """
54
+ Format a warning message with styling.
55
+ """
56
+ return f"""
57
+ <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
58
+ ⚠️ {message}
59
+ </div>
60
+ """
61
+
62
+
63
+ def styled_error(message: str) -> str:
64
+ """
65
+ Format an error message with styling.
66
+ """
67
+ return f"""
68
+ <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
69
+ {message}
70
+ </div>
71
+ """
 
 
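A short, illustrative usage sketch for the helpers above (assuming the repository root is on PYTHONPATH; the expected outputs follow directly from the function bodies):

from src.display.formatting import format_number, format_percentage, make_clickable_model

print(make_clickable_model("my-org/my-model"))   # <a href="https://huggingface.co/my-org/my-model" ...>my-org/my-model</a>
print(format_percentage(0.8532))                 # 85.32%
print(format_percentage(float("nan")))           # N/A
print(format_number(3.14159, precision=3))       # 3.142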
src/display/utils.py CHANGED
@@ -1,292 +1,415 @@
1
  """
2
- Display utilities for the CodeReview Leaderboard
3
  """
4
 
5
- from typing import List, Dict, Any, Optional, Tuple
6
- import json
7
- from datetime import datetime, timezone
8
- from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
9
- from src.display.formatting import format_table_cell, format_timestamp
10
-
11
- def filter_leaderboard_data(
12
- data: List[Dict],
13
- programming_language: str = "All",
14
- comment_language: str = "All",
15
- taxonomy_category: str = "All",
16
- sort_by: str = "llm_pass_1",
17
- sort_order: str = "desc"
18
- ) -> List[Dict]:
19
- """Filter and sort leaderboard data based on criteria"""
20
-
21
- if not data:
22
- return []
23
-
24
- # Apply filters
25
- filtered_data = data.copy()
26
-
27
- if programming_language != "All":
28
- filtered_data = [
29
- entry for entry in filtered_data
30
- if entry.get("programming_language", "").lower() == programming_language.lower()
31
- ]
32
-
33
- if comment_language != "All":
34
- filtered_data = [
35
- entry for entry in filtered_data
36
- if entry.get("comment_language", "").lower() == comment_language.lower()
37
- ]
38
-
39
- if taxonomy_category != "All":
40
- filtered_data = [
41
- entry for entry in filtered_data
42
- if entry.get("taxonomy_category", "").lower() == taxonomy_category.lower()
43
- ]
44
-
45
- # Sort data
46
- reverse = sort_order.lower() == "desc"
47
-
48
- try:
49
- if sort_by in ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]:
50
- filtered_data.sort(key=lambda x: x.get(sort_by, 0), reverse=reverse)
51
- elif sort_by in QUALITY_METRICS:
52
- filtered_data.sort(key=lambda x: x.get("metrics", {}).get(sort_by, 0), reverse=reverse)
53
- else:
54
- filtered_data.sort(key=lambda x: str(x.get(sort_by, "")), reverse=reverse)
55
- except Exception as e:
56
- print(f"Error sorting data: {e}")
57
- # Default sort by pass@1
58
- filtered_data.sort(key=lambda x: x.get("llm_pass_1", 0), reverse=True)
59
-
60
- return filtered_data
61
-
62
- def get_main_leaderboard_data(
63
- data: List[Dict],
64
- programming_language: str = "All",
65
- comment_language: str = "All",
66
- taxonomy_category: str = "All",
67
- sort_by: str = "llm_pass_1"
68
- ) -> List[List[str]]:
69
- """Get formatted main leaderboard table data"""
70
-
71
- filtered_data = filter_leaderboard_data(
72
- data, programming_language, comment_language, taxonomy_category, sort_by
73
- )
74
-
75
- table_rows = []
76
- for entry in filtered_data:
77
- row = [
78
- format_table_cell(entry.get("model_name", ""), "model"),
79
- format_table_cell(entry.get("programming_language", ""), "programming language"),
80
- format_table_cell(entry.get("comment_language", ""), "comment language"),
81
- format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
82
- format_table_cell(entry.get("bleu", 0), "bleu"),
83
- format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
84
- format_table_cell(entry.get("llm_pass_5", 0), "pass@5"),
85
- format_table_cell(entry.get("llm_pass_10", 0), "pass@10"),
86
- ]
87
- table_rows.append(row)
88
-
89
- return table_rows
90
-
91
- def get_quality_metrics_data(
92
- data: List[Dict],
93
- programming_language: str = "All",
94
- comment_language: str = "All",
95
- taxonomy_category: str = "All",
96
- sort_by: str = "llm_pass_1"
97
- ) -> List[List[str]]:
98
- """Get formatted quality metrics table data"""
99
-
100
- filtered_data = filter_leaderboard_data(
101
- data, programming_language, comment_language, taxonomy_category, sort_by
102
- )
103
-
104
- table_rows = []
105
- for entry in filtered_data:
106
- metrics = entry.get("metrics", {})
107
- row = [format_table_cell(entry.get("model_name", ""), "model")]
108
-
109
- for metric in QUALITY_METRICS:
110
- formatted_value = format_table_cell(metrics.get(metric, 0), metric.replace("_", " "))
111
- row.append(formatted_value)
112
-
113
- table_rows.append(row)
114
-
115
- return table_rows
116
-
117
- def get_submission_history_data(
118
- data: List[Dict],
119
- programming_language: str = "All",
120
- comment_language: str = "All",
121
- taxonomy_category: str = "All",
122
- limit: int = 50
123
- ) -> List[List[str]]:
124
- """Get formatted submission history data"""
125
-
126
- filtered_data = filter_leaderboard_data(
127
- data, programming_language, comment_language, taxonomy_category, "submission_date", "desc"
128
- )
129
-
130
- # Limit results
131
- filtered_data = filtered_data[:limit]
132
-
133
- table_rows = []
134
- for entry in filtered_data:
135
- row = [
136
- format_table_cell(entry.get("model_name", ""), "model"),
137
- format_table_cell(entry.get("programming_language", ""), "programming language"),
138
- format_table_cell(entry.get("comment_language", ""), "comment language"),
139
- format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
140
- format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
141
- format_timestamp(entry.get("submission_date", "")),
142
- entry.get("submission_ip", "").split(".")[0] + ".xxx.xxx.xxx" if entry.get("submission_ip") else "Unknown"
143
- ]
144
- table_rows.append(row)
145
-
146
- return table_rows
147
 
148
- def get_statistics_summary(data: List[Dict]) -> Dict[str, Any]:
149
- """Get summary statistics for the leaderboard"""
150
-
151
- if not data:
152
- return {
153
- "total_models": 0,
154
- "total_submissions": 0,
155
- "avg_pass_1": 0,
156
- "best_model": "None",
157
- "languages_covered": 0,
158
- "categories_covered": 0
159
- }
160
-
161
- # Calculate statistics
162
- total_models = len(set(entry.get("model_name", "") for entry in data))
163
- total_submissions = len(data)
164
-
165
- pass_1_scores = [entry.get("llm_pass_1", 0) for entry in data if entry.get("llm_pass_1") is not None]
166
- avg_pass_1 = sum(pass_1_scores) / len(pass_1_scores) if pass_1_scores else 0
167
-
168
- best_entry = max(data, key=lambda x: x.get("llm_pass_1", 0)) if data else None
169
- best_model = best_entry.get("model_name", "None") if best_entry else "None"
170
-
171
- languages_covered = len(set(entry.get("programming_language", "") for entry in data if entry.get("programming_language")))
172
- categories_covered = len(set(entry.get("taxonomy_category", "") for entry in data if entry.get("taxonomy_category")))
173
-
174
- return {
175
- "total_models": total_models,
176
- "total_submissions": total_submissions,
177
- "avg_pass_1": avg_pass_1,
178
- "best_model": best_model,
179
- "languages_covered": languages_covered,
180
- "categories_covered": categories_covered
181
- }
182
-
183
- def validate_submission_data(data: Dict[str, Any]) -> Tuple[bool, str]:
184
- """Validate submission data"""
185
-
186
- required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
187
-
188
- # Check required fields
189
- for field in required_fields:
190
- if not data.get(field):
191
- return False, f"Missing required field: {field}"
192
-
193
- # Validate scores
194
- score_fields = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
195
- for field in score_fields:
196
- value = data.get(field)
197
- if value is None:
198
- return False, f"Missing score: {field}"
199
- if not isinstance(value, (int, float)):
200
- return False, f"Invalid score format: {field}"
201
- if not 0 <= value <= 1:
202
- return False, f"Score out of range (0-1): {field}"
203
-
204
- # Validate metrics
205
- metrics = data.get("metrics", {})
206
- for metric in QUALITY_METRICS:
207
- value = metrics.get(metric)
208
- if value is None:
209
- return False, f"Missing metric: {metric}"
210
- if not isinstance(value, (int, float)):
211
- return False, f"Invalid metric format: {metric}"
212
- if not 0 <= value <= 10:
213
- return False, f"Metric out of range (0-10): {metric}"
214
-
215
- # Validate language and category choices
216
- if data.get("programming_language") not in PROGRAMMING_LANGUAGES:
217
- return False, "Invalid programming language"
218
-
219
- if data.get("comment_language") not in COMMENT_LANGUAGES:
220
- return False, "Invalid comment language"
221
-
222
- if data.get("taxonomy_category") not in TAXONOMY_CATEGORIES:
223
- return False, "Invalid taxonomy category"
224
-
225
- return True, "Valid submission"
226
 
227
- def get_leaderboard_insights(data: List[Dict]) -> Dict[str, Any]:
228
- """Get insights and trends from leaderboard data"""
229
-
230
- if not data:
231
- return {}
232
-
233
- # Language performance analysis
234
- lang_performance = {}
235
- for lang in PROGRAMMING_LANGUAGES[1:]: # Skip "All"
236
- lang_data = [entry for entry in data if entry.get("programming_language") == lang]
237
- if lang_data:
238
- avg_score = sum(entry.get("llm_pass_1", 0) for entry in lang_data) / len(lang_data)
239
- lang_performance[lang] = {
240
- "avg_score": avg_score,
241
- "model_count": len(lang_data),
242
- "best_model": max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
243
- }
244
-
245
- # Category performance analysis
246
- category_performance = {}
247
- for category in TAXONOMY_CATEGORIES[1:]: # Skip "All"
248
- cat_data = [entry for entry in data if entry.get("taxonomy_category") == category]
249
- if cat_data:
250
- avg_score = sum(entry.get("llm_pass_1", 0) for entry in cat_data) / len(cat_data)
251
- category_performance[category] = {
252
- "avg_score": avg_score,
253
- "model_count": len(cat_data),
254
- "best_model": max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
255
- }
256
-
257
- return {
258
- "language_performance": lang_performance,
259
- "category_performance": category_performance,
260
- "top_performers": sorted(data, key=lambda x: x.get("llm_pass_1", 0), reverse=True)[:5]
261
- }
262
-
263
- def export_leaderboard_data(data: List[Dict], format_type: str = "json") -> str:
264
- """Export leaderboard data in specified format"""
265
-
266
- if format_type.lower() == "json":
267
- return json.dumps(data, indent=2, ensure_ascii=False)
268
- elif format_type.lower() == "csv":
269
- # Simple CSV export
270
- if not data:
271
- return ""
272
-
273
- # Get headers
274
- headers = ["model_name", "programming_language", "comment_language", "taxonomy_category",
275
- "bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
276
- headers.extend(QUALITY_METRICS)
277
-
278
- lines = [",".join(headers)]
279
-
280
- for entry in data:
281
- row = []
282
- for header in headers:
283
- if header in QUALITY_METRICS:
284
- value = entry.get("metrics", {}).get(header, "")
285
- else:
286
- value = entry.get(header, "")
287
- row.append(str(value))
288
- lines.append(",".join(row))
289
-
290
- return "\n".join(lines)
291
- else:
292
- return "Unsupported format"
 
 
 
 
1
  """
2
+ Utility classes and functions for the CodeReview Bench Leaderboard display.
3
  """
4
 
5
+ from dataclasses import dataclass, field, fields
6
+ from enum import Enum, auto
7
+ from typing import List, Optional
 
 
 
 
8
 
 
 
 
9
 
10
+ class Mode(Enum):
11
+ """Inference mode for the review model."""
12
+ CoT = auto() # Chain of Thought
13
+ Strict = auto()
14
+
15
+ def __str__(self):
16
+ """String representation of the mode."""
17
+ return self.name
18
+
19
+
20
+ class ModelType(Enum):
21
+ """Model types for the leaderboard."""
22
+ Unknown = auto()
23
+ OpenSource = auto()
24
+ ClosedSource = auto()
25
+ API = auto()
26
+
27
+ def to_str(self, separator: str = "-") -> str:
28
+ """Convert enum to string with separator."""
29
+ if self == ModelType.Unknown:
30
+ return "Unknown"
31
+ elif self == ModelType.OpenSource:
32
+ return f"Open{separator}Source"
33
+ elif self == ModelType.ClosedSource:
34
+ return f"Closed{separator}Source"
35
+ elif self == ModelType.API:
36
+ return "API"
37
+ return "Unknown"
38
+
39
+
40
+ class ReviewModelType(str, Enum):
41
+ """Review model types for the leaderboard."""
42
+ GPT_4 = "gpt-4"
43
+ GPT_3_5 = "gpt-3.5-turbo"
44
+ CLAUDE = "claude"
45
+ LLAMA = "llama"
46
+ GEMINI = "gemini"
47
+ CUSTOM = "custom"
48
+
49
+ def __str__(self):
50
+ """String representation of the review model type."""
51
+ return self.value
52
+
53
+
54
+ class Precision(Enum):
55
+ """Model precision types."""
56
+ Unknown = auto()
57
+ float16 = auto()
58
+ bfloat16 = auto()
59
+ float32 = auto()
60
+ int8 = auto()
61
+ int4 = auto()
62
+ NA = auto()
63
+
64
+ def __str__(self):
65
+ """String representation of the precision type."""
66
+ return self.name
67
+
68
+
69
+ class WeightType(Enum):
70
+ """Model weight types."""
71
+ Original = auto()
72
+ Delta = auto()
73
+ Adapter = auto()
74
+
75
+ def __str__(self):
76
+ """String representation of the weight type."""
77
+ return self.name
78
+
79
+
80
+ @dataclass
81
+ class ColumnInfo:
82
+ """Information about a column in the leaderboard."""
83
+ name: str
84
+ display_name: str
85
+ type: str = "text"
86
+ hidden: bool = False
87
+ never_hidden: bool = False
88
+ displayed_by_default: bool = True
89
+
90
+
91
+ @dataclass
92
+ class CodeReviewBenchColumn:
93
+ """Columns for the CodeReview Bench leaderboard."""
94
+ # Core metadata
95
+ model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
96
+ name="model_name",
97
+ display_name="Model",
98
+ never_hidden=True,
99
+ displayed_by_default=True
100
+ ))
101
+ mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
102
+ name="mode",
103
+ display_name="Mode",
104
+ displayed_by_default=True
105
+ ))
106
+ model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
107
+ name="model_type",
108
+ display_name="Access_Type",
109
+ displayed_by_default=True
110
+ ))
111
+ submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
112
+ name="submission_date",
113
+ display_name="Submission_Date",
114
+ displayed_by_default=False
115
+ ))
116
+ version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
117
+ name="version",
118
+ display_name="Version",
119
+ displayed_by_default=False
120
+ ))
121
+ review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
122
+ name="review_model_type",
123
+ display_name="Type",
124
+ displayed_by_default=False
125
+ ))
126
+ base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
127
+ name="base_model",
128
+ display_name="Base Model",
129
+ displayed_by_default=False
130
+ ))
131
+ revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
132
+ name="revision",
133
+ display_name="Revision",
134
+ displayed_by_default=False
135
+ ))
136
+ precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
137
+ name="precision",
138
+ display_name="Precision",
139
+ displayed_by_default=False
140
+ ))
141
+ weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
142
+ name="weight_type",
143
+ display_name="Weight Type",
144
+ displayed_by_default=False
145
+ ))
146
+
147
+ # LLM-based multimetric scores
148
+ readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
149
+ name="readability",
150
+ display_name="Readability",
151
+ type="number",
152
+ displayed_by_default=True
153
+ ))
154
+ relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
155
+ name="relevance",
156
+ display_name="Relevance",
157
+ type="number",
158
+ displayed_by_default=True
159
+ ))
160
+ explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
161
+ name="explanation_clarity",
162
+ display_name="Explanation_Clarity",
163
+ type="number",
164
+ displayed_by_default=True
165
+ ))
166
+ problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
167
+ name="problem_identification",
168
+ display_name="Problem_Identification",
169
+ type="number",
170
+ displayed_by_default=True
171
+ ))
172
+ actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
173
+ name="actionability",
174
+ display_name="Actionability",
175
+ type="number",
176
+ displayed_by_default=True
177
+ ))
178
+ completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
179
+ name="completeness",
180
+ display_name="Completeness",
181
+ type="number",
182
+ displayed_by_default=True
183
+ ))
184
+ specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
185
+ name="specificity",
186
+ display_name="Specificity",
187
+ type="number",
188
+ displayed_by_default=True
189
+ ))
190
+ contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
191
+ name="contextual_adequacy",
192
+ display_name="Contextual_Adequacy",
193
+ type="number",
194
+ displayed_by_default=True
195
+ ))
196
+ consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
197
+ name="consistency",
198
+ display_name="Consistency",
199
+ type="number",
200
+ displayed_by_default=True
201
+ ))
202
+ brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
203
+ name="brevity",
204
+ display_name="Brevity",
205
+ type="number",
206
+ displayed_by_default=True
207
+ ))
208
+
209
+ # LLM-based exact-match metrics
210
+ pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
211
+ name="pass_at_1",
212
+ display_name="Pass@1",
213
+ type="number",
214
+ displayed_by_default=True
215
+ ))
216
+ pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
217
+ name="pass_at_5",
218
+ display_name="Pass@5",
219
+ type="number",
220
+ displayed_by_default=True
221
+ ))
222
+ pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
223
+ name="pass_at_10",
224
+ display_name="Pass@10",
225
+ type="number",
226
+ displayed_by_default=True
227
+ ))
228
+ bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
229
+ name="bleu_at_10",
230
+ display_name="BLEU@10",
231
+ type="number",
232
+ displayed_by_default=True
233
+ ))
234
+
235
+ # Overall aggregated metrics
236
+ overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
237
+ name="overall_score",
238
+ display_name="Overall_Score",
239
+ type="number",
240
+ displayed_by_default=True
241
+ ))
242
+ multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
243
+ name="multimetric_average",
244
+ display_name="Multimetric_Average",
245
+ type="number",
246
+ displayed_by_default=True
247
+ ))
248
+ exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
249
+ name="exact_match_average",
250
+ display_name="Exact_Match_Average",
251
+ type="number",
252
+ displayed_by_default=True
253
+ ))
254
+ total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
255
+ name="total_evaluations",
256
+ display_name="Total_Evaluations",
257
+ type="number",
258
+ displayed_by_default=True
259
+ ))
260
+
261
+ # Language-specific metrics (Russian)
262
+ ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
263
+ name="ru_readability",
264
+ display_name="RU_Readability",
265
+ type="number",
266
+ displayed_by_default=False
267
+ ))
268
+ ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
269
+ name="ru_relevance",
270
+ display_name="RU_Relevance",
271
+ type="number",
272
+ displayed_by_default=False
273
+ ))
274
+ ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
275
+ name="ru_overall_score",
276
+ display_name="RU_Overall_Score",
277
+ type="number",
278
+ displayed_by_default=False
279
+ ))
280
+
281
+ # Language-specific metrics (English)
282
+ en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
283
+ name="en_readability",
284
+ display_name="EN_Readability",
285
+ type="number",
286
+ displayed_by_default=False
287
+ ))
288
+ en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
289
+ name="en_relevance",
290
+ display_name="EN_Relevance",
291
+ type="number",
292
+ displayed_by_default=False
293
+ ))
294
+ en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
295
+ name="en_overall_score",
296
+ display_name="EN_Overall_Score",
297
+ type="number",
298
+ displayed_by_default=False
299
+ ))
300
+
301
+
302
+ # Create instances for easy access
303
+ CODEREVIEW_COLUMN = CodeReviewBenchColumn()
304
+
305
+ # Extract column lists for different views
306
+ COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
307
+ DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
308
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
309
+
310
+ # Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
311
+ def reorder_display_cols():
312
+ cols = DISPLAY_COLS
313
+ if 'model_name' in cols and 'mode' in cols:
314
+ cols.remove('mode')
315
+ model_name_index = cols.index('model_name')
316
+ cols.insert(model_name_index + 1, 'mode')
317
+ return cols
318
+ DISPLAY_COLS = reorder_display_cols()
319
+
320
+ METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
321
+ if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
322
+ HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
323
+ if getattr(CODEREVIEW_COLUMN, f.name).hidden]
324
+ NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
325
+ if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
326
+
327
+ # Categories for CodeReview Bench (Programming Languages)
328
+ CATEGORIES = [
329
+ 'Python',
330
+ 'JavaScript',
331
+ 'Java',
332
+ 'C++',
333
+ 'C#',
334
+ 'TypeScript',
335
+ 'Go',
336
+ 'Rust',
337
+ 'Swift',
338
+ 'Kotlin',
339
+ 'Ruby',
340
+ 'PHP',
341
+ 'C',
342
+ 'Scala',
343
+ 'R',
344
+ 'Dart',
345
+ 'Other'
346
+ ]
347
+
348
+ # Language taxonomies for CodeReview Bench
349
+ COMMENT_LANGUAGES = [
350
+ 'ru', # Russian
351
+ 'en' # English
352
+ ]
353
+
354
+ # Example categories
355
+ EXAMPLE_CATEGORIES = [
356
+ 'Bug_Fix',
357
+ 'Code_Style',
358
+ 'Performance',
359
+ 'Security',
360
+ 'Refactoring',
361
+ 'Documentation',
362
+ 'Testing',
363
+ 'Architecture',
364
+ 'Other'
365
+ ]
366
+
367
+ # Metrics for CodeReview Bench
368
+ MULTIMETRIC_METRICS = [
369
+ "readability",
370
+ "relevance",
371
+ "explanation_clarity",
372
+ "problem_identification",
373
+ "actionability",
374
+ "completeness",
375
+ "specificity",
376
+ "contextual_adequacy",
377
+ "consistency",
378
+ "brevity"
379
+ ]
380
+
381
+ EXACT_MATCH_METRICS = [
382
+ "pass_at_1",
383
+ "pass_at_5",
384
+ "pass_at_10",
385
+ "bleu_at_10"
386
+ ]
387
+
388
+ def get_all_column_choices():
389
+ """
390
+ Get all available column choices for the multiselect dropdown.
391
+
392
+ Returns:
393
+ List of tuples with (column_name, display_name) for all columns.
394
+ """
395
+ column_choices = []
396
+
397
+ default_visible_columns = get_default_visible_columns()
398
+
399
+ for f in fields(CODEREVIEW_COLUMN):
400
+ column_info = getattr(CODEREVIEW_COLUMN, f.name)
401
+ # Create a tuple with both the internal name and display name
402
+ if column_info.name not in default_visible_columns:
403
+ column_choices.append((column_info.name, column_info.display_name))
404
+
405
+ return column_choices
406
+
407
+ def get_default_visible_columns():
408
+ """
409
+ Get the list of column names that should be visible by default.
410
+
411
+ Returns:
412
+ List of column names that are displayed by default.
413
+ """
414
+ return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
415
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
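For reference, a minimal sketch (not part of this commit) of how the column helpers defined above might be consumed; every name used here comes from the code in this file.

# Illustrative only: exercising the column helpers from src/display/utils.py.
from src.display.utils import (
    DISPLAY_COLS, METRIC_COLS,
    get_all_column_choices, get_default_visible_columns,
)

print(DISPLAY_COLS)                      # default-visible columns, with "mode" right after "model_name"
extra = get_all_column_choices()         # [(internal_name, display_name), ...] for hidden-by-default columns
visible = get_default_visible_columns()  # column names shown by default
assert "overall_score" in METRIC_COLS    # numeric columns are the sorting/aggregation candidates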
src/envs.py CHANGED
@@ -1,106 +1,27 @@
1
- """
2
- Environment configuration and constants
3
- """
4
-
5
  import os
6
- from pathlib import Path
7
-
8
- # Data paths
9
- DATA_DIR = Path("data")
10
- LEADERBOARD_PATH = DATA_DIR / "leaderboard_data.json"
11
- SUBMISSIONS_PATH = DATA_DIR / "submissions.json"
12
-
13
- # Create data directory if it doesn't exist
14
- DATA_DIR.mkdir(exist_ok=True)
15
-
16
- # Programming languages supported
17
- PROGRAMMING_LANGUAGES = [
18
- "All",
19
- "Python",
20
- "JavaScript",
21
- "Java",
22
- "C++",
23
- "C#",
24
- "Go",
25
- "Rust",
26
- "TypeScript",
27
- "PHP",
28
- "Ruby",
29
- "Swift",
30
- "Kotlin",
31
- "Scala",
32
- "R",
33
- "MATLAB",
34
- "Other"
35
- ]
36
 
37
- # Comment languages supported
38
- COMMENT_LANGUAGES = [
39
- "All",
40
- "English",
41
- "Chinese",
42
- "Spanish",
43
- "French",
44
- "German",
45
- "Japanese",
46
- "Korean",
47
- "Russian",
48
- "Portuguese",
49
- "Italian",
50
- "Dutch",
51
- "Other"
52
- ]
53
 
54
- # Taxonomy categories
55
- TAXONOMY_CATEGORIES = [
56
- "All",
57
- "Bug Detection",
58
- "Code Style",
59
- "Performance",
60
- "Security",
61
- "Maintainability",
62
- "Documentation",
63
- "Testing",
64
- "Architecture",
65
- "Best Practices",
66
- "Refactoring",
67
- "Other"
68
- ]
69
 
70
- # Quality metrics
71
- QUALITY_METRICS = [
72
- "readability",
73
- "relevance",
74
- "explanation_clarity",
75
- "problem_identification",
76
- "actionability",
77
- "completeness",
78
- "specificity",
79
- "contextual_adequacy",
80
- "consistency",
81
- "brevity"
82
- ]
83
 
84
- # Table headers
85
- MAIN_HEADERS = ["Model", "Programming Language", "Comment Language", "Taxonomy", "BLEU", "Pass@1", "Pass@5", "Pass@10"]
 
86
 
87
- QUALITY_HEADERS = ["Model"] + [metric.replace("_", " ").title() for metric in QUALITY_METRICS]
 
88
 
89
- # Default data
90
- DEFAULT_DATA = [{
91
- "model_name": "example/model",
92
- "programming_language": "Python",
93
- "comment_language": "English",
94
- "taxonomy_category": "Bug Detection",
95
- "bleu": 0.5,
96
- "llm_pass_1": 0.5,
97
- "llm_pass_5": 0.5,
98
- "llm_pass_10": 0.5,
99
- "metrics": {
100
- "readability": 5, "relevance": 5, "explanation_clarity": 5,
101
- "problem_identification": 5, "actionability": 5, "completeness": 5,
102
- "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
103
- },
104
- "submission_ip": "127.0.0.1",
105
- "submission_date": "2024-01-01T00:00:00Z"
106
- }]
 
 
 
 
 
1
  import os
2
+ from huggingface_hub import HfApi
3
+ from dotenv import load_dotenv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ # Load environment variables
6
+ load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Hugging Face configuration
9
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
10
+ OWNER = os.environ.get("OWNER", "codereview-bench") # Change to your org
11
+ SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
12
+ ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
13
+ ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
 
 
 
 
 
 
 
 
 
14
 
15
+ # Repository IDs
16
+ REPO_ID = f"{OWNER}/codereview-bench"
17
+ RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results")
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Cache paths
20
+ CACHE_PATH = os.getenv("HF_HOME", ".")
21
+ DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
22
 
23
+ # Local data paths
24
+ LEADERBOARD_FILE = os.path.join(DATA_PATH, "leaderboard.json")
25
 
26
+ # HF API instance
27
+ API = HfApi(token=TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard/processor.py CHANGED
@@ -1,306 +1,271 @@
1
  """
2
- Leaderboard data processor for CodeReview Leaderboard
3
  """
4
 
5
  import json
6
- import traceback
7
- from typing import List, Dict, Any, Optional
8
- from datetime import datetime, timezone, timedelta
9
- from pathlib import Path
10
- from src.envs import LEADERBOARD_PATH, SUBMISSIONS_PATH, DEFAULT_DATA
11
- from src.display.utils import validate_submission_data, get_statistics_summary
12
-
13
- class LeaderboardProcessor:
14
- """Handles all leaderboard data operations"""
 
 
 
 
 
 
15
 
16
- def __init__(self):
17
- self.leaderboard_path = LEADERBOARD_PATH
18
- self.submissions_path = SUBMISSIONS_PATH
19
- self._ensure_data_files()
20
 
21
- def _ensure_data_files(self):
22
- """Ensure data files exist with default data"""
23
- if not self.leaderboard_path.exists():
24
- self.save_leaderboard_data(DEFAULT_DATA)
25
 
26
- if not self.submissions_path.exists():
27
- self.save_submission_log([])
 
 
 
 
 
 
28
 
29
- def load_leaderboard_data(self) -> List[Dict]:
30
- """Load leaderboard data from storage"""
31
- try:
32
- with open(self.leaderboard_path, 'r', encoding='utf-8') as f:
33
- data = json.load(f)
34
- return data.get("leaderboard", [])
35
- except Exception as e:
36
- print(f"Error loading leaderboard: {e}")
37
- return DEFAULT_DATA.copy()
38
 
39
- def save_leaderboard_data(self, data: List[Dict]) -> bool:
40
- """Save leaderboard data to storage"""
41
- try:
42
- to_store = {
43
- "leaderboard": data,
44
- "last_updated": datetime.now(timezone.utc).isoformat(),
45
- "total_entries": len(data)
46
- }
47
-
48
- with open(self.leaderboard_path, 'w', encoding='utf-8') as f:
49
- json.dump(to_store, f, indent=2, ensure_ascii=False)
50
-
51
- return True
52
- except Exception as e:
53
- print(f"Error saving leaderboard: {e}")
54
- return False
55
 
56
- def load_submission_log(self) -> List[Dict]:
57
- """Load submission log from storage"""
58
- try:
59
- with open(self.submissions_path, 'r', encoding='utf-8') as f:
60
- data = json.load(f)
61
- return data.get("submissions", [])
62
- except Exception as e:
63
- print(f"Error loading submission log: {e}")
64
- return []
65
 
66
- def save_submission_log(self, submissions: List[Dict]) -> bool:
67
- """Save submission log to storage"""
68
- try:
69
- to_store = {
70
- "submissions": submissions,
71
- "last_updated": datetime.now(timezone.utc).isoformat(),
72
- "total_submissions": len(submissions)
73
- }
74
-
75
- with open(self.submissions_path, 'w', encoding='utf-8') as f:
76
- json.dump(to_store, f, indent=2, ensure_ascii=False)
77
-
78
- return True
79
- except Exception as e:
80
- print(f"Error saving submission log: {e}")
81
- return False
82
 
83
- def add_submission(self, submission_data: Dict[str, Any], ip_address: str) -> tuple[bool, str]:
84
- """Add a new submission to the leaderboard"""
85
- try:
86
- # Validate submission data
87
- is_valid, message = validate_submission_data(submission_data)
88
- if not is_valid:
89
- return False, message
90
-
91
- # Add metadata
92
- submission_data["submission_ip"] = ip_address
93
- submission_data["submission_date"] = datetime.now(timezone.utc).isoformat()
94
-
95
- # Load current data
96
- current_data = self.load_leaderboard_data()
97
-
98
- # Check for existing model and replace if found
99
- model_name = submission_data.get("model_name", "")
100
- current_data = [entry for entry in current_data if entry.get("model_name") != model_name]
101
-
102
- # Add new submission
103
- current_data.append(submission_data)
104
-
105
- # Save updated data
106
- if self.save_leaderboard_data(current_data):
107
- # Log the submission
108
- self._log_submission(submission_data, ip_address)
109
- return True, "✅ Submission recorded successfully!"
  else:
111
- return False, "❌ Failed to save submission"
112
-
113
- except Exception as e:
114
- print(f"Error adding submission: {e}")
115
- traceback.print_exc()
116
- return False, f"❌ Submission failed: {str(e)}"
117
-
118
- def _log_submission(self, submission_data: Dict[str, Any], ip_address: str):
119
- """Log submission for audit trail"""
120
- try:
121
- submissions = self.load_submission_log()
122
-
123
- log_entry = {
124
- "model_name": submission_data.get("model_name"),
125
- "programming_language": submission_data.get("programming_language"),
126
- "comment_language": submission_data.get("comment_language"),
127
- "taxonomy_category": submission_data.get("taxonomy_category"),
128
- "scores": {
129
- "bleu": submission_data.get("bleu"),
130
- "llm_pass_1": submission_data.get("llm_pass_1"),
131
- "llm_pass_5": submission_data.get("llm_pass_5"),
132
- "llm_pass_10": submission_data.get("llm_pass_10")
133
- },
134
- "submission_ip": ip_address,
135
- "submission_date": submission_data.get("submission_date"),
136
- "status": "accepted"
137
- }
138
-
139
- submissions.append(log_entry)
140
-
141
- # Keep only last 1000 submissions
142
- submissions = submissions[-1000:]
143
-
144
- self.save_submission_log(submissions)
145
-
146
- except Exception as e:
147
- print(f"Error logging submission: {e}")
148
-
149
- def get_model_history(self, model_name: str) -> List[Dict]:
150
- """Get submission history for a specific model"""
151
- try:
152
- submissions = self.load_submission_log()
153
- return [
154
- sub for sub in submissions
155
- if sub.get("model_name") == model_name
156
- ]
157
- except Exception as e:
158
- print(f"Error getting model history: {e}")
159
- return []
160
-
161
- def get_ip_submissions(self, ip_address: str, limit: int = 10) -> List[Dict]:
162
- """Get recent submissions from a specific IP"""
163
- try:
164
- submissions = self.load_submission_log()
165
- ip_submissions = [
166
- sub for sub in submissions
167
- if sub.get("submission_ip") == ip_address
168
- ]
169
-
170
- # Sort by date and limit
171
- ip_submissions.sort(key=lambda x: x.get("submission_date", ""), reverse=True)
172
- return ip_submissions[:limit]
173
-
174
- except Exception as e:
175
- print(f"Error getting IP submissions: {e}")
176
- return []
177
-
178
- def check_rate_limit(self, ip_address: str, max_submissions: int = 5, hours: int = 24) -> tuple[bool, str]:
179
- """Check if IP has exceeded rate limit"""
180
- try:
181
- submissions = self.get_ip_submissions(ip_address, max_submissions * 2)
182
-
183
- # Count submissions within the time window
184
- cutoff_time = datetime.now(timezone.utc) - timedelta(hours=hours)
185
- recent_submissions = [
186
- sub for sub in submissions
187
- if datetime.fromisoformat(sub.get("submission_date", "")).replace(tzinfo=timezone.utc) > cutoff_time
188
- ]
189
-
190
- if len(recent_submissions) >= max_submissions:
191
- return False, f"Rate limit exceeded: {len(recent_submissions)}/{max_submissions} submissions in {hours} hours"
192
-
193
- return True, f"Rate limit OK: {len(recent_submissions)}/{max_submissions} submissions in {hours} hours"
194
-
195
- except Exception as e:
196
- print(f"Error checking rate limit: {e}")
197
- return True, "Rate limit check failed, allowing submission"
198
-
199
- def get_leaderboard_stats(self) -> Dict[str, Any]:
200
- """Get comprehensive leaderboard statistics"""
201
- try:
202
- data = self.load_leaderboard_data()
203
- submissions = self.load_submission_log()
204
-
205
- basic_stats = get_statistics_summary(data)
206
-
207
- # Additional stats
208
- recent_submissions = len([
209
- sub for sub in submissions
210
- if datetime.fromisoformat(sub.get("submission_date", "")).replace(tzinfo=timezone.utc) >
211
- datetime.now(timezone.utc) - timedelta(days=7)
212
- ])
213
-
214
- return {
215
- **basic_stats,
216
- "recent_submissions_7d": recent_submissions,
217
- "total_logged_submissions": len(submissions),
218
- "last_updated": datetime.now(timezone.utc).isoformat()
219
- }
220
-
221
- except Exception as e:
222
- print(f"Error getting leaderboard stats: {e}")
223
- return {}
224
-
225
- def backup_data(self) -> bool:
226
- """Create backup of current data"""
227
- try:
228
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
229
- backup_dir = Path("backups")
230
- backup_dir.mkdir(exist_ok=True)
231
-
232
- # Backup leaderboard
233
- if self.leaderboard_path.exists():
234
- backup_path = backup_dir / f"leaderboard_{timestamp}.json"
235
- with open(self.leaderboard_path, 'r') as src, open(backup_path, 'w') as dst:
236
- dst.write(src.read())
237
-
238
- # Backup submissions
239
- if self.submissions_path.exists():
240
- backup_path = backup_dir / f"submissions_{timestamp}.json"
241
- with open(self.submissions_path, 'r') as src, open(backup_path, 'w') as dst:
242
- dst.write(src.read())
243
-
244
- return True
245
-
246
- except Exception as e:
247
- print(f"Error creating backup: {e}")
248
- return False
249
-
250
- def export_data(self, format_type: str = "json") -> str:
251
- """Export leaderboard data in specified format"""
252
- try:
253
- from src.display.utils import export_leaderboard_data
254
-
255
- data = self.load_leaderboard_data()
256
- return export_leaderboard_data(data, format_type)
257
-
258
- except Exception as e:
259
- print(f"Error exporting data: {e}")
260
- return f"Export failed: {str(e)}"
261
-
262
- def validate_data_integrity(self) -> Dict[str, Any]:
263
- """Validate data integrity and return report"""
264
- try:
265
- data = self.load_leaderboard_data()
266
- submissions = self.load_submission_log()
267
-
268
- issues = []
269
-
270
- # Check for duplicate models
271
- model_names = [entry.get("model_name") for entry in data]
272
- duplicates = [name for name in model_names if model_names.count(name) > 1]
273
- if duplicates:
274
- issues.append(f"Duplicate models found: {set(duplicates)}")
275
-
276
- # Check for missing required fields
277
- required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
278
- for i, entry in enumerate(data):
279
- missing = [field for field in required_fields if not entry.get(field)]
280
- if missing:
281
- issues.append(f"Entry {i}: Missing fields {missing}")
282
-
283
- # Check score ranges
284
- for i, entry in enumerate(data):
285
- scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
286
- for score in scores:
287
- value = entry.get(score)
288
- if value is not None and (value < 0 or value > 1):
289
- issues.append(f"Entry {i}: {score} out of range: {value}")
290
-
291
- return {
292
- "is_valid": len(issues) == 0,
293
- "issues": issues,
294
- "total_entries": len(data),
295
- "total_submissions": len(submissions),
296
- "check_date": datetime.now(timezone.utc).isoformat()
297
- }
298
-
299
- except Exception as e:
300
- return {
301
- "is_valid": False,
302
- "issues": [f"Validation failed: {str(e)}"],
303
- "total_entries": 0,
304
- "total_submissions": 0,
305
- "check_date": datetime.now(timezone.utc).isoformat()
306
- }
 
1
  """
2
+ Process CodeReview Bench leaderboard data and submissions.
3
  """
4
 
5
  import json
6
+ import os
7
+ import pandas as pd
8
+ from datetime import datetime
9
+ from typing import Dict, List, Tuple, Optional
10
+ import numpy as np
11
+
12
+ from src.display.utils import (
13
+ CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
14
+ MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
15
+ )
16
+
17
+
18
+ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
19
+ """
20
+ Process a JSONL submission file for CodeReview Bench.
21
 
22
+ Args:
23
+ file_path: Path to the JSONL submission file
24
+
25
+ Returns:
26
+ Tuple of (entries_list, message)
27
+ """
28
+ try:
29
+ entries = []
30
+ with open(file_path, 'r', encoding='utf-8') as f:
31
+ for line_num, line in enumerate(f, 1):
32
+ line = line.strip()
33
+ if not line:
34
+ continue
35
+
36
+ try:
37
+ entry = json.loads(line)
38
+
39
+ # Validate required fields
40
+ required_fields = ['model_name', 'programming_language', 'comment_language']
41
+ missing_fields = [field for field in required_fields if field not in entry]
42
+ if missing_fields:
43
+ return [], f"Missing required fields {missing_fields} in line {line_num}"
44
+
45
+ # Validate metrics exist
46
+ has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
47
+ has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
48
+
49
+ if not has_multimetric and not has_exact_match:
50
+ return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
51
+
52
+ entries.append(entry)
53
+
54
+ except json.JSONDecodeError as e:
55
+ return [], f"Invalid JSON in line {line_num}: {e}"
56
+
57
+ if not entries:
58
+ return [], "No valid entries found in submission file"
59
+
60
+ return entries, f"Successfully processed {len(entries)} entries"
61
+
62
+ except Exception as e:
63
+ return [], f"Error processing submission: {e}"
64
+
65
+
66
+ def calculate_overall_score(entry: Dict) -> float:
67
+ """
68
+ Calculate overall score for a CodeReview Bench entry.
69
 
70
+ Args:
71
+ entry: Dictionary containing model evaluation results
 
 
72
 
73
+ Returns:
74
+ Overall score as float
75
+ """
76
+ # Calculate multimetric average
77
+ multimetric_scores = []
78
+ for metric in MULTIMETRIC_METRICS:
79
+ if metric in entry and isinstance(entry[metric], (int, float)):
80
+ multimetric_scores.append(entry[metric])
81
 
82
+ multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
 
 
 
 
 
 
 
 
83
 
84
+ # Calculate exact match average
85
+ exact_match_scores = []
86
+ for metric in EXACT_MATCH_METRICS:
87
+ if metric in entry and isinstance(entry[metric], (int, float)):
88
+ exact_match_scores.append(entry[metric])
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
 
 
 
 
 
 
 
 
91
 
92
+ # Weighted combination (can be adjusted based on requirements)
93
+ overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ return overall_score
96
+
97
+
98
+ def load_leaderboard_data(file_path: str) -> Dict:
99
+ """
100
+ Load the leaderboard data from a JSON file.
101
+ """
102
+ if not os.path.exists(file_path):
103
+ version = "v0"
104
+ if "_v" in file_path:
105
+ version = file_path.split("_")[-1].split(".")[0]
106
+ return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
107
+
108
+ with open(file_path, 'r') as f:
109
+ data = json.load(f)
110
+
111
+ # Ensure version field exists
112
+ if "version" not in data:
113
+ version = "v0"
114
+ if "_v" in file_path:
115
+ version = file_path.split("_")[-1].split(".")[0]
116
+ data["version"] = version
117
+
118
+ return data
119
+
120
+
121
+ def save_leaderboard_data(data: Dict, file_path: str) -> None:
122
+ """
123
+ Save the leaderboard data to a JSON file.
124
+ """
125
+ # Ensure the directory exists
126
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
127
+
128
+ # Update the last_updated timestamp
129
+ data["last_updated"] = datetime.now().isoformat()
130
+
131
+ # Ensure version is set
132
+ if "version" not in data:
133
+ version = "v0"
134
+ if "_v" in file_path:
135
+ version = file_path.split("_")[-1].split(".")[0]
136
+ data["version"] = version
137
+
138
+ with open(file_path, 'w') as f:
139
+ json.dump(data, f, indent=2)
140
+
141
+
142
+ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
143
+ """
144
+ Convert leaderboard data to a pandas DataFrame for display.
145
+ """
146
+ rows = []
147
+
148
+ for entry in leaderboard_data.get("entries", []):
149
+ model_name = entry.get("model_name", "Unknown Model")
150
+
151
+ # Extract basic metadata
152
+ row = {
153
+ "model_name": model_name,
154
+ "model_type": entry.get("model_type", "Unknown"),
155
+ "mode": entry.get("mode", "Strict"),
156
+ "submission_date": entry.get("submission_date", ""),
157
+ "version": entry.get("version", "v0"),
158
+ "review_model_type": entry.get("review_model_type", "custom").lower()
159
+ }
160
+
161
+ # Add additional metadata fields if present
162
+ for key in ["base_model", "revision", "precision", "weight_type"]:
163
+ if key in entry:
164
+ row[key] = entry[key]
165
+
166
+ # Add multimetric scores
167
+ for metric in MULTIMETRIC_METRICS:
168
+ if metric in entry:
169
+ row[metric] = entry[metric]
170
  else:
171
+ row[metric] = pd.NA
172
+
173
+ # Add exact match metrics
174
+ for metric in EXACT_MATCH_METRICS:
175
+ if metric in entry:
176
+ row[metric] = entry[metric]
177
+ else:
178
+ row[metric] = pd.NA
179
+
180
+ # Calculate aggregated metrics
181
+ multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
182
+ exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
183
+
184
+ if multimetric_scores:
185
+ row["multimetric_average"] = np.mean(multimetric_scores)
186
+ else:
187
+ row["multimetric_average"] = pd.NA
188
+
189
+ if exact_match_scores:
190
+ row["exact_match_average"] = np.mean(exact_match_scores)
191
+ else:
192
+ row["exact_match_average"] = pd.NA
193
+
194
+ # Calculate overall score
195
+ row["overall_score"] = calculate_overall_score(entry)
196
+
197
+ # Add language-specific metrics if available
198
+ for lang in COMMENT_LANGUAGES:
199
+ for metric in ["readability", "relevance", "overall_score"]:
200
+ lang_key = f"{lang}_{metric}"
201
+ if lang_key in entry:
202
+ row[lang_key] = entry[lang_key]
203
+ else:
204
+ row[lang_key] = pd.NA
205
+
206
+ # Add evaluation count
207
+ row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
208
+
209
+ rows.append(row)
210
+
211
+ # Create DataFrame and sort by overall score
212
+ df = pd.DataFrame(rows)
213
+
214
+ # Ensure all expected columns exist
215
+ for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
216
+ if metric not in df.columns:
217
+ df[metric] = pd.NA
218
+
219
+ # Sort by overall score (descending)
220
+ if not df.empty:
221
+ df = df.sort_values(by="overall_score", ascending=False, na_position='last')
222
+
223
+ # Ensure summary columns exist
224
+ summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
225
+ for col in summary_cols:
226
+ if col not in df.columns:
227
+ df[col] = pd.NA
228
+
229
+ return df
230
+
231
+
232
+ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
233
+ """
234
+ Add new entries to the leaderboard, replacing any with the same model name.
235
+ """
236
+ # Create a mapping of existing entries by model name and version
237
+ existing_entries = {
238
+ (entry["model_name"], entry.get("version", "v0")): i
239
+ for i, entry in enumerate(leaderboard_data.get("entries", []))
240
+ }
241
+
242
+ # Process each new entry
243
+ for new_entry in new_entries:
244
+ model_name = new_entry.get("model_name")
245
+ version = new_entry.get("version", "v0")
246
+
247
+ # Add calculated metrics
248
+ new_entry["overall_score"] = calculate_overall_score(new_entry)
249
+
250
+ # Calculate averages
251
+ multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
252
+ exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
253
+
254
+ if multimetric_scores:
255
+ new_entry["multimetric_average"] = np.mean(multimetric_scores)
256
+ if exact_match_scores:
257
+ new_entry["exact_match_average"] = np.mean(exact_match_scores)
258
+
259
+ if (model_name, version) in existing_entries:
260
+ # Replace existing entry
261
+ leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
262
+ else:
263
+ # Add new entry
264
+ if "entries" not in leaderboard_data:
265
+ leaderboard_data["entries"] = []
266
+ leaderboard_data["entries"].append(new_entry)
267
+
268
+ # Update the last_updated timestamp
269
+ leaderboard_data["last_updated"] = datetime.now().isoformat()
270
+
271
+ return leaderboard_data
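For reference, a hedged local sketch (not part of this commit) of the processing pipeline defined above; the file paths are illustrative.

# Illustrative only: local round-trip through the processor functions above.
from src.leaderboard.processor import (
    process_jsonl_submission, add_entries_to_leaderboard,
    load_leaderboard_data, save_leaderboard_data, leaderboard_to_dataframe,
)

entries, msg = process_jsonl_submission("submission.jsonl")    # hypothetical local file
if entries:
    board = load_leaderboard_data("data/leaderboard_v0.json")  # returns an empty structure if missing
    board = add_entries_to_leaderboard(board, entries)
    save_leaderboard_data(board, "data/leaderboard_v0.json")
    print(leaderboard_to_dataframe(board)[["model_name", "overall_score"]].head())
else:
    print(msg)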
src/populate.py ADDED
@@ -0,0 +1,171 @@
1
+ """
2
+ Populate the CodeReview Bench leaderboard from HuggingFace datasets.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import pandas as pd
8
+ import tempfile
9
+ from typing import Dict, List, Optional
10
+ from datetime import datetime
11
+ import numpy as np
12
+
13
+ from huggingface_hub import hf_hub_download, HfApi
14
+ from datasets import load_dataset
15
+
16
+ from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
17
+ from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
18
+ from src.leaderboard.processor import leaderboard_to_dataframe
19
+
20
+
21
+ def get_latest_leaderboard(version="v0") -> Optional[Dict]:
22
+ """
23
+ Get the latest leaderboard data from HuggingFace dataset.
24
+ """
25
+ try:
26
+ # Try to download the leaderboard file
27
+ leaderboard_path = hf_hub_download(
28
+ repo_id=RESULTS_DATASET_ID,
29
+ filename=f"leaderboards/leaderboard_{version}.json",
30
+ repo_type="dataset",
31
+ token=TOKEN
32
+ )
33
+
34
+ with open(leaderboard_path, 'r') as f:
35
+ return json.load(f)
36
+ except Exception as e:
37
+ print(f"Error downloading leaderboard: {e}")
38
+ return None
39
+
40
+
41
+ def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
42
+ """
43
+ Get a specific model's entry from the entries folder, uniquely identified by model_name, mode, and version.
44
+ """
45
+ try:
46
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
47
+ mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
48
+ entry_path = hf_hub_download(
49
+ repo_id=RESULTS_DATASET_ID,
50
+ filename=f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json",
51
+ repo_type="dataset",
52
+ token=TOKEN
53
+ )
54
+ with open(entry_path, 'r') as f:
55
+ return json.load(f)
56
+ except Exception as e:
57
+ print(f"Error downloading model entry: {e}")
58
+ return None
59
+
60
+
61
+ def get_all_entries(version="v0") -> List[Dict]:
62
+ """
63
+ Get all entries from the HuggingFace dataset.
64
+ """
65
+ try:
66
+ api = HfApi(token=TOKEN)
67
+ files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
68
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
69
+
70
+ all_entries = []
71
+ for entry_file in entry_files:
72
+ try:
73
+ entry_path = hf_hub_download(
74
+ repo_id=RESULTS_DATASET_ID,
75
+ filename=entry_file,
76
+ repo_type="dataset",
77
+ token=TOKEN
78
+ )
79
+ with open(entry_path, 'r') as f:
80
+ entry_data = json.load(f)
81
+ all_entries.append(entry_data)
82
+ except Exception as e:
83
+ print(f"Error loading entry {entry_file}: {e}")
84
+
85
+ return all_entries
86
+ except Exception as e:
87
+ print(f"Error getting all entries: {e}")
88
+ return []
89
+
90
+
91
+ def get_leaderboard_df(version="v0") -> pd.DataFrame:
92
+ """
93
+ Get the leaderboard data as a DataFrame.
94
+ """
95
+ # Get latest leaderboard data
96
+ leaderboard_data = get_latest_leaderboard(version)
97
+
98
+ if not leaderboard_data:
99
+ # If no leaderboard exists, try to build it from entries
100
+ entries = get_all_entries(version)
101
+ if entries:
102
+ leaderboard_data = {
103
+ "entries": entries,
104
+ "last_updated": datetime.now().isoformat(),
105
+ "version": version
106
+ }
107
+ else:
108
+ # Return empty DataFrame if no data available
109
+ return pd.DataFrame(columns=DISPLAY_COLS)
110
+
111
+ # Convert to DataFrame
112
+ return leaderboard_to_dataframe(leaderboard_data)
113
+
114
+
115
+ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
116
+ """
117
+ Get the leaderboard data filtered by a specific programming language category.
118
+ """
119
+ # Get latest leaderboard data
120
+ leaderboard_data = get_latest_leaderboard(version)
121
+
122
+ if not leaderboard_data:
123
+ # If no leaderboard exists, try to build it from entries
124
+ entries = get_all_entries(version)
125
+ if entries:
126
+ leaderboard_data = {
127
+ "entries": entries,
128
+ "last_updated": datetime.now().isoformat(),
129
+ "version": version
130
+ }
131
+ else:
132
+ # Return empty DataFrame if no data available
133
+ return pd.DataFrame(columns=DISPLAY_COLS)
134
+
135
+ # Filter entries to only include those with data for the specified programming language
136
+ filtered_entries = []
137
+ for entry in leaderboard_data.get("entries", []):
138
+ # Check if entry has data for this programming language
139
+ programming_language = entry.get("programming_language", "").lower()
140
+ if programming_language == category.lower() or category.lower() == "other":
141
+ # For "other" category, include entries that don't match any specific language
142
+ if category.lower() == "other":
143
+ if programming_language not in [cat.lower() for cat in CATEGORIES[:-1]]: # Exclude "Other" from check
144
+ filtered_entries.append(entry)
145
+ else:
146
+ filtered_entries.append(entry)
147
+
148
+ # Create a new leaderboard data structure with the filtered entries
149
+ filtered_leaderboard = {
150
+ "entries": filtered_entries,
151
+ "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
152
+ "version": version
153
+ }
154
+
155
+ # Convert to DataFrame
156
+ return leaderboard_to_dataframe(filtered_leaderboard)
157
+
158
+
159
+ def get_detailed_model_data(model_name: str, mode: str, version="v0") -> Dict:
160
+ """
161
+ Get detailed data for a specific model and mode.
162
+ """
163
+ entry = get_model_entry(model_name, mode, version)
164
+ if entry:
165
+ return entry
166
+ leaderboard_data = get_latest_leaderboard(version)
167
+ if leaderboard_data:
168
+ for entry in leaderboard_data.get("entries", []):
169
+ if entry.get("model_name") == model_name and str(entry.get("mode")).lower() == str(mode).lower():
170
+ return entry
171
+ return {}
src/submission/submit.py CHANGED
@@ -1,386 +1,184 @@
1
  """
2
- Submission system for CodeReview Leaderboard
3
  """
4
 
5
- import gradio as gr
6
- import re
7
- from typing import Dict, Any, List, Tuple
8
- from datetime import datetime, timezone
9
- from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
10
- from src.leaderboard.processor import LeaderboardProcessor
11
- from src.display.utils import get_main_leaderboard_data, get_quality_metrics_data
12
 
13
- class SubmissionHandler:
14
- """Handles model submissions with validation and rate limiting"""
15
-
16
- def __init__(self):
17
- self.processor = LeaderboardProcessor()
18
-
19
- def get_client_ip(self, request: gr.Request) -> str:
20
- """Extract client IP address from request"""
21
- try:
22
- # Check for forwarded headers first
23
- forwarded_for = request.headers.get('X-Forwarded-For')
24
- if forwarded_for:
25
- # Take the first IP if multiple
26
- ip = forwarded_for.split(',')[0].strip()
27
- return ip
28
-
29
- # Check for real IP header
30
- real_ip = request.headers.get('X-Real-IP')
31
- if real_ip:
32
- return real_ip.strip()
33
-
34
- # Fall back to client host
35
- if hasattr(request, 'client') and hasattr(request.client, 'host'):
36
- return request.client.host
37
-
38
- # Default fallback
39
- return "127.0.0.1"
40
-
41
- except Exception as e:
42
- print(f"Error getting client IP: {e}")
43
- return "127.0.0.1"
44
-
45
- def validate_model_name(self, model_name: str) -> Tuple[bool, str]:
46
- """Validate model name format"""
47
- if not model_name or not model_name.strip():
48
- return False, "Model name cannot be empty"
49
-
50
- model_name = model_name.strip()
51
-
52
- # Check length
53
- if len(model_name) > 100:
54
- return False, "Model name too long (max 100 characters)"
55
-
56
- # Check for valid characters
57
- if not re.match(r'^[a-zA-Z0-9._/-]+$', model_name):
58
- return False, "Model name contains invalid characters (only letters, numbers, dots, hyphens, underscores, and slashes allowed)"
59
-
60
- # Check for organization/model format
61
- if "/" in model_name:
62
- parts = model_name.split("/")
63
- if len(parts) != 2:
64
- return False, "Model name should be in format 'organization/model'"
65
- if not parts[0] or not parts[1]:
66
- return False, "Both organization and model name must be specified"
67
-
68
- return True, "Valid model name"
69
-
70
- def validate_scores(self, scores: Dict[str, float]) -> Tuple[bool, str]:
71
- """Validate score values"""
72
- required_scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
73
-
74
- for score_name in required_scores:
75
- value = scores.get(score_name)
76
-
77
- if value is None:
78
- return False, f"Missing score: {score_name}"
79
-
80
- if not isinstance(value, (int, float)):
81
- return False, f"Invalid score format for {score_name}: must be a number"
82
-
83
- if not (0 <= value <= 1):
84
- return False, f"Score {score_name} out of range: {value} (must be between 0 and 1)"
85
-
86
- # Check logical consistency
87
- if scores["llm_pass_1"] > scores["llm_pass_5"]:
88
- return False, "Pass@1 score cannot be higher than Pass@5"
89
-
90
- if scores["llm_pass_5"] > scores["llm_pass_10"]:
91
- return False, "Pass@5 score cannot be higher than Pass@10"
92
-
93
- return True, "Valid scores"
94
-
95
- def validate_metrics(self, metrics: Dict[str, int]) -> Tuple[bool, str]:
96
- """Validate quality metrics"""
97
- for metric_name in QUALITY_METRICS:
98
- value = metrics.get(metric_name)
99
-
100
- if value is None:
101
- return False, f"Missing metric: {metric_name}"
102
-
103
- if not isinstance(value, (int, float)):
104
- return False, f"Invalid metric format for {metric_name}: must be a number"
105
-
106
- if not (0 <= value <= 10):
107
- return False, f"Metric {metric_name} out of range: {value} (must be between 0 and 10)"
108
-
109
- return True, "Valid metrics"
110
-
111
- def submit_model(
112
- self,
113
- request: gr.Request,
114
- current_data: List[Dict],
115
- model_name: str,
116
- programming_language: str,
117
- comment_language: str,
118
- taxonomy_category: str,
119
- bleu: float,
120
- llm_pass_1: float,
121
- llm_pass_5: float,
122
- llm_pass_10: float,
123
- readability: int,
124
- relevance: int,
125
- explanation_clarity: int,
126
- problem_identification: int,
127
- actionability: int,
128
- completeness: int,
129
- specificity: int,
130
- contextual_adequacy: int,
131
- consistency: int,
132
- brevity: int,
133
- ) -> Tuple[List[Dict], List[List[str]], List[List[str]], str]:
134
- """Handle model submission with full validation"""
135
-
136
- try:
137
- # Get client IP
138
- client_ip = self.get_client_ip(request)
139
-
140
- # Check rate limiting
141
- rate_ok, rate_msg = self.processor.check_rate_limit(client_ip)
142
- if not rate_ok:
143
- return current_data, [], [], f"❌ {rate_msg}"
144
-
145
- # Validate model name
146
- name_valid, name_msg = self.validate_model_name(model_name)
147
- if not name_valid:
148
- return current_data, [], [], f"❌ {name_msg}"
149
-
150
- # Validate scores
151
- scores = {
152
- "bleu": bleu,
153
- "llm_pass_1": llm_pass_1,
154
- "llm_pass_5": llm_pass_5,
155
- "llm_pass_10": llm_pass_10
156
- }
157
- scores_valid, scores_msg = self.validate_scores(scores)
158
- if not scores_valid:
159
- return current_data, [], [], f"❌ {scores_msg}"
160
-
161
- # Validate metrics
162
- metrics = {
163
- "readability": readability,
164
- "relevance": relevance,
165
- "explanation_clarity": explanation_clarity,
166
- "problem_identification": problem_identification,
167
- "actionability": actionability,
168
- "completeness": completeness,
169
- "specificity": specificity,
170
- "contextual_adequacy": contextual_adequacy,
171
- "consistency": consistency,
172
- "brevity": brevity,
173
- }
174
- metrics_valid, metrics_msg = self.validate_metrics(metrics)
175
- if not metrics_valid:
176
- return current_data, [], [], f"❌ {metrics_msg}"
177
-
178
- # Create submission data
179
- submission_data = {
180
- "model_name": model_name.strip(),
181
- "programming_language": programming_language,
182
- "comment_language": comment_language,
183
- "taxonomy_category": taxonomy_category,
184
- "bleu": bleu,
185
- "llm_pass_1": llm_pass_1,
186
- "llm_pass_5": llm_pass_5,
187
- "llm_pass_10": llm_pass_10,
188
- "metrics": metrics
189
- }
190
-
191
- # Submit to processor
192
- success, message = self.processor.add_submission(submission_data, client_ip)
193
-
194
- if success:
195
- # Load updated data
196
- updated_data = self.processor.load_leaderboard_data()
197
-
198
- # Format tables
199
- main_table = get_main_leaderboard_data(updated_data)
200
- quality_table = get_quality_metrics_data(updated_data)
201
-
202
- return updated_data, main_table, quality_table, message
203
- else:
204
- return current_data, [], [], message
205
-
206
- except Exception as e:
207
- print(f"Error in submission: {e}")
208
- return current_data, [], [], f"❌ Submission failed: {str(e)}"
209
-
210
- def get_submission_form_components(self):
211
- """Create gradio components for submission form"""
212
 
213
- with gr.Accordion("📝 Submit New Model Results", open=False):
214
- gr.Markdown("""
215
- ### Submission Guidelines
216
- - Provide accurate scores based on proper evaluation
217
- - Model name should follow 'organization/model' format
218
- - All metrics are required
219
- - Submissions are rate-limited per IP address
220
- """)
221
-
222
- with gr.Row():
223
- model_name = gr.Textbox(
224
- label="Model Name",
225
- placeholder="e.g., microsoft/CodeT5-base",
226
- info="Use organization/model format"
227
- )
228
- programming_language = gr.Dropdown(
229
- choices=PROGRAMMING_LANGUAGES,
230
- value="All",
231
- label="Programming Language",
232
- info="Primary programming language evaluated"
233
- )
234
- comment_language = gr.Dropdown(
235
- choices=COMMENT_LANGUAGES,
236
- value="English",
237
- label="Comment Language",
238
- info="Natural language of code comments"
239
- )
240
- taxonomy_category = gr.Dropdown(
241
- choices=TAXONOMY_CATEGORIES,
242
- value="All",
243
- label="Taxonomy Category",
244
- info="Primary review category focus"
245
- )
246
-
247
- gr.Markdown("### 📊 Performance Scores (0.0 - 1.0)")
248
- with gr.Row():
249
- bleu = gr.Number(
250
- label="BLEU Score",
251
- value=0.0,
252
- minimum=0.0,
253
- maximum=1.0,
254
- step=0.001,
255
- info="BLEU similarity score"
256
- )
257
- pass1 = gr.Number(
258
- label="Pass@1",
259
- value=0.0,
260
- minimum=0.0,
261
- maximum=1.0,
262
- step=0.001,
263
- info="Success rate in 1 attempt"
264
- )
265
- pass5 = gr.Number(
266
- label="Pass@5",
267
- value=0.0,
268
- minimum=0.0,
269
- maximum=1.0,
270
- step=0.001,
271
- info="Success rate in 5 attempts"
272
- )
273
- pass10 = gr.Number(
274
- label="Pass@10",
275
- value=0.0,
276
- minimum=0.0,
277
- maximum=1.0,
278
- step=0.001,
279
- info="Success rate in 10 attempts"
280
- )
281
-
282
- gr.Markdown("### 📋 Quality Metrics (0 - 10)")
283
- with gr.Row():
284
- readability = gr.Slider(
285
- minimum=0, maximum=10, value=5, step=1,
286
- label="Readability",
287
- info="How readable are the generated reviews?"
288
- )
289
- relevance = gr.Slider(
290
- minimum=0, maximum=10, value=5, step=1,
291
- label="Relevance",
292
- info="How relevant to the code changes?"
293
- )
294
- explanation_clarity = gr.Slider(
295
- minimum=0, maximum=10, value=5, step=1,
296
- label="Explanation Clarity",
297
- info="How clear are the explanations?"
298
- )
299
- problem_identification = gr.Slider(
300
- minimum=0, maximum=10, value=5, step=1,
301
- label="Problem Identification",
302
- info="How well does it identify issues?"
303
- )
304
- actionability = gr.Slider(
305
- minimum=0, maximum=10, value=5, step=1,
306
- label="Actionability",
307
- info="How actionable are the suggestions?"
308
- )
309
-
310
- with gr.Row():
311
- completeness = gr.Slider(
312
- minimum=0, maximum=10, value=5, step=1,
313
- label="Completeness",
314
- info="How complete are the reviews?"
315
- )
316
- specificity = gr.Slider(
317
- minimum=0, maximum=10, value=5, step=1,
318
- label="Specificity",
319
- info="How specific are the comments?"
320
- )
321
- contextual_adequacy = gr.Slider(
322
- minimum=0, maximum=10, value=5, step=1,
323
- label="Contextual Adequacy",
324
- info="How well does it understand context?"
325
- )
326
- consistency = gr.Slider(
327
- minimum=0, maximum=10, value=5, step=1,
328
- label="Consistency",
329
- info="How consistent across reviews?"
330
- )
331
- brevity = gr.Slider(
332
- minimum=0, maximum=10, value=5, step=1,
333
- label="Brevity",
334
- info="How concise are the reviews?"
335
- )
336
-
337
- submit_btn = gr.Button("🚀 Submit Model", variant="primary")
338
- status_msg = gr.Markdown("")
339
-
340
- # Return all components for use in the main app
341
- return {
342
- "model_name": model_name,
343
- "programming_language": programming_language,
344
- "comment_language": comment_language,
345
- "taxonomy_category": taxonomy_category,
346
- "bleu": bleu,
347
- "pass1": pass1,
348
- "pass5": pass5,
349
- "pass10": pass10,
350
- "readability": readability,
351
- "relevance": relevance,
352
- "explanation_clarity": explanation_clarity,
353
- "problem_identification": problem_identification,
354
- "actionability": actionability,
355
- "completeness": completeness,
356
- "specificity": specificity,
357
- "contextual_adequacy": contextual_adequacy,
358
- "consistency": consistency,
359
- "brevity": brevity,
360
- "submit_btn": submit_btn,
361
- "status_msg": status_msg,
362
- }
363
-
364
- def get_submission_history(self, ip_address: str) -> List[List[str]]:
365
- """Get submission history for display"""
366
  try:
367
- submissions = self.processor.get_ip_submissions(ip_address)
368
-
369
- table_data = []
370
- for sub in submissions:
371
- row = [
372
- sub.get("model_name", ""),
373
- sub.get("programming_language", ""),
374
- sub.get("comment_language", ""),
375
- sub.get("taxonomy_category", ""),
376
- f"{sub.get('scores', {}).get('llm_pass_1', 0):.3f}",
377
- sub.get("submission_date", "").split("T")[0] if sub.get("submission_date") else "",
378
- sub.get("status", "")
379
- ]
380
- table_data.append(row)
381
-
382
- return table_data
383
-
384
- except Exception as e:
385
- print(f"Error getting submission history: {e}")
386
- return []
 
1
  """
2
+ Handle submissions to the CodeReview Bench leaderboard.
3
  """
4
 
5
+ import json
6
+ import os
7
+ import tempfile
8
+ from datetime import datetime
9
+ from typing import Dict, List, Tuple
 
 
10
 
11
+ from huggingface_hub import HfApi
12
+ from datasets import load_dataset
13
+
14
+ from src.display.formatting import styled_error, styled_message
15
+ from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
16
+ from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard
17
+
18
+
19
+ def validate_submission(file_path: str) -> Tuple[bool, str]:
20
+ """
21
+ Validate a submission file.
22
+ """
23
+ try:
24
+ entries, message = process_jsonl_submission(file_path)
25
+ if not entries:
26
+ return False, message
27
+ return True, "Submission is valid"
28
+ except Exception as e:
29
+ return False, f"Error validating submission: {e}"
30
+
31
+
32
+ def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
33
+ """
34
+ Submit a model's evaluation entry to the HuggingFace dataset. The entry is uniquely identified by model_name, mode, and version.
35
+ """
36
+ try:
37
+ # Create safe model name for file path
38
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
39
+ mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
40
+
41
+ # Create entry path in entries folder
42
+ entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
43
+
44
+ # Save entry to temporary file
45
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
46
+ json.dump(entry, temp_file, indent=2)
47
+ temp_path = temp_file.name
48
+
49
+ # Upload file
50
+ api = HfApi(token=TOKEN)
51
+ api.upload_file(
52
+ path_or_fileobj=temp_path,
53
+ path_in_repo=entry_path,
54
+ repo_id=RESULTS_DATASET_ID,
55
+ repo_type="dataset",
56
+ commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
57
+ )
58
+
59
+ os.unlink(temp_path)
60
+ return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
61
+ except Exception as e:
62
+ return False, f"Error submitting entry to dataset: {e}"
63
+
64
+
65
+ def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
66
+ """
67
+ Submit updated leaderboard to the HuggingFace dataset.
68
+ """
69
+ try:
70
+ # Create leaderboard data
71
+ leaderboard_data = {
72
+ "entries": entries,
73
+ "last_updated": datetime.now().isoformat(),
74
+ "version": version
75
+ }
76
+
77
+ # Save to temporary file
78
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
79
+ json.dump(leaderboard_data, temp_file, indent=2)
80
+ temp_path = temp_file.name
81
+
82
+ # Upload file
83
+ api = HfApi(token=TOKEN)
84
+ api.upload_file(
85
+ path_or_fileobj=temp_path,
86
+ path_in_repo=f"leaderboards/leaderboard_{version}.json",
87
+ repo_id=RESULTS_DATASET_ID,
88
+ repo_type="dataset",
89
+ commit_message=f"Update leaderboard for version {version}"
90
+ )
91
+
92
+ os.unlink(temp_path)
93
+ return True, "Leaderboard updated successfully"
94
+ except Exception as e:
95
+ return False, f"Error updating leaderboard: {e}"
96
+
97
+
98
+ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
99
+ """
100
+ Process a submission to the CodeReview Bench leaderboard.
101
+ """
102
+ try:
103
+ # Validate submission
104
+ is_valid, validation_message = validate_submission(file_path)
105
+ if not is_valid:
106
+ return styled_error(validation_message)
107
+
108
+ # Process the submission entries
109
+ entries, message = process_jsonl_submission(file_path)
110
+ if not entries:
111
+ return styled_error(f"Failed to process submission: {message}")
112
+
113
+ # Upload raw submission file
114
+ model_name = metadata.get("model_name", "unknown")
115
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
116
 
117
+ api = HfApi(token=TOKEN)
118
+ submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
119
+ api.upload_file(
120
+ path_or_fileobj=file_path,
121
+ path_in_repo=submission_path,
122
+ repo_id=RESULTS_DATASET_ID,
123
+ repo_type="dataset",
124
+ commit_message=f"Add raw submission for {model_name}"
125
+ )
126
+
127
+ # Process entries and add metadata
128
+ processed_entries = []
129
+ for entry in entries:
130
+ # Add metadata to entry
131
+ entry.update({
132
+ "model_name": metadata.get("model_name"),
133
+ "model_type": metadata.get("model_type"),
134
+ "review_model_type": str(metadata.get("review_model_type", "custom")).lower(),
135
+ "mode": metadata.get("mode"),
136
+ "base_model": metadata.get("base_model"),
137
+ "revision": metadata.get("revision"),
138
+ "precision": metadata.get("precision"),
139
+ "weight_type": metadata.get("weight_type"),
140
+ "version": version,
141
+ "submission_date": datetime.now().isoformat()
142
+ })
143
+ processed_entries.append(entry)
144
+
145
+ # Submit entries to entries folder
146
+ for entry in processed_entries:
147
+ success, message = submit_entry_to_hub(entry, model_name, metadata.get("mode"), version)
148
+ if not success:
149
+ return styled_error(message)
150
+
151
+ # Get all entries from HF dataset and update leaderboard
152
+ files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
153
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
154
+
155
+ all_entries = []
156
+ for entry_file in entry_files:
157
+ try:
158
+ entry_path = api.hf_hub_download(
159
+ repo_id=RESULTS_DATASET_ID,
160
+ filename=entry_file,
161
+ repo_type="dataset",
162
+ )
163
+ with open(entry_path, 'r') as f:
164
+ entry_data = json.load(f)
165
+ all_entries.append(entry_data)
166
+ except Exception as e:
167
+ print(f"Error loading entry {entry_file}: {e}")
168
+
169
+ # Update leaderboard with all entries
170
+ success, message = submit_leaderboard_to_hub(all_entries, version)
171
+ if not success:
172
+ return styled_error(message)
173
+
174
+ return styled_message("Submission successful! Model evaluated and leaderboard updated.")
175
+
176
+ except Exception as e:
177
+ return styled_error(f"Error processing submission: {e}")
178
+ finally:
179
+ # Clean up temporary files if they exist
180
  try:
181
+ if os.path.exists(file_path):
182
+ os.remove(file_path)
183
+ except Exception:
184
+ pass
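For reference, a hedged sketch (not part of this commit) of the submission flow above; the JSONL fields shown are the ones process_jsonl_submission checks for, all values are illustrative, and actually uploading requires valid HF credentials.

# Illustrative only: building a minimal JSONL submission and passing it through process_submission.
import json
from src.submission.submit import process_submission

sample = {
    "model_name": "org/model",             # required
    "programming_language": "Python",      # required
    "comment_language": "en",              # required
    "readability": 7, "relevance": 8,      # at least one multimetric or exact-match metric is required
    "pass_at_1": 0.42, "bleu_at_10": 0.31,
}
with open("submission.jsonl", "w") as f:
    f.write(json.dumps(sample) + "\n")

metadata = {"model_name": "org/model", "model_type": "OpenSource",
            "review_model_type": "custom", "mode": "Strict"}
print(process_submission("submission.jsonl", metadata, version="v0"))  # uploads entries and refreshes the leaderboard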