Alex committed on
Commit 9404fa8 · 2 Parent(s): 00327b5 f990f50

merge_resolve

.env.template ADDED
@@ -0,0 +1,6 @@
1
+ HF_TOKEN="your_huggingface_write_token"
2
+ OWNER="your_huggingface_username_or_org"
3
+ RESULTS_DATASET_ID="your_username/guardbench-results"
4
+ SUBMITTER_TOKEN="your_secret_submission_token"
5
+ ADMIN_USERNAME="admin"
6
+ ADMIN_PASSWORD="password" # Change this!
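These values are placeholders: copy `.env.template` to `.env` and fill in real credentials. A minimal sketch of how the application might load them at startup (the actual loading lives in `src/envs.py`, which `app.py` imports from; python-dotenv is an assumption here):

```python
# Hypothetical sketch, not code from this commit: read the .env values into the
# names that app.py imports from src.envs (TOKEN, RESULTS_DATASET_ID, ...).
import os

from dotenv import load_dotenv  # assumption: python-dotenv is available

load_dotenv()  # picks up a local .env created from .env.template

TOKEN = os.environ.get("HF_TOKEN")
OWNER = os.environ.get("OWNER")
RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID")
SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME", "admin")
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
```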
.gitignore CHANGED
@@ -1,13 +1,52 @@
1
- auto_evals/
2
- venv/
3
 __pycache__/
4
  .env
5
- .ipynb_checkpoints
6
- *ipynb
7
 .vscode/
8
 
 
9
  eval-queue/
10
  eval-results/
11
  eval-queue-bk/
12
  eval-results-bk/
13
- logs/
1
+ # Python
 
2
  __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ build/
9
+ develop-eggs/
10
+ dist/
11
+ downloads/
12
+ eggs/
13
+ .eggs/
14
+ lib/
15
+ lib64/
16
+ parts/
17
+ sdist/
18
+ var/
19
+ .venv/
20
+ *.egg-info/
21
+ .installed.cfg
22
+ *.egg
23
+ .gradio/
24
+
25
+ # Environment variables
26
  .env
27
+
28
+ # Virtual Environment
29
+ venv/
30
+ ENV/
31
+
32
+ # IDE
33
+ .idea/
34
  .vscode/
35
+ *.swp
36
+ *.swo
37
+
38
+ # OS
39
+ .DS_Store
40
+ Thumbs.db
41
 
42
+ # Hugging Face cache
43
  eval-queue/
44
  eval-results/
45
  eval-queue-bk/
46
  eval-results-bk/
47
+
48
+ # Data files
49
+ data/
50
+
51
+ # Versioned leaderboard files
52
+ data/leaderboard_v*.json
.gitmodules ADDED
@@ -0,0 +1,3 @@
1
+ [submodule "guard-bench-submodule"]
2
+ path = guard-bench-submodule
3
+ url = https://github.com/whitecircle-ai/circle-guard-bench.git
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,136 +1,158 @@
1
  ---
2
- title: CodeReview Leaderboard
3
- emoji: 🥇
4
- colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: true
9
- license: mit
10
- short_description: CodeReview Leaderboard for evaluating code review models
11
- sdk_version: 5.19.0
12
- storage: persistent
13
  ---
14
 
15
- # 🏆 CodeReview Leaderboard
16
 
 
17
  A comprehensive benchmark and leaderboard for code review generation models, inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench).
18
-
19
- ## Features
20
-
21
- ### 🎯 Core Functionality
22
-
23
- - **Multi-dimensional Evaluation**: Track models across BLEU scores, Pass@1/5/10 metrics, and 10 quality dimensions
24
- - **Advanced Filtering**: Filter results by programming language, comment language, and taxonomy category
25
- - **Real-time Updates**: Dynamic leaderboard updates with instant filtering
26
- - **Dark Theme**: Modern, eye-friendly interface with GitHub-inspired dark theme
27
-
28
- ### 🔍 Advanced Analytics
29
-
30
- - **Language Performance**: Compare model performance across programming languages
31
- - **Category Analysis**: Analyze performance by review type (bug detection, security, etc.)
32
- - **Submission History**: Track all submissions with IP-based logging
33
- - **Statistical Insights**: Comprehensive statistics and trend analysis
34
-
35
- ### 🛡️ Security & Quality
36
-
37
- - **IP-based Rate Limiting**: Prevent spam submissions (5 per 24 hours per IP)
38
- - **Comprehensive Validation**: Multi-layer validation for all submissions
39
- - **Audit Trail**: Complete submission logging for transparency
40
- - **Data Integrity**: Automatic data validation and backup systems
41
-
42
- ### 🌐 Multi-Language Support
43
-
44
- - **Programming Languages**: Python, JavaScript, Java, C++, Go, Rust, and more
45
- - **Comment Languages**: English, Chinese, Spanish, French, German, Japanese, and more
46
- - **Taxonomy Categories**: Bug Detection, Security, Performance, Style, and more
47
-
48
- ## 🚀 Quick Start
49
-
50
- ### Installation
51
 
52
  ```bash
53
  pip install -r requirements.txt
54
  ```
55
 
56
- ### Run Locally
57
 
58
  ```bash
59
  python app.py
60
  ```
61
 
62
- ### Access the Interface
63
-
64
- Open your browser to `http://localhost:7860`
65
-
66
- ## 📊 Usage Guide
67
-
68
- ### 1. Viewing the Leaderboard
69
-
70
- - Navigate to the **🏆 Leaderboard** tab
71
- - Use the filter dropdowns to narrow results:
72
- - **Programming Language**: Filter by specific programming languages
73
- - **Comment Language**: Filter by natural language of comments
74
- - **Taxonomy Category**: Filter by review category type
75
- - Click **🔄 Refresh** to update data
76
-
77
- ### 2. Submitting Models
78
-
79
- - Go to the **📝 Submit Model** tab
80
- - Fill in the submission form:
81
- - **Model Name**: Use `organization/model` format
82
- - **Languages & Category**: Select appropriate filters
83
- - **Performance Scores**: Provide BLEU and Pass@k scores (0.0-1.0)
84
- - **Quality Metrics**: Rate across 10 dimensions (0-10)
85
- - Click **🚀 Submit Model** to add your results
86
-
87
- ### 3. Analytics & Insights
88
-
89
- - Visit the **📈 Analytics** tab to see:
90
- - Recent submission history
91
- - Language performance comparisons
92
- - Category performance analysis
93
- - Trends and patterns
94
-
95
- ### 4. Data Export
96
-
97
- - Use the **ℹ️ About** tab to export data in JSON or CSV format
98
- - Full leaderboard data available for research and analysis
99
-
100
- ## 🏗️ Architecture
101
-
102
- ### Directory Structure
103
-
104
- ```
105
- ├── src/
106
- │ ├── about.py # About page content
107
- │ ├── envs.py # Environment configuration
108
- │ ├── display/ # Display utilities
109
- │ │ ├── css_html_js.py # Styling and themes
110
- │ │ ├── formatting.py # Data formatting
111
- │ │ └── utils.py # Display utilities
112
- │ ├── leaderboard/ # Leaderboard processing
113
- │ │ └── processor.py # Data operations
114
- │ └── submission/ # Submission handling
115
- │ └── submit.py # Submission validation
116
- ├── data/ # Data storage
117
- │ ├── leaderboard_data.json # Main leaderboard
118
- │ └── submissions.json # Submission log
119
- ├── app.py # Main application
120
- └── requirements.txt # Dependencies
121
  ```
122
 
123
- ### Key Components
124
 
125
- - **LeaderboardProcessor**: Handles all data operations, validation, and persistence
126
- - **SubmissionHandler**: Manages model submissions with IP tracking and validation
127
- - **Display Utils**: Provides filtering, formatting, and table generation
128
- - **Dark Theme**: Custom CSS for modern, accessible interface
129
 
130
- ## 🎨 Features Inspired by CodeReviewBench
131
 
132
- ### ✅ Implemented Features
133
 
 
134
  - **Multi-tab Interface**: Organized navigation with dedicated sections
135
  - **Advanced Filtering**: Real-time filtering by multiple criteria
136
  - **Dark Theme**: Modern, GitHub-inspired dark interface
 
1
  ---
2
+ title: CircleGuardBench
3
+ emoji:
4
+ colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: true
10
+ short_description: First benchmark testing LLM guards on safety and accuracy.
11
+ models:
12
+ - AtlaAI/Selene-1-Mini-Llama-3.1-8B
13
+ - google/gemma-3-12b-it
14
+ - google/gemma-3-4b-it
15
+ - meta-llama/Llama-3.1-8B-Instruct
16
+ - meta-llama/Llama-3.2-3B-Instruct
17
+ - meta-llama/Llama-4-Maverick-17B-128E-Instruct
18
+ - meta-llama/Llama-4-Scout-17B-16E-Instruct
19
+ - meta-llama/Llama-Guard-3-1B
20
+ - meta-llama/Llama-Guard-3-8B
21
+ - meta-llama/Llama-Guard-4-12B
22
+ - mistralai/Ministral-8B-Instruct-2410
23
+ - mistralai/Mistral-Small-3.1-24B-Instruct-2503
24
+ - Qwen/Qwen2.5-7B-Instruct
25
+ - Qwen/Qwen3-0.6B
26
+ - Qwen/Qwen3-1.7B
27
+ - Qwen/Qwen3-4B
28
+ - Qwen/Qwen3-8B
29
+
30
  ---
31
 
32
+ # CodeReview Bench Leaderboard
33
 
34
+ <<<<<<< HEAD
35
  A comprehensive benchmark and leaderboard for code review generation models, inspired by [CodeReviewBench](https://huggingface.co/spaces/your-org/CodeReviewBench).
36
+ =======
37
+ A comprehensive leaderboard for evaluating automated code review systems across programming languages and review quality dimensions.
38
+ >>>>>>> f990f507d1e99e7867021841fa223fe6ca8f653b
39
+
40
+ ## Features
41
+
42
+ - **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
43
+ - **Dual Language Comments**: Supports both Russian and English comment languages
44
+ - **Comprehensive Metrics**:
45
+ - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
46
+ - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
47
+ - **Interactive Visualization**: Compare model performance across categories with radar plots
48
+ - **Easy Submission**: Submit your model results via web interface
49
+
50
+ ## Metrics
51
+
52
+ ### LLM-based Multimetric
53
+
54
+ - **Readability**: How easy the review is to understand
55
+ - **Relevance**: How relevant the review is to the code
56
+ - **Explanation Clarity**: How clear the explanations are
57
+ - **Problem Identification**: How well problems are identified
58
+ - **Actionability**: How actionable the suggestions are
59
+ - **Completeness**: How complete the review is
60
+ - **Specificity**: How specific the feedback is
61
+ - **Contextual Adequacy**: How well the review fits the context
62
+ - **Consistency**: How consistent the review style is
63
+ - **Brevity**: How concise the review is
64
+
65
+ ### Exact-Match Metrics
66
+
67
+ - **Pass@1**: Percentage of correct reviews on first attempt
68
+ - **Pass@5**: Percentage of correct reviews in top 5 attempts
69
+ - **Pass@10**: Percentage of correct reviews in top 10 attempts
70
+ - **BLEU@10**: BLEU score for top 10 review candidates
71
+
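Taken at face value, the pass@k numbers above are "at least one correct review in the first k attempts" rates; a minimal computation sketch (hypothetical helper, not code from this repository) is:

```python
# Hypothetical sketch: pass@k as the fraction of examples with at least one
# correct review among the first k attempts, per the definitions above.
from typing import List

def pass_at_k(attempt_results: List[List[bool]], k: int) -> float:
    if not attempt_results:
        return 0.0
    hits = sum(1 for attempts in attempt_results if any(attempts[:k]))
    return hits / len(attempt_results)

# Three examples, up to 10 attempts each
results = [
    [False, True] + [False] * 8,  # solved on the 2nd attempt
    [True] + [False] * 9,         # solved on the 1st attempt
    [False] * 10,                 # never solved
]
print(pass_at_k(results, 1))   # ~0.333
print(pass_at_k(results, 5))   # ~0.667
print(pass_at_k(results, 10))  # ~0.667
```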
72
+ ## Programming Languages Supported
73
+
74
+ - Python
75
+ - JavaScript
76
+ - Java
77
+ - C++
78
+ - C#
79
+ - TypeScript
80
+ - Go
81
+ - Rust
82
+ - Swift
83
+ - Kotlin
84
+ - Ruby
85
+ - PHP
86
+ - C
87
+ - Scala
88
+ - R
89
+ - Dart
90
+ - Other
91
+
92
+ ## Comment Languages
93
+
94
+ - Russian (ru)
95
+ - English (en)
96
+
97
+ ## Example Categories
98
+
99
+ - Bug Fix
100
+ - Code Style
101
+ - Performance
102
+ - Security
103
+ - Refactoring
104
+ - Documentation
105
+ - Testing
106
+ - Architecture
107
+ - Other
108
+
109
+ ## Installation
110
 
111
  ```bash
112
  pip install -r requirements.txt
113
  ```
114
 
115
+ ## Usage
116
 
117
  ```bash
118
  python app.py
119
  ```
120
 
121
+ ## Submission Format
122
+
123
+ Submit your results as a JSONL file where each line contains:
124
+
125
+ ```json
126
+ {
127
+ "model_name": "your-model-name",
128
+ "programming_language": "python",
129
+ "comment_language": "en",
130
+ "readability": 8.5,
131
+ "relevance": 9.0,
132
+ "explanation_clarity": 7.8,
133
+ "problem_identification": 8.2,
134
+ "actionability": 8.7,
135
+ "completeness": 8.0,
136
+ "specificity": 7.5,
137
+ "contextual_adequacy": 8.3,
138
+ "consistency": 8.8,
139
+ "brevity": 7.2,
140
+ "pass_at_1": 0.75,
141
+ "pass_at_5": 0.88,
142
+ "pass_at_10": 0.92,
143
+ "bleu_at_10": 0.65,
144
+ "total_evaluations": 100
145
+ }
146
  ```
147
 
148
+ ## Environment Variables
149
 
150
+ Set the following environment variables:
151
 
 
152
 
153
+ ## Citation
154
 
155
+ <<<<<<< HEAD
156
  - **Multi-tab Interface**: Organized navigation with dedicated sections
157
  - **Advanced Filtering**: Real-time filtering by multiple criteria
158
  - **Dark Theme**: Modern, GitHub-inspired dark interface
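The Submission Format section in the new README expects one JSON object per line; a minimal sketch of writing such a JSONL file (illustrative values; the output filename is an assumption):

```python
# Hypothetical sketch: serialize per-model results into the JSONL submission
# format documented in the README (one JSON object per line).
import json

rows = [
    {
        "model_name": "your-model-name",
        "programming_language": "python",
        "comment_language": "en",
        "readability": 8.5,
        "relevance": 9.0,
        "pass_at_1": 0.75,
        "pass_at_5": 0.88,
        "pass_at_10": 0.92,
        "bleu_at_10": 0.65,
        "total_evaluations": 100,
    },
]

with open("submission.jsonl", "w", encoding="utf-8") as handle:  # filename is an assumption
    for row in rows:
        handle.write(json.dumps(row) + "\n")
```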
app.py CHANGED
@@ -3,363 +3,1114 @@ CodeReview Leaderboard - Inspired by CodeReviewBench
3
  A comprehensive leaderboard for code review generation models
4
  """
5
6
  import gradio as gr
7
- from typing import List, Dict, Any
8
- from datetime import datetime, timezone
9
 
10
- # Import our modules
11
- from src.envs import (
12
- PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES,
13
- MAIN_HEADERS, QUALITY_HEADERS
14
  )
15
- from src.about import TITLE, INTRODUCTION_TEXT
16
- from src.display.css_html_js import DARK_THEME_CSS, CUSTOM_JS, HEADER_HTML, FOOTER_HTML
17
  from src.display.utils import (
18
- get_main_leaderboard_data, get_quality_metrics_data,
19
- get_submission_history_data, get_statistics_summary
20
  )
21
- from src.leaderboard.processor import LeaderboardProcessor
22
- from src.submission.submit import SubmissionHandler
23
-
24
- # Initialize processors
25
- processor = LeaderboardProcessor()
26
- submission_handler = SubmissionHandler()
27
-
28
- # Global state
29
- current_filters = {
30
- "programming_language": "All",
31
- "comment_language": "All",
32
- "taxonomy_category": "All"
33
- }
34
 
35
- def update_leaderboard_tables(
36
- programming_language: str = "All",
37
- comment_language: str = "All",
38
- taxonomy_category: str = "All"
39
- ):
40
- """Update leaderboard tables with filters"""
41
- global current_filters
42
- current_filters = {
43
- "programming_language": programming_language,
44
- "comment_language": comment_language,
45
- "taxonomy_category": taxonomy_category
46
  }
47
-
48
- # Load current data
49
- data = processor.load_leaderboard_data()
50
-
51
- # Get filtered tables
52
- main_table = get_main_leaderboard_data(
53
- data, programming_language, comment_language, taxonomy_category
54
  )
55
-
56
- quality_table = get_quality_metrics_data(
57
- data, programming_language, comment_language, taxonomy_category
58
  )
59
-
60
- # Get statistics
61
- stats = get_statistics_summary(data)
62
-
63
- # Format statistics display
64
- stats_text = f"""
65
- ## 📊 Current Statistics
66
- - **Total Models**: {stats['total_models']}
67
- - **Total Submissions**: {stats['total_submissions']}
68
- - **Average Pass@1**: {stats['avg_pass_1']:.3f}
69
- - **Best Model**: {stats['best_model']}
70
- - **Languages Covered**: {stats['languages_covered']}
71
- - **Categories Covered**: {stats['categories_covered']}
72
  """
73
-
74
- return main_table, quality_table, stats_text
75
-
76
- def refresh_data():
77
- """Refresh all data from storage"""
78
- return update_leaderboard_tables(
79
- current_filters["programming_language"],
80
- current_filters["comment_language"],
81
- current_filters["taxonomy_category"]
82
- )
83
 
84
- def handle_submission(
85
- request: gr.Request,
86
- *args
87
  ):
88
- """Handle model submission"""
89
- # Get current data
90
- current_data = processor.load_leaderboard_data()
91
-
92
- # Call submission handler
93
- result = submission_handler.submit_model(request, current_data, *args)
94
-
95
- # If submission was successful, refresh tables
96
- if result[0] != current_data: # Data was updated
97
- main_table, quality_table, stats_text = update_leaderboard_tables(
98
- current_filters["programming_language"],
99
- current_filters["comment_language"],
100
- current_filters["taxonomy_category"]
101
  )
102
- return result[0], main_table, quality_table, result[3], stats_text
103
  else:
104
- return result[0], result[1], result[2], result[3], None
105
-
106
- # Create the Gradio interface
107
- with gr.Blocks(
108
- theme=gr.themes.Base(),
109
- css=DARK_THEME_CSS,
110
- js=CUSTOM_JS,
111
- title=TITLE,
112
- head="<meta name='viewport' content='width=device-width, initial-scale=1'>"
113
- ) as demo:
114
-
115
- # Header
116
- gr.HTML(HEADER_HTML)
117
-
118
- # State to store leaderboard data
119
- leaderboard_state = gr.State(value=processor.load_leaderboard_data())
120
-
121
- # Main content tabs
122
- with gr.Tabs():
123
-
124
- # Leaderboard Tab
125
- with gr.Tab("🏆 Leaderboard"):
126
-
127
- # Filters
128
- with gr.Row():
129
- prog_lang_filter = gr.Dropdown(
130
- choices=PROGRAMMING_LANGUAGES,
131
- value="All",
132
- label="🔍 Programming Language",
133
- info="Filter by programming language"
134
- )
135
- comment_lang_filter = gr.Dropdown(
136
- choices=COMMENT_LANGUAGES,
137
- value="All",
138
- label="🌍 Comment Language",
139
- info="Filter by comment language"
140
- )
141
- taxonomy_filter = gr.Dropdown(
142
- choices=TAXONOMY_CATEGORIES,
143
- value="All",
144
- label="🏷️ Taxonomy Category",
145
- info="Filter by review category"
146
  )
147
- refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
148
-
149
- # Statistics
150
- stats_display = gr.Markdown("")
151
-
152
- # Main leaderboard table
153
- with gr.Row():
154
- main_leaderboard = gr.Dataframe(
155
- headers=MAIN_HEADERS,
156
- label="🏅 Main Leaderboard",
157
- interactive=False,
158
- wrap=True,
159
- max_height=600
160
  )
161
-
162
- # Quality metrics table
163
- with gr.Row():
164
- quality_metrics = gr.Dataframe(
165
- headers=QUALITY_HEADERS,
166
- label="📊 Quality Metrics",
167
- interactive=False,
168
- wrap=True,
169
- max_height=600
170
  )
171
-
172
- # Submission Tab
173
- with gr.Tab("📝 Submit Model"):
174
-
175
- # Create submission form
176
- form_components = submission_handler.get_submission_form_components()
177
-
178
- # Connect submission handler
179
- form_components["submit_btn"].click(
180
- fn=handle_submission,
181
- inputs=[
182
- leaderboard_state,
183
- form_components["model_name"],
184
- form_components["programming_language"],
185
- form_components["comment_language"],
186
- form_components["taxonomy_category"],
187
- form_components["bleu"],
188
- form_components["pass1"],
189
- form_components["pass5"],
190
- form_components["pass10"],
191
- form_components["readability"],
192
- form_components["relevance"],
193
- form_components["explanation_clarity"],
194
- form_components["problem_identification"],
195
- form_components["actionability"],
196
- form_components["completeness"],
197
- form_components["specificity"],
198
- form_components["contextual_adequacy"],
199
- form_components["consistency"],
200
- form_components["brevity"],
201
- ],
202
- outputs=[
203
- leaderboard_state,
204
- main_leaderboard,
205
- quality_metrics,
206
- form_components["status_msg"],
207
- stats_display
208
- ]
209
- )
210
-
211
- # Analytics Tab
212
- with gr.Tab("📈 Analytics"):
213
-
214
- with gr.Row():
215
- analytics_prog_lang = gr.Dropdown(
216
- choices=PROGRAMMING_LANGUAGES,
217
- value="All",
218
- label="Programming Language"
219
  )
220
- analytics_comment_lang = gr.Dropdown(
221
- choices=COMMENT_LANGUAGES,
222
- value="All",
223
- label="Comment Language"
224
  )
225
- analytics_taxonomy = gr.Dropdown(
226
- choices=TAXONOMY_CATEGORIES,
227
- value="All",
228
- label="Taxonomy Category"
229
  )
230
-
231
- # Submission history
232
- submission_history = gr.Dataframe(
233
- headers=["Model", "Programming Language", "Comment Language", "Taxonomy", "Pass@1", "Date", "IP"],
234
- label="📋 Recent Submissions",
235
- interactive=False,
236
- max_height=400
237
- )
238
-
239
- # Language performance analysis
240
- with gr.Row():
241
- with gr.Column():
242
- gr.Markdown("### 🗣️ Language Performance Analysis")
243
- language_analysis = gr.Dataframe(
244
- headers=["Language", "Avg Pass@1", "Model Count", "Best Model"],
245
- label="Programming Language Performance",
246
- interactive=False
247
- )
248
-
249
- with gr.Column():
250
- gr.Markdown("### 🏷️ Category Performance Analysis")
251
- category_analysis = gr.Dataframe(
252
- headers=["Category", "Avg Pass@1", "Model Count", "Best Model"],
253
- label="Taxonomy Category Performance",
254
- interactive=False
255
  )
256
-
257
- # About Tab
258
- with gr.Tab("ℹ️ About"):
259
- gr.Markdown(INTRODUCTION_TEXT)
260
-
261
- # Export functionality
262
- with gr.Row():
263
- export_format = gr.Dropdown(
264
- choices=["JSON", "CSV"],
265
- value="JSON",
266
- label="Export Format"
267
  )
268
- export_btn = gr.Button("📥 Export Data")
269
-
270
- export_output = gr.Textbox(
271
- label="Export Output",
272
- lines=10,
273
- max_lines=20,
274
- show_copy_button=True
275
- )
276
-
277
- # Footer
278
- gr.HTML(FOOTER_HTML)
279
-
280
- # Initialize with data
281
- initial_main, initial_quality, initial_stats = update_leaderboard_tables()
282
-
283
- # Update tables when filters change
284
- filter_inputs = [prog_lang_filter, comment_lang_filter, taxonomy_filter]
285
- filter_outputs = [main_leaderboard, quality_metrics, stats_display]
286
-
287
- for filter_input in filter_inputs:
288
- filter_input.change(
289
- fn=update_leaderboard_tables,
290
- inputs=filter_inputs,
291
- outputs=filter_outputs
292
- )
293
-
294
- # Refresh button
295
- refresh_btn.click(
296
- fn=refresh_data,
297
- outputs=filter_outputs
298
- )
299
-
300
- # Analytics updates
301
- analytics_inputs = [analytics_prog_lang, analytics_comment_lang, analytics_taxonomy]
302
-
303
- def update_analytics(prog_lang, comment_lang, taxonomy):
304
- """Update analytics tables"""
305
- data = processor.load_leaderboard_data()
306
-
307
- # Get submission history
308
- history = get_submission_history_data(data, prog_lang, comment_lang, taxonomy)
309
-
310
- # Get language performance
311
- lang_perf = []
312
- for lang in PROGRAMMING_LANGUAGES[1:]:
313
- lang_data = [d for d in data if d.get("programming_language") == lang]
314
- if lang_data:
315
- avg_score = sum(d.get("llm_pass_1", 0) for d in lang_data) / len(lang_data)
316
- best_model = max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
317
- lang_perf.append([lang, f"{avg_score:.3f}", len(lang_data), best_model])
318
-
319
- # Get category performance
320
- cat_perf = []
321
- for cat in TAXONOMY_CATEGORIES[1:]:
322
- cat_data = [d for d in data if d.get("taxonomy_category") == cat]
323
- if cat_data:
324
- avg_score = sum(d.get("llm_pass_1", 0) for d in cat_data) / len(cat_data)
325
- best_model = max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
326
- cat_perf.append([cat, f"{avg_score:.3f}", len(cat_data), best_model])
327
-
328
- return history, lang_perf, cat_perf
329
-
330
- for analytics_input in analytics_inputs:
331
- analytics_input.change(
332
- fn=update_analytics,
333
- inputs=analytics_inputs,
334
- outputs=[submission_history, language_analysis, category_analysis]
335
- )
336
-
337
- # Export functionality
338
- def export_data(format_type):
339
- """Export leaderboard data"""
340
- return processor.export_data(format_type.lower())
341
-
342
- export_btn.click(
343
- fn=export_data,
344
- inputs=[export_format],
345
- outputs=[export_output]
346
- )
347
-
348
- # Set initial values
349
- demo.load(
350
- fn=lambda: (initial_main, initial_quality, initial_stats),
351
- outputs=[main_leaderboard, quality_metrics, stats_display]
352
- )
353
 
354
- # Launch configuration
355
- if __name__ == "__main__":
356
- demo.queue(max_size=20).launch(
357
- server_name="0.0.0.0",
358
- server_port=7860,
359
- share=False,
360
- show_error=True,
361
- debug=True
362
  )
363
 
364
- # For deployment (HuggingFace Spaces, etc.)
365
- app = demo
3
  A comprehensive leaderboard for code review generation models
4
  """
5
 
6
+ import os
7
+ import json
8
+ import tempfile
9
+ import logging
10
  import gradio as gr
11
+ import pandas as pd
12
+ import plotly.express as px
13
+ import plotly.graph_objects as go
14
+ from apscheduler.schedulers.background import BackgroundScheduler
15
+ import numpy as np
16
+ from gradio.themes.utils import fonts, colors
17
+ from dataclasses import fields, dataclass
18
 
19
+ from src.about import (
20
+ CITATION_BUTTON_LABEL,
21
+ CITATION_BUTTON_TEXT,
22
+ EVALUATION_QUEUE_TEXT,
23
+ INTRODUCTION_TEXT,
24
+ LLM_BENCHMARKS_TEXT,
25
+ TITLE,
26
  )
27
+ from src.display.css_html_js import custom_css
 
28
  from src.display.utils import (
29
+ CODEREVIEW_COLUMN,
30
+ DISPLAY_COLS,
31
+ METRIC_COLS,
32
+ HIDDEN_COLS,
33
+ NEVER_HIDDEN_COLS,
34
+ CATEGORIES,
35
+ COMMENT_LANGUAGES,
36
+ EXAMPLE_CATEGORIES,
37
+ TOPICS,
38
+ ModelType,
39
+ Mode,
40
+ Precision,
41
+ WeightType,
42
+ ReviewModelType,
43
+ get_all_column_choices,
44
+ get_default_visible_columns,
45
  )
46
+ from src.display.formatting import styled_message, styled_error, styled_warning
47
+ from src.envs import (
48
+ ADMIN_USERNAME,
49
+ ADMIN_PASSWORD,
50
+ RESULTS_DATASET_ID,
51
+ SUBMITTER_TOKEN,
52
+ TOKEN,
53
+ DATA_PATH,
54
+ )
55
+ from src.populate import get_leaderboard_df, get_category_leaderboard_df
56
+ from src.submission.submit import process_submission
 
 
57
 
58
+ # Configure logging
59
+ logging.basicConfig(
60
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
61
+ )
62
+ logger = logging.getLogger(__name__)
63
+
64
+ # Ensure data directory exists
65
+ os.makedirs(DATA_PATH, exist_ok=True)
66
+
67
+ # Available benchmark versions
68
+ BENCHMARK_VERSIONS = ["v0"]
69
+ CURRENT_VERSION = "v0"
70
+
71
+ # Initialize leaderboard data
72
+ try:
73
+ logger.info("Initializing leaderboard data...")
74
+ LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
75
+ logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
76
+ except Exception as e:
77
+ logger.error(f"Error loading leaderboard data: {e}")
78
+ LEADERBOARD_DF = pd.DataFrame()
79
+
80
+ custom_theme = gr.themes.Default(
81
+ primary_hue=colors.slate,
82
+ secondary_hue=colors.slate,
83
+ neutral_hue=colors.neutral,
84
+ font=(fonts.GoogleFont("Inter"), "sans-serif"),
85
+ ).set(
86
+ # font_size="16px",
87
+ body_background_fill="#0f0f10",
88
+ body_background_fill_dark="#0f0f10",
89
+ body_text_color="#f4f4f5",
90
+ body_text_color_subdued="#a1a1aa",
91
+ block_background_fill="#1e1e1e", # Cooler Grey
92
+ block_border_color="#333333", # Cooler Grey
93
+ block_shadow="none",
94
+ # Swapped primary and secondary button styles
95
+ button_primary_background_fill="#121212", # Changed to specific color for Refresh button
96
+ button_primary_text_color="#f4f4f5",
97
+ button_primary_border_color="#333333", # Keep border grey or change to #121212?
98
+ button_secondary_background_fill="#f4f4f5",
99
+ button_secondary_text_color="#0f0f10",
100
+ button_secondary_border_color="#f4f4f5",
101
+ input_background_fill="#1e1e1e", # Cooler Grey
102
+ input_border_color="#333333", # Cooler Grey
103
+ input_placeholder_color="#71717a",
104
+ table_border_color="#333333", # Cooler Grey
105
+ table_even_background_fill="#2d2d2d", # Cooler Grey (Slightly lighter)
106
+ table_odd_background_fill="#1e1e1e", # Cooler Grey
107
+ table_text_color="#f4f4f5",
108
+ link_text_color="#ffffff",
109
+ border_color_primary="#333333", # Cooler Grey
110
+ background_fill_secondary="#333333", # Cooler Grey
111
+ color_accent="#f4f4f5",
112
+ border_color_accent="#333333", # Cooler Grey
113
+ button_primary_background_fill_hover="#424242", # Cooler Grey
114
+ block_title_text_color="#f4f4f5",
115
+ accordion_text_color="#f4f4f5",
116
+ panel_background_fill="#1e1e1e", # Cooler Grey
117
+ panel_border_color="#333333", # Cooler Grey
118
+ # Explicitly setting primary/secondary/accent colors/borders
119
+ background_fill_primary="#0f0f10",
120
+ background_fill_primary_dark="#0f0f10",
121
+ background_fill_secondary_dark="#333333", # Cooler Grey
122
+ border_color_primary_dark="#333333", # Cooler Grey
123
+ border_color_accent_dark="#333333", # Cooler Grey
124
+ border_color_accent_subdued="#424242", # Cooler Grey
125
+ border_color_accent_subdued_dark="#424242", # Cooler Grey
126
+ color_accent_soft="#a1a1aa",
127
+ color_accent_soft_dark="#a1a1aa",
128
+ # Explicitly setting input hover/focus states
129
+ input_background_fill_dark="#1e1e1e", # Cooler Grey
130
+ input_background_fill_focus="#424242", # Cooler Grey
131
+ input_background_fill_focus_dark="#424242", # Cooler Grey
132
+ input_background_fill_hover="#2d2d2d", # Cooler Grey
133
+ input_background_fill_hover_dark="#2d2d2d", # Cooler Grey
134
+ input_border_color_dark="#333333", # Cooler Grey
135
+ input_border_color_focus="#f4f4f5",
136
+ input_border_color_focus_dark="#f4f4f5",
137
+ input_border_color_hover="#424242", # Cooler Grey
138
+ input_border_color_hover_dark="#424242", # Cooler Grey
139
+ input_placeholder_color_dark="#71717a",
140
+ # Explicitly set dark variants for table backgrounds
141
+ table_even_background_fill_dark="#2d2d2d", # Cooler Grey
142
+ table_odd_background_fill_dark="#1e1e1e", # Cooler Grey
143
+ # Explicitly set dark text variants
144
+ body_text_color_dark="#f4f4f5",
145
+ body_text_color_subdued_dark="#a1a1aa",
146
+ block_title_text_color_dark="#f4f4f5",
147
+ accordion_text_color_dark="#f4f4f5",
148
+ table_text_color_dark="#f4f4f5",
149
+ # Explicitly set dark panel/block variants
150
+ panel_background_fill_dark="#1e1e1e", # Cooler Grey
151
+ panel_border_color_dark="#333333", # Cooler Grey
152
+ block_background_fill_dark="#1e1e1e", # Cooler Grey
153
+ block_border_color_dark="#333333", # Cooler Grey
154
+ )
155
+
156
+
157
+ @dataclass
158
+ class ColumnInfo:
159
+ """Information about a column in the leaderboard."""
160
+
161
+ name: str
162
+ display_name: str
163
+ type: str = "text"
164
+ hidden: bool = False
165
+ never_hidden: bool = False
166
+ displayed_by_default: bool = True
167
+
168
+
169
+ def update_column_choices(df):
170
+ """Update column choices based on what's actually in the dataframe"""
171
+ if df is None or df.empty:
172
+ return get_all_column_choices()
173
+
174
+ # Get columns that actually exist in the dataframe
175
+ existing_columns = list(df.columns)
176
+
177
+ # Get all possible columns with their display names
178
+ all_columns = get_all_column_choices()
179
+
180
+ # Filter to only include columns that exist in the dataframe
181
+ valid_columns = [
182
+ (col_name, display_name)
183
+ for col_name, display_name in all_columns
184
+ if col_name in existing_columns
185
+ ]
186
+
187
+ # Return default if there are no valid columns
188
+ if not valid_columns:
189
+ return get_all_column_choices()
190
+
191
+ return valid_columns
192
+
193
+
194
+ # Update the column_selector initialization
195
+ def get_initial_columns():
196
+ """Get initial columns to show in the dropdown"""
197
+ try:
198
+ # Get available columns in the main dataframe
199
+ available_cols = list(LEADERBOARD_DF.columns)
200
+ logger.info(f"Available columns in LEADERBOARD_DF: {available_cols}")
201
+
202
+ # If dataframe is empty, use default visible columns
203
+ if not available_cols:
204
+ return get_default_visible_columns()
205
+
206
+ # Get default visible columns that actually exist in the dataframe
207
+ valid_defaults = [
208
+ col for col in get_default_visible_columns() if col in available_cols
209
+ ]
210
+
211
+ # If none of the defaults exist, return all available columns
212
+ if not valid_defaults:
213
+ return available_cols
214
+
215
+ return valid_defaults
216
+ except Exception as e:
217
+ logger.error(f"Error getting initial columns: {e}")
218
+ return get_default_visible_columns()
219
+
220
+
221
+ def init_leaderboard(dataframe, visible_columns=None):
222
+ """
223
+ Initialize a standard Gradio Dataframe component for the leaderboard.
224
+ """
225
+ if dataframe is None or dataframe.empty:
226
+ # Create an empty dataframe with the right columns
227
+ columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
228
+ dataframe = pd.DataFrame(columns=columns)
229
+ logger.warning("Initializing empty leaderboard")
230
+
231
+ # Lowercase model_name for display
232
+ if "model_name" in dataframe.columns:
233
+ dataframe = dataframe.copy()
234
+ dataframe["model_name"] = dataframe["model_name"].str.lower()
235
+
236
+ if "model_type" in dataframe.columns:
237
+ dataframe = dataframe.copy()
238
+ dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
239
+
240
+ if "review_model_type" in dataframe.columns:
241
+ dataframe = dataframe.copy()
242
+ dataframe["review_model_type"] = dataframe["review_model_type"].str.replace("custom", "custom")
243
+
244
+ # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
245
+
246
+ # Determine which columns to display
247
+ display_column_names = [
248
+ getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS
249
+ ]
250
+ hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
251
+
252
+ # Columns that should always be shown
253
+ always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
254
+
255
+ # Use provided visible columns if specified, otherwise use default
256
+ if visible_columns is None:
257
+ # Determine which columns to show initially
258
+ visible_columns = [
259
+ col for col in display_column_names if col not in hidden_column_names
260
+ ]
261
+
262
+ # Always include the never-hidden columns
263
+ for col in always_visible:
264
+ if col not in visible_columns and col in dataframe.columns:
265
+ visible_columns.append(col)
266
+
267
+ # Make sure we only include columns that actually exist in the dataframe
268
+ visible_columns = [col for col in visible_columns if col in dataframe.columns]
269
+
270
+ # Map GuardBench column types to Gradio's expected datatype strings
271
+ # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
272
+ type_mapping = {
273
+ "text": "str",
274
+ "number": "number",
275
+ "bool": "bool",
276
+ "date": "date",
277
+ "markdown": "markdown",
278
+ "html": "html",
279
+ "image": "image",
280
+ }
281
+
282
+ # Create a list of datatypes in the format Gradio expects
283
+ datatypes = []
284
+ for col in visible_columns:
285
+ # Find the corresponding CODEREVIEW_COLUMN entry
286
+ col_type = None
287
+ for display_col in DISPLAY_COLS:
288
+ if getattr(CODEREVIEW_COLUMN, display_col).name == col:
289
+ orig_type = getattr(CODEREVIEW_COLUMN, display_col).type
290
+ # Map to Gradio's expected types
291
+ col_type = type_mapping.get(orig_type, "str")
292
+ break
293
+
294
+ # Default to 'str' if type not found or not mappable
295
+ if col_type is None:
296
+ col_type = "str"
297
+
298
+ datatypes.append(col_type)
299
+
300
+ # Create a dummy column for search functionality if it doesn't exist
301
+ if "search_dummy" not in dataframe.columns:
302
+ dataframe["search_dummy"] = dataframe.apply(
303
+ lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
304
+ axis=1,
305
+ )
306
+
307
+ # Select only the visible columns for display
308
+ visible_columns.remove("model_name")
309
+
310
+ visible_columns = ["model_name"] + visible_columns
311
+ display_df = dataframe[visible_columns].copy()
312
+
313
+ # print(f"--- DataFrame inside init_leaderboard (before rounding) ---")
314
+ # print(display_df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head() if all(c in display_df.columns for c in ['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']) else "Relevant columns not present")
315
+ # print(f"-------------------------------------------------------------")
316
+
317
+ # Round numeric columns to 3 decimal places for display
318
+ numeric_cols = display_df.select_dtypes(include=np.number).columns
319
+ for col in numeric_cols:
320
+ # Avoid rounding integer columns like counts
321
+ if not pd.api.types.is_integer_dtype(display_df[col]):
322
+ # Format floats to exactly 3 decimal places, preserving trailing zeros
323
+ display_df[col] = display_df[col].apply(
324
+ lambda x: f"{x:.3f}" if pd.notna(x) else None
325
+ )
326
+
327
+ column_info_map = {
328
+ f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
329
+ }
330
+ column_mapping = {
331
+ col: column_info_map.get(col, ColumnInfo(col, col)).display_name
332
+ for col in visible_columns
333
  }
334
+
335
+ # Rename columns in the DataFrame
336
+ display_df.rename(columns=column_mapping, inplace=True)
337
+
338
+ # Apply styling - note: styling might need adjustment if it relies on column names
339
+ styler = display_df.style.set_properties(**{"text-align": "right"}).set_properties(
340
+ subset=["Model"], **{"width": "200px"}
341
  )
342
+
343
+ return gr.Dataframe(
344
+ value=styler,
345
+ datatype=datatypes,
346
+ interactive=False,
347
+ wrap=True,
348
+ height=2500,
349
+ elem_id="leaderboard-table",
350
+ row_count=len(display_df),
351
  )
352
+
353
+
354
+ def search_filter_leaderboard(
355
+ df, search_query="", comment_languages=None, version=CURRENT_VERSION
356
+ ):
357
  """
358
+ Filter the leaderboard based on search query and comment languages.
359
+ """
360
+ if df is None or df.empty:
361
+ return df
362
+
363
+ filtered_df = df.copy()
364
+
365
+ # Add search dummy column if it doesn't exist
366
+ if "search_dummy" not in filtered_df.columns:
367
+ filtered_df["search_dummy"] = filtered_df.apply(
368
+ lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
369
+ axis=1,
370
+ )
371
+
372
+ # Apply comment language filter (assuming there's a comment_language column in the data)
373
+ if comment_languages and len(comment_languages) > 0:
374
+ # Look for a comment language column in the dataframe
375
+ comment_lang_cols = [col for col in filtered_df.columns if 'comment_language' in col.lower()]
376
+ if comment_lang_cols:
377
+ filtered_df = filtered_df[
378
+ filtered_df[comment_lang_cols[0]].isin(comment_languages)
379
+ ]
380
+
381
+ # Apply search query
382
+ if search_query:
383
+ search_terms = [
384
+ term.strip() for term in search_query.split(";") if term.strip()
385
+ ]
386
+ if search_terms:
387
+ combined_mask = None
388
+ for term in search_terms:
389
+ mask = filtered_df["search_dummy"].str.contains(
390
+ term, case=False, na=False
391
+ )
392
+ if combined_mask is None:
393
+ combined_mask = mask
394
+ else:
395
+ combined_mask = combined_mask | mask
396
+
397
+ if combined_mask is not None:
398
+ filtered_df = filtered_df[combined_mask]
399
+
400
+ # Drop the search dummy column before returning
401
+ visible_columns = [col for col in filtered_df.columns if col != "search_dummy"]
402
+ return filtered_df[visible_columns]
403
 
404
+
405
+ def refresh_data_with_filters(
406
+ version=CURRENT_VERSION, search_query="", comment_languages=None, selected_columns=None
407
+ ):
408
+ """
409
+ Refresh the leaderboard data and update all components with filtering.
410
+ Ensures we handle cases where dataframes might have limited columns.
411
+ """
412
+ global LEADERBOARD_DF
413
+ try:
414
+ logger.info(f"Performing refresh of leaderboard data with filters...")
415
+ # Get new data
416
+ main_df = get_leaderboard_df(version=version)
417
+ LEADERBOARD_DF = main_df
418
+ category_dfs = [
419
+ get_category_leaderboard_df(category, version=version)
420
+ for category in CATEGORIES
421
+ ]
422
+ selected_columns = [
423
+ x.lower()
424
+ .replace(" ", "_")
425
+ .replace("(", "")
426
+ .replace(")", "")
427
+ .replace("_recall", "_recall_binary")
428
+ .replace("_precision", "_precision_binary")
429
+ for x in selected_columns
430
+ ]
431
+
432
+ # Log the actual columns we have
433
+ logger.info(f"Main dataframe columns: {list(main_df.columns)}")
434
+
435
+ # Apply filters to each dataframe
436
+ filtered_main_df = search_filter_leaderboard(
437
+ main_df, search_query, comment_languages, version
438
+ )
439
+ filtered_category_dfs = [
440
+ search_filter_leaderboard(df, search_query, comment_languages, version)
441
+ for df in category_dfs
442
+ ]
443
+
444
+ # Get available columns from the dataframe
445
+ available_columns = list(filtered_main_df.columns)
446
+
447
+ # Filter selected columns to only those available in the data
448
+ if selected_columns:
449
+ # Convert display names to internal names first
450
+ internal_selected_columns = [
451
+ x.lower()
452
+ .replace(" ", "_")
453
+ .replace("(", "")
454
+ .replace(")", "")
455
+ .replace("_recall", "_recall_binary")
456
+ .replace("_precision", "_precision_binary")
457
+ for x in selected_columns
458
+ ]
459
+ valid_selected_columns = [
460
+ col for col in internal_selected_columns if col in available_columns
461
+ ]
462
+ if not valid_selected_columns and "model_name" in available_columns:
463
+ # Fallback if conversion/filtering leads to empty selection
464
+ valid_selected_columns = ["model_name"] + [
465
+ col
466
+ for col in get_default_visible_columns()
467
+ if col in available_columns
468
+ ]
469
+ else:
470
+ # If no columns were selected in the dropdown, use default visible columns that exist
471
+ valid_selected_columns = [
472
+ col for col in get_default_visible_columns() if col in available_columns
473
+ ]
474
+
475
+ # Initialize dataframes for display with valid selected columns
476
+ main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns)
477
+
478
+ # For category dataframes, get columns that actually exist in each one
479
+ category_dataframes = []
480
+ for df in filtered_category_dfs:
481
+ df_columns = list(df.columns)
482
+ df_valid_columns = [
483
+ col for col in valid_selected_columns if col in df_columns
484
+ ]
485
+ if not df_valid_columns and "model_name" in df_columns:
486
+ df_valid_columns = ["model_name"] + get_default_visible_columns()
487
+ category_dataframes.append(init_leaderboard(df, df_valid_columns))
488
+
489
+ return main_dataframe, *category_dataframes
490
+
491
+ except Exception as e:
492
+ logger.error(f"Error in refresh with filters: {e}")
493
+ # Return the current leaderboards on error
494
+ return leaderboard, *[
495
+ tab.children[0] for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
496
+ ]
497
+
498
+
499
+ def submit_results(
500
+ model_name: str,
501
+ base_model: str,
502
+ revision: str,
503
+ precision: str,
504
+ weight_type: str,
505
+ model_type: str,
506
+ mode: str,
507
+ submission_file: tempfile._TemporaryFileWrapper,
508
+ version: str,
509
+ review_model_type: ReviewModelType,
510
+ programming_language: str,
511
+ comment_language: str,
512
  ):
513
+ """
514
+ Handle submission of results with model metadata.
515
+ """
516
+ if submission_file is None:
517
+ return styled_error("No submission file provided")
518
+
519
+ if not model_name:
520
+ return styled_error("Model name is required")
521
+
522
+ if not model_type:
523
+ return styled_error("Please select a model type")
524
+
525
+ if not mode:
526
+ return styled_error("Please select an inference mode")
527
+
528
+ file_path = submission_file.name
529
+ logger.info(f"Received submission for model {model_name}: {file_path}")
530
+
531
+ # Add metadata to the submission
532
+ metadata = {
533
+ "model_name": model_name,
534
+ "base_model": base_model,
535
+ "revision": revision if revision else "main",
536
+ "precision": precision,
537
+ "weight_type": weight_type,
538
+ "model_type": model_type,
539
+ "mode": mode,
540
+ "version": version,
541
+ "review_model_type": review_model_type,
542
+ "programming_language": programming_language,
543
+ "comment_language": comment_language,
544
+ }
545
+
546
+ # Process the submission
547
+ result = process_submission(file_path, metadata, version=version)
548
+
549
+ # Refresh the leaderboard data
550
+ global LEADERBOARD_DF
551
+ try:
552
+ logger.info(
553
+ f"Refreshing leaderboard data after submission for version {version}..."
554
  )
555
+ LEADERBOARD_DF = get_leaderboard_df(version=version)
556
+ logger.info("Refreshed leaderboard data after submission")
557
+ except Exception as e:
558
+ logger.error(f"Error refreshing leaderboard data: {e}")
559
+
560
+ return result
561
+
562
+
563
+ def refresh_data(version=CURRENT_VERSION):
564
+ """
565
+ Refresh the leaderboard data and update all components.
566
+ """
567
+ try:
568
+ logger.info(f"Performing scheduled refresh of leaderboard data...")
569
+ # Get new data
570
+ main_df = get_leaderboard_df(version=version)
571
+ category_dfs = [
572
+ get_category_leaderboard_df(category, version=version)
573
+ for category in CATEGORIES
574
+ ]
575
+
576
+ # For gr.Dataframe, we return the actual dataframes
577
+ return main_df, *category_dfs
578
+
579
+ except Exception as e:
580
+ logger.error(f"Error in scheduled refresh: {e}")
581
+ return None, *[None for _ in CATEGORIES]
582
+
583
+
584
+ def update_leaderboards(version):
585
+ """
586
+ Update all leaderboard components with data for the selected version.
587
+ """
588
+ try:
589
+ new_df = get_leaderboard_df(version=version)
590
+ category_dfs = [
591
+ get_category_leaderboard_df(category, version=version)
592
+ for category in CATEGORIES
593
+ ]
594
+ return new_df, *category_dfs
595
+ except Exception as e:
596
+ logger.error(f"Error updating leaderboards for version {version}: {e}")
597
+ return None, *[None for _ in CATEGORIES]
598
+
599
+
600
+ def create_performance_plot(
601
+ selected_models, category, metric="f1_binary", version=CURRENT_VERSION
602
+ ):
603
+ """
604
+ Create a radar plot comparing model performance for selected models.
605
+ """
606
+ if category == "All Results":
607
+ df = get_leaderboard_df(version=version)
608
  else:
609
+ df = get_category_leaderboard_df(category, version=version)
610
+
611
+ if df.empty:
612
+ return go.Figure()
613
+
614
+ # Lowercase model_name in df and selected_models
615
+ df = df.copy()
616
+ df["model_name"] = df["model_name"].str.lower()
617
+ selected_models = [m.lower() for m in selected_models]
618
+ df = df[df["model_name"].isin(selected_models)]
619
+ metric_cols = [col for col in df.columns if metric in col]
620
+ fig = go.Figure()
621
+ colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
622
+ for idx, model in enumerate(selected_models):
623
+ model_data = df[df["model_name"] == model]
624
+ if not model_data.empty:
625
+ values = model_data[metric_cols].values[0].tolist()
626
+ values = values + [values[0]]
627
+ categories = [col.replace(f"_{metric}", "") for col in metric_cols]
628
+ # Replace 'jailbreaked' with 'jailbroken' in categories
629
+ categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
630
+ categories = categories + [categories[0]]
631
+ fig.add_trace(
632
+ go.Scatterpolar(
633
+ r=values,
634
+ theta=categories,
635
+ name=model,
636
+ line_color=colors[idx % len(colors)],
637
+ fill="toself",
638
  )
639
+ )
640
+ fig.update_layout(
641
+ paper_bgcolor="#000000",
642
+ plot_bgcolor="#000000",
643
+ font={"color": "#ffffff"},
644
+ title={
645
+ "text": f"{category} - {metric.upper()} Score Comparison",
646
+ "font": {"color": "#ffffff", "size": 24},
647
+ },
648
+ polar=dict(
649
+ bgcolor="#000000",
650
+ radialaxis=dict(
651
+ visible=True,
652
+ range=[0, 1],
653
+ gridcolor="#333333",
654
+ linecolor="#333333",
655
+ tickfont={"color": "#ffffff"},
656
+ ),
657
+ angularaxis=dict(
658
+ gridcolor="#333333",
659
+ linecolor="#333333",
660
+ tickfont={"color": "#ffffff"},
661
+ ),
662
+ ),
663
+ height=600,
664
+ showlegend=True,
665
+ legend=dict(
666
+ yanchor="top",
667
+ y=0.99,
668
+ xanchor="right",
669
+ x=0.99,
670
+ bgcolor="rgba(0,0,0,0.5)",
671
+ font={"color": "#ffffff"},
672
+ ),
673
+ )
674
+ return fig
675
+
676
+
677
+ def update_model_choices(version):
678
+ """
679
+ Update the list of available models for the given version.
680
+ """
681
+ df = get_leaderboard_df(version=version)
682
+ if df.empty:
683
+ return []
684
+ return sorted(df["model_name"].str.lower().unique().tolist())
685
+
686
+
687
+ def update_visualization(selected_models, selected_category, selected_metric, version):
688
+ """
689
+ Update the visualization based on user selections.
690
+ """
691
+ if not selected_models:
692
+ return go.Figure()
693
+ return create_performance_plot(
694
+ selected_models, selected_category, selected_metric, version
695
+ )
696
+
697
+
698
+ # Create Gradio app
699
+ demo = gr.Blocks(css=custom_css, theme=custom_theme)
700
+
701
+ CATEGORY_DISPLAY_MAP = {
702
+ "Python": "Python",
703
+ "Java": "Java",
704
+ "Scala": "Scala",
705
+ "Go": "Go"
706
+ }
707
+ # Create reverse mapping for lookups
708
+ CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
709
+
710
+ with demo:
711
+ gr.HTML(TITLE)
712
+ # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
713
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
714
+
715
+ with gr.Row():
716
+ tabs = gr.Tabs(elem_classes="tab-buttons")
717
+
718
+ with tabs:
719
+ with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
720
+ with gr.Row():
721
+ version_selector = gr.Dropdown(
722
+ choices=BENCHMARK_VERSIONS,
723
+ label="Benchmark Version",
724
+ value=CURRENT_VERSION,
725
+ interactive=True,
726
+ elem_classes="version-selector",
727
+ scale=1,
728
+ visible=False,
729
+ )
730
+
731
+ with gr.Row():
732
+ search_input = gr.Textbox(
733
+ placeholder="Search by models (use ; to split)",
734
+ label="Search",
735
+ elem_id="search-bar",
736
+ scale=2,
737
+ )
738
+ comment_language_filter = gr.Dropdown(
739
+ choices=["en", "ru"],
740
+ label="Comment Language",
741
+ multiselect=True,
742
+ value=[],
743
+ interactive=True,
744
+ scale=1,
745
+ )
746
+ programming_language_filter = gr.Dropdown(
747
+ choices=["Python", "Java", "Scala", "Go"],
748
+ label="Programming Language",
749
+ multiselect=True,
750
+ value=[],
751
+ interactive=True,
752
+ scale=1,
753
+ )
754
+ with gr.Row():
755
+ topic_filter = gr.Dropdown(
756
+ choices=TOPICS,
757
+ label="Topic",
758
+ multiselect=True,
759
+ value=[],
760
+ interactive=True,
761
+ scale=2,
762
+ )
763
+ column_selector = gr.Dropdown(
764
+ choices=get_all_column_choices(),
765
+ label="Columns",
766
+ multiselect=True,
767
+ value=get_initial_columns(),
768
+ interactive=True,
769
+ visible=False,
770
+ scale=1,
771
+ )
772
+ with gr.Row():
773
+ refresh_button = gr.Button(
774
+ "Refresh", scale=0, elem_id="refresh-button"
775
+ )
776
+
777
+ # Create tabs for each category
778
+ with gr.Tabs(elem_classes="category-tabs") as category_tabs:
779
+ # First tab for average metrics across all categories
780
+ with gr.TabItem("All Results", elem_id="overall-tab"):
781
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
782
+
783
+ # Create a tab for each category using display names
784
+ for category in CATEGORIES:
785
+ display_name = CATEGORY_DISPLAY_MAP.get(category, category)
786
+ elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
787
+ with gr.TabItem(display_name, elem_id=elem_id):
788
+ category_df = get_category_leaderboard_df(
789
+ category, version=CURRENT_VERSION
790
+ )
791
+ category_leaderboard = init_leaderboard(category_df)
792
+
793
+ # Connect search and filter inputs to update function
794
+ def update_with_search_filters(
795
+ version=CURRENT_VERSION,
796
+ search_query="",
797
+ comment_languages=None,
798
+ selected_columns=None,
799
+ ):
800
+ """
801
+ Update the leaderboards with search and filter settings.
802
+ """
803
+ return refresh_data_with_filters(
804
+ version, search_query, comment_languages, selected_columns
805
+ )
806
+
807
+ # Refresh button functionality
808
+ def refresh_and_update(
809
+ version, search_query, comment_languages, selected_columns
810
+ ):
811
+ """
812
+ Refresh data, update LEADERBOARD_DF, and return updated components.
813
+ """
814
+ global LEADERBOARD_DF
815
+ main_df = get_leaderboard_df(version=version)
816
+ LEADERBOARD_DF = main_df # Update the global DataFrame
817
+ return refresh_data_with_filters(
818
+ version, search_query, comment_languages, selected_columns
819
+ )
820
+
821
+ refresh_button.click(
822
+ fn=refresh_and_update,
823
+ inputs=[
824
+ version_selector,
825
+ search_input,
826
+ comment_language_filter,
827
+ column_selector,
828
+ ],
829
+ outputs=[leaderboard]
830
+ + [
831
+ category_tabs.children[i].children[0]
832
+ for i in range(1, len(CATEGORIES) + 1)
833
+ ],
834
  )
835
+ # Search input functionality
836
+ search_input.change(
837
+ fn=refresh_data_with_filters,
838
+ inputs=[
839
+ version_selector,
840
+ search_input,
841
+ comment_language_filter,
842
+ column_selector,
843
+ ],
844
+ outputs=[leaderboard]
845
+ + [
846
+ category_tabs.children[i].children[0]
847
+ for i in range(1, len(CATEGORIES) + 1)
848
+ ],
849
  )
850
+
851
+ # Comment language filter functionality
852
+ comment_language_filter.change(
853
+ fn=refresh_data_with_filters,
854
+ inputs=[
855
+ version_selector,
856
+ search_input,
857
+ comment_language_filter,
858
+ column_selector,
859
+ ],
860
+ outputs=[leaderboard]
861
+ + [
862
+ category_tabs.children[i].children[0]
863
+ for i in range(1, len(CATEGORIES) + 1)
864
+ ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
865
  )
866
+
867
+ # Version selector functionality
868
+ version_selector.change(
869
+ fn=refresh_data_with_filters,
870
+ inputs=[
871
+ version_selector,
872
+ search_input,
873
+ comment_language_filter,
874
+ column_selector,
875
+ ],
876
+ outputs=[leaderboard]
877
+ + [
878
+ category_tabs.children[i].children[0]
879
+ for i in range(1, len(CATEGORIES) + 1)
880
+ ],
881
  )
882
+
883
+ # Update the update_columns function to handle updating all tabs at once
884
+ def update_columns(selected_columns):
885
+ """
886
+ Update all leaderboards to show the selected columns.
887
+ Ensures all selected columns are preserved in the update.
888
+
889
+ """
890
+
891
+ try:
892
+ logger.info(f"Updating columns to show: {selected_columns}")
893
+
894
+ # If no columns are selected, use default visible columns
895
+ if not selected_columns or len(selected_columns) == 0:
896
+ selected_columns = get_default_visible_columns()
897
+ logger.info(
898
+ f"No columns selected, using defaults: {selected_columns}"
899
+ )
900
+
901
+ # Convert display names to internal names
902
+ internal_selected_columns = [
903
+ x.lower()
904
+ .replace(" ", "_")
905
+ .replace("(", "")
906
+ .replace(")", "")
907
+ .replace("_recall", "_recall_binary")
908
+ .replace("_precision", "_precision_binary")
909
+ for x in selected_columns
910
+ ]
911
+
912
+ # Get the current data with ALL columns preserved
913
+ main_df = get_leaderboard_df(version=version_selector.value)
914
+
915
+ # Get category dataframes with ALL columns preserved
916
+ category_dfs = [
917
+ get_category_leaderboard_df(
918
+ category, version=version_selector.value
919
+ )
920
+ for category in CATEGORIES
921
+ ]
922
+
923
+ # Log columns for debugging
924
+ logger.info(f"Main dataframe columns: {list(main_df.columns)}")
925
+ logger.info(
926
+ f"Selected columns (internal): {internal_selected_columns}"
927
+ )
928
+
929
+ # IMPORTANT: Make sure model_name is always included
930
+ if (
931
+ "model_name" in main_df.columns
932
+ and "model_name" not in internal_selected_columns
933
+ ):
934
+ internal_selected_columns = [
935
+ "model_name"
936
+ ] + internal_selected_columns
937
+
938
+ # Initialize the main leaderboard with the selected columns
939
+ # We're passing the internal_selected_columns directly to preserve the selection
940
+ main_leaderboard = init_leaderboard(
941
+ main_df, internal_selected_columns
942
+ )
943
+
944
+ # Initialize category dataframes with the same selected columns
945
+ # This ensures consistency across all tabs
946
+ category_leaderboards = []
947
+ for df in category_dfs:
948
+ # Use the same selected columns for each category
949
+ # init_leaderboard will automatically handle filtering to columns that exist
950
+ category_leaderboards.append(
951
+ init_leaderboard(df, internal_selected_columns)
952
+ )
953
+
954
+ return main_leaderboard, *category_leaderboards
955
+
956
+ except Exception as e:
957
+ logger.error(f"Error updating columns: {e}")
958
+ import traceback
959
+
960
+ logger.error(traceback.format_exc())
961
+ return leaderboard, *[
962
+ tab.children[0]
963
+ for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
964
+ ]
965
+
966
+ # Connect column selector to update function
967
+ column_selector.change(
968
+ fn=update_columns,
969
+ inputs=[column_selector],
970
+ outputs=[leaderboard]
971
+ + [
972
+ category_tabs.children[i].children[0]
973
+ for i in range(1, len(CATEGORIES) + 1)
974
+ ],
975
  )
976
+
977
+ # with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
978
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
979
+
980
+ with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=1):
981
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
982
+
983
+ with gr.Row():
984
+ # with gr.Column(scale=3):
985
+ # gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
986
+ with gr.Column(scale=1):
987
+ # Add version selector specifically for the submission tab
988
+ submission_version_selector = gr.Dropdown(
989
+ choices=BENCHMARK_VERSIONS,
990
+ label="Benchmark Version",
991
+ value=CURRENT_VERSION,
992
+ interactive=True,
993
+ elem_classes="version-selector",
994
+ visible=False,
995
+ )
996
+
997
+ with gr.Row():
998
+ with gr.Column():
999
+ model_name_textbox = gr.Textbox(label="Model name")
1000
+ mode_selector = gr.Dropdown(
1001
+ choices=[m.name for m in Mode],
1002
+ label="Mode",
1003
+ multiselect=False,
1004
+ value=None,
1005
+ interactive=True,
1006
+ )
1007
+ revision_name_textbox = gr.Textbox(
1008
+ label="Revision commit", placeholder="main"
1009
+ )
1010
+ model_type = gr.Dropdown(
1011
+ choices=[
1012
+ t.to_str("-")
1013
+ for t in ModelType
1014
+ if t != ModelType.Unknown and t != ModelType.ClosedSource
1015
+ ],
1016
+ label="Model type",
1017
+ multiselect=False,
1018
+ value=None,
1019
+ interactive=True,
1020
+ )
1021
+ review_model_type = gr.Dropdown(
1022
+ choices=[t.name for t in ReviewModelType],
1023
+ label="Review model type",
1024
+ multiselect=False,
1025
+ value=ReviewModelType.CUSTOM.name,
1026
+ interactive=True,
1027
+ )
1028
+ programming_language_selector = gr.Dropdown(
1029
+ choices=["Python", "Java", "Scala", "Go"],
1030
+ label="Programming Language",
1031
+ multiselect=False,
1032
+ value=None,
1033
+ interactive=True,
1034
+ )
1035
+ comment_language_selector = gr.Dropdown(
1036
+ choices=["en", "ru"],
1037
+ label="Comment Language",
1038
+ multiselect=False,
1039
+ value="en",
1040
+ interactive=True,
1041
+ )
1042
+
1043
+ with gr.Column():
1044
+ precision = gr.Dropdown(
1045
+ choices=[
1046
+ i.name for i in Precision if i != Precision.Unknown
1047
+ ],
1048
+ label="Precision",
1049
+ multiselect=False,
1050
+ value="float16",
1051
+ interactive=True,
1052
+ )
1053
+ weight_type = gr.Dropdown(
1054
+ choices=[i.name for i in WeightType],
1055
+ label="Weights type",
1056
+ multiselect=False,
1057
+ value="Original",
1058
+ interactive=True,
1059
+ )
1060
+ base_model_name_textbox = gr.Textbox(
1061
+ label="Base model (for delta or adapter weights)"
1062
+ )
1063
+
1064
+ with gr.Row():
1065
+ file_input = gr.File(
1066
+ label="Upload JSONL Results File", file_types=[".jsonl"]
1067
  )
1068
+
1069
+ submit_button = gr.Button("Submit Results")
1070
+ result_output = gr.Markdown()
1071
+
1072
+ submit_button.click(
1073
+ fn=submit_results,
1074
+ inputs=[
1075
+ model_name_textbox,
1076
+ base_model_name_textbox,
1077
+ revision_name_textbox,
1078
+ precision,
1079
+ weight_type,
1080
+ model_type,
1081
+ mode_selector,
1082
+ file_input,
1083
+ submission_version_selector,
1084
+ review_model_type,
1085
+ programming_language_selector,
1086
+ comment_language_selector,
1087
+ ],
1088
+ outputs=result_output,
1089
  )
 
 
1090
 
1091
+ # Version selector functionality
1092
+ version_selector.change(
1093
+ fn=update_leaderboards,
1094
+ inputs=[version_selector],
1095
+ outputs=[leaderboard]
1096
+ + [
1097
+ category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1098
+ ],
1099
+ ).then(
1100
+ lambda version: refresh_data_with_filters(version),
1101
+ inputs=[version_selector],
1102
+ outputs=[leaderboard]
1103
+ + [
1104
+ category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1105
+ ],
1106
  )
1107
 
1108
+
1109
+ # Set up the scheduler to refresh data periodically
1110
+ scheduler = BackgroundScheduler()
1111
+ scheduler.add_job(refresh_data, "interval", minutes=30)
1112
+ scheduler.start()
1113
+
1114
+ # Launch the app
1115
+ if __name__ == "__main__":
1116
+ demo.launch()
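
A minimal, self-contained sketch of the wiring pattern the handlers above rely on: each change handler returns one value for the main leaderboard plus one value per category tab, in the same order as its outputs list. The category names and the make_df / refresh helpers below are illustrative placeholders rather than names from app.py, and the sketch keeps explicit references to the per-tab components instead of indexing category_tabs.children, which depends on component position.

import gradio as gr
import pandas as pd

CATEGORIES = ["Readability", "Relevance", "Brevity"]  # placeholder category names

def make_df(offset: int) -> pd.DataFrame:
    # Stand-in for get_leaderboard_df / get_category_leaderboard_df.
    return pd.DataFrame({"model_name": ["model-a", "model-b"],
                         "score": [0.9 - 0.1 * offset, 0.8 - 0.1 * offset]})

def refresh(version: str):
    # One return value per output component: main table first, then one per tab.
    return make_df(0), *[make_df(i + 1) for i in range(len(CATEGORIES))]

with gr.Blocks() as sketch:
    version = gr.Dropdown(choices=["v0", "v1"], value="v0", label="Benchmark Version")
    with gr.Tabs():
        with gr.TabItem("All Results"):
            main_table = gr.Dataframe(make_df(0))
        category_tables = []
        for i, name in enumerate(CATEGORIES):
            with gr.TabItem(name):
                category_tables.append(gr.Dataframe(make_df(i + 1)))
    version.change(fn=refresh, inputs=[version], outputs=[main_table] + category_tables)

if __name__ == "__main__":
    sketch.launch()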
example_submission.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"model_name": "GPT-4-CodeReview", "programming_language": "Python", "comment_language": "en", "topic": "Code Reliability", "observation_id": "obs_001", "code_snippet": "def calculate_sum(a, b):\n return a + b", "review_text": "This function is simple and correct, but consider adding type hints and docstring for better documentation.", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
2
+ {"model_name": "GPT-4-CodeReview", "programming_language": "Java", "comment_language": "en", "topic": "Coding Standards", "observation_id": "obs_002", "code_snippet": "public class Calculator {\n public int add(int a, int b) {\n return a + b;\n }\n}", "review_text": "Consider following Java naming conventions and adding JavaDoc comments. The method is functionally correct.", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
3
+ {"model_name": "Claude-3-CodeReview", "programming_language": "Scala", "comment_language": "ru", "topic": "Performance Issues", "observation_id": "obs_003", "code_snippet": "def fibonacci(n: Int): Int = {\n if (n <= 1) n\n else fibonacci(n-1) + fibonacci(n-2)\n}", "review_text": "Эта реализация неэффективна из-за экспоненциальной сложности. Рекомендуется использовать мемоизацию или итеративный подход.", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 9.2, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
4
+ {"model_name": "Llama-CodeReview", "programming_language": "Go", "comment_language": "en", "topic": "Variables", "observation_id": "obs_004", "code_snippet": "package main\n\nimport \"fmt\"\n\nfunc main() {\n var x int = 5\n var y int = 10\n fmt.Println(x + y)\n}", "review_text": "Consider using short variable declarations (:=) for local variables. Also, the variable names could be more descriptive.", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
gradio_test.ipynb ADDED
@@ -0,0 +1,32 @@
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": []
9
+ }
10
+ ],
11
+ "metadata": {
12
+ "kernelspec": {
13
+ "display_name": "agent_env",
14
+ "language": "python",
15
+ "name": "python3"
16
+ },
17
+ "language_info": {
18
+ "codemirror_mode": {
19
+ "name": "ipython",
20
+ "version": 3
21
+ },
22
+ "file_extension": ".py",
23
+ "mimetype": "text/x-python",
24
+ "name": "python",
25
+ "nbconvert_exporter": "python",
26
+ "pygments_lexer": "ipython3",
27
+ "version": "3.13.2"
28
+ }
29
+ },
30
+ "nbformat": 4,
31
+ "nbformat_minor": 2
32
+ }
leaderboard_data.json CHANGED
@@ -1,23 +1,32 @@
1
  {
2
- "leaderboard": [
3
  {
4
- "model_name": "example/model",
5
- "bleu": 0.5,
6
- "llm_pass_1": 0.5,
7
- "llm_pass_5": 0.5,
8
- "llm_pass_10": 0.5,
9
- "metrics": {
10
- "readability": 5,
11
- "relevance": 5,
12
- "explanation_clarity": 5,
13
- "problem_identification": 5,
14
- "actionability": 5,
15
- "completeness": 5,
16
- "specificity": 5,
17
- "contextual_adequacy": 5,
18
- "consistency": 5,
19
- "brevity": 5
20
- }
 
 
 
 
 
 
 
21
  }
22
- ]
 
 
23
  }
 
1
  {
2
+ "entries": [
3
  {
4
+ "model_name": "GPT-4-CodeReview",
5
+ "model_type": "LLM",
6
+ "mode": "Strict",
7
+ "review_model_type": "gpt-4",
8
+ "programming_language": "Python",
9
+ "comment_language": "en",
10
+ "topic": "Code Reliability",
11
+ "submission_date": "2024-10-06T12:00:00Z",
12
+ "version": "v0",
13
+ "readability": 8.5,
14
+ "relevance": 9.0,
15
+ "explanation_clarity": 7.8,
16
+ "problem_identification": 8.2,
17
+ "actionability": 8.7,
18
+ "completeness": 8.0,
19
+ "specificity": 7.5,
20
+ "contextual_adequacy": 8.3,
21
+ "consistency": 8.8,
22
+ "brevity": 7.2,
23
+ "pass_at_1": 0.75,
24
+ "pass_at_5": 0.88,
25
+ "pass_at_10": 0.92,
26
+ "bleu_at_10": 0.65,
27
+ "total_evaluations": 100
28
  }
29
+ ],
30
+ "last_updated": "2024-10-06T12:00:00Z",
31
+ "version": "v0"
32
  }
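
For reference, the new entries-based schema above loads directly into pandas; sorting by pass_at_1 is an arbitrary illustrative choice here, not the leaderboard's ranking rule.

import json
import pandas as pd

with open("leaderboard_data.json", encoding="utf-8") as fh:
    payload = json.load(fh)

# One row per leaderboard entry, columns matching the metric fields above.
entries = pd.DataFrame(payload["entries"])
entries = entries.sort_values("pass_at_1", ascending=False).reset_index(drop=True)
print(f"version={payload['version']} last_updated={payload['last_updated']}")
print(entries[["model_name", "mode", "pass_at_1", "pass_at_10", "bleu_at_10"]])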
requirements.txt CHANGED
@@ -1,19 +1,8 @@
1
- APScheduler
2
- black
3
- datasets
4
- gradio>=4.0.0
5
- gradio[oauth]
6
- gradio_leaderboard==0.0.13
7
- gradio_client
8
- huggingface-hub>=0.18.0
9
- matplotlib
10
- numpy
11
- pandas>=1.3.0
12
- python-dateutil
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- sentencepiece
17
- fastapi
18
- uvicorn
19
- pydantic>=2.0.0
 
1
+ gradio==4.44.1
2
+ pandas>=2.0.0
3
+ huggingface_hub>=0.20.0
4
+ datasets>=2.0.0
5
+ apscheduler>=3.10.0
6
+ python-dotenv>=1.0.0
7
+ plotly>=5.18.0
8
+ pydantic==2.10.6
 
 
 
 
 
 
 
 
 
 
 
src/about.py CHANGED
@@ -1,48 +1,60 @@
1
  """
2
- About page content for CodeReview Leaderboard
3
  """
4
 
5
- TITLE = "🏆 CodeReview Leaderboard"
 
 
 
 
6
 
7
  INTRODUCTION_TEXT = """
8
- # CodeReview Leaderboard
9
-
10
- A comprehensive benchmark for evaluating code review generation models across multiple programming languages and comment types.
11
 
12
- ## Overview
 
 
13
 
14
- This leaderboard tracks the performance of various models on code review tasks, providing insights into:
15
- - **Programming Language Performance**: How well models perform across different programming languages
16
- - **Comment Language Support**: Effectiveness in generating reviews in different natural languages
17
- - **Taxonomy Categories**: Performance across different types of code review feedback
18
 
19
- ## Metrics
 
20
 
21
- - **BLEU**: Measures similarity between generated and reference reviews
22
- - **Pass@1/5/10**: Percentage of reviews that pass quality checks in 1, 5, or 10 attempts
23
- - **Multi-dimensional Quality Scores**: Detailed evaluation across 10 quality dimensions
24
 
25
- ## Features
26
 
27
- **Filter by Programming Language**: View results for specific programming languages (Python, JavaScript, Java, etc.)
28
- ✨ **Comment Language Support**: Filter by the natural language of code comments
29
- ✨ **Taxonomy Categories**: Browse results by review type (bug detection, style, performance, etc.)
30
- ✨ **IP-based Submissions**: Secure submission system with IP tracking
31
- ✨ **Dark Theme**: Modern, eye-friendly interface
32
  """
33
 
34
- SUBMISSION_GUIDELINES = """
35
- ## Submission Guidelines
36
 
37
- 1. **Model Requirements**: Submit results for at least 100 test cases
38
- 2. **Format**: Provide scores in the specified format ranges
39
- 3. **Reproducibility**: Include model details and evaluation setup
40
- 4. **Quality Metrics**: Rate your model across all 10 quality dimensions
41
- 5. **Metadata**: Specify programming language, comment language, and taxonomy focus
42
- """
43
 
44
- CONTACT_INFO = """
45
- ## Contact & Support
 
46
 
47
- For questions, issues, or contributions, please reach out through our repository or contact the maintainers.
48
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Text content for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ TITLE = """
6
+ <div style="text-align: center; margin-bottom: 1rem">
7
+ <h1>CodeReview Bench Leaderboard</h1>
8
+ </div>
9
+ """
10
 
11
  INTRODUCTION_TEXT = """
12
+ ## Introduction
 
 
13
 
14
+ CodeReview Bench is a comprehensive benchmark for evaluating the quality and effectiveness of automated code review systems.
15
+ This leaderboard tracks model performance across various programming languages and review criteria,
16
+ including readability, relevance, explanation clarity, and actionability.
17
 
18
+ Models are evaluated on their ability to provide high-quality code reviews that are helpful,
19
+ accurate, and actionable across multiple programming languages and review categories.
20
+ """
 
21
 
22
+ LLM_BENCHMARKS_TEXT = """
23
+ CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
24
 
25
+ It evaluates models on their ability to provide high-quality code reviews using both LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity) and exact-match metrics (pass@1, pass@5, pass@10, BLEU@10).
 
 
26
 
27
+ The benchmark supports both Russian and English comment languages across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more.
28
 
29
+ Learn more about automated code review evaluation and best practices.
 
 
 
 
30
  """
31
 
32
+ EVALUATION_QUEUE_TEXT = """
33
+ ## Submit Your Model
34
 
35
+ To add your model to the CodeReview Bench leaderboard:
 
 
 
 
 
36
 
37
+ 1. Run your evaluation using the CodeReview Bench framework
38
+ 2. Upload your results in .jsonl format using this form.
39
+ 3. Once validated, your model will appear on the leaderboard.
40
 
41
+ ### Requirements:
42
+ - Results must include all required metrics: LLM-based multimetric scores and exact-match metrics
43
+ - Submissions should cover multiple programming languages where applicable
44
+ - Both Russian and English comment languages are supported
45
+
46
+ ### ✉️✨ Ready? Upload your results below!
47
+ """
48
+
49
+ CITATION_BUTTON_LABEL = "Cite CodeReview Bench"
50
+
51
+ CITATION_BUTTON_TEXT = """
52
+ @misc{codereviewbench2025,
53
+ author = {CodeReview Bench Team},
54
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
55
+ year = {2025},
56
+ publisher = {GitHub},
57
+ journal = {GitHub repository},
58
+ howpublished = {\\url{https://github.com/your-org/codereview-bench}}
59
+ }
60
+ """
src/display/css_html_js.py CHANGED
@@ -1,306 +1,97 @@
1
  """
2
- Custom CSS, HTML, and JavaScript for the CodeReview Leaderboard
3
  """
4
 
5
- # Dark theme CSS
6
- DARK_THEME_CSS = """
7
- /* Dark Theme Styling */
8
- :root {
9
- --bg-primary: #0d1117;
10
- --bg-secondary: #161b22;
11
- --bg-tertiary: #21262d;
12
- --text-primary: #e6edf3;
13
- --text-secondary: #7d8590;
14
- --border-color: #30363d;
15
- --accent-color: #ffffff;
16
- --accent-hover: #f0f0f0;
17
- --danger-color: #da3633;
18
- --warning-color: #d29922;
19
- --info-color: #1f6feb;
20
  }
21
 
22
- /* Global dark theme */
23
- .gradio-container {
24
- background: var(--bg-primary) !important;
25
- color: var(--text-primary) !important;
26
  }
27
 
28
- /* Headers and text */
29
- .gradio-container h1, .gradio-container h2, .gradio-container h3 {
30
- color: var(--text-primary) !important;
31
  }
32
 
33
- .gradio-container p, .gradio-container span {
34
- color: var(--text-secondary) !important;
35
  }
36
 
37
- /* Tabs */
38
- .gradio-container .tab-nav {
39
- background: var(--bg-secondary) !important;
40
- border-bottom: 1px solid var(--border-color) !important;
 
 
41
  }
42
 
43
- .gradio-container .tab-nav button {
44
- background: transparent !important;
45
- color: var(--text-secondary) !important;
46
- border: none !important;
47
- padding: 12px 24px !important;
48
- transition: all 0.2s ease !important;
49
  }
50
 
51
- .gradio-container .tab-nav button:hover {
52
- color: var(--text-primary) !important;
53
- background: var(--bg-tertiary) !important;
54
  }
55
 
56
- .gradio-container .tab-nav button.selected {
57
- color: var(--text-primary) !important;
58
- background: var(--bg-tertiary) !important;
59
- border-bottom: 2px solid var(--accent-color) !important;
60
  }
61
 
62
- /* Tables */
63
- .gradio-container .dataframe {
64
- background: var(--bg-secondary) !important;
65
- border: 1px solid var(--border-color) !important;
66
- border-radius: 8px !important;
67
- overflow: hidden !important;
68
  }
69
 
70
- .gradio-container .dataframe table {
71
- background: var(--bg-secondary) !important;
 
72
  }
73
 
74
- .gradio-container .dataframe th {
75
- background: var(--bg-tertiary) !important;
76
- color: var(--text-primary) !important;
77
- border-bottom: 2px solid var(--border-color) !important;
78
- padding: 12px !important;
79
- font-weight: 600 !important;
80
  }
81
 
82
- .gradio-container .dataframe td {
83
- background: var(--bg-secondary) !important;
84
- color: var(--text-primary) !important;
85
- border-bottom: 1px solid var(--border-color) !important;
86
- padding: 10px 12px !important;
87
  }
88
 
89
- .gradio-container .dataframe tr:hover td {
90
- background: var(--bg-tertiary) !important;
 
91
  }
92
 
93
- /* Form inputs */
94
- .gradio-container input, .gradio-container select, .gradio-container textarea {
95
- background: var(--bg-tertiary) !important;
96
- color: var(--text-primary) !important;
97
- border: 1px solid var(--border-color) !important;
98
- border-radius: 6px !important;
99
- padding: 8px 12px !important;
 
 
100
  }
101
 
102
- .gradio-container input:focus, .gradio-container select:focus, .gradio-container textarea:focus {
103
- border-color: var(--accent-color) !important;
104
- box-shadow: 0 0 0 2px rgba(255, 255, 255, 0.2) !important;
 
105
  }
106
 
107
- /* Buttons */
108
- .gradio-container button {
109
- background: var(--accent-color) !important;
110
- color: var(--bg-primary) !important;
111
- border: 1px solid var(--border-color) !important;
112
- border-radius: 6px !important;
113
- padding: 8px 16px !important;
114
- font-weight: 500 !important;
115
- transition: all 0.2s ease !important;
116
- }
117
-
118
- .gradio-container button:hover {
119
- background: var(--accent-hover) !important;
120
- transform: translateY(-1px) !important;
121
- color: var(--bg-primary) !important;
122
- }
123
-
124
- .gradio-container button:active {
125
- transform: translateY(0) !important;
126
- }
127
-
128
- /* Dropdowns */
129
- .gradio-container .dropdown {
130
- background: var(--bg-tertiary) !important;
131
- border: 1px solid var(--border-color) !important;
132
- border-radius: 6px !important;
133
- }
134
-
135
- .gradio-container .dropdown-menu {
136
- background: var(--bg-secondary) !important;
137
- border: 1px solid var(--border-color) !important;
138
- border-radius: 6px !important;
139
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
140
- }
141
-
142
- .gradio-container .dropdown-menu .dropdown-item {
143
- color: var(--text-primary) !important;
144
- padding: 8px 12px !important;
145
- }
146
-
147
- .gradio-container .dropdown-menu .dropdown-item:hover {
148
- background: var(--bg-tertiary) !important;
149
- }
150
-
151
- /* Sliders */
152
- .gradio-container .slider {
153
- background: var(--bg-tertiary) !important;
154
- }
155
-
156
- .gradio-container .slider input[type="range"] {
157
- background: var(--bg-tertiary) !important;
158
- }
159
-
160
- .gradio-container .slider input[type="range"]::-webkit-slider-thumb {
161
- background: var(--accent-color) !important;
162
- border: 2px solid var(--bg-primary) !important;
163
- border-radius: 50% !important;
164
- width: 18px !important;
165
- height: 18px !important;
166
- }
167
-
168
- .gradio-container .slider input[type="range"]::-webkit-slider-track {
169
- background: var(--border-color) !important;
170
- border-radius: 4px !important;
171
- height: 6px !important;
172
- }
173
-
174
- /* Accordions */
175
- .gradio-container .accordion {
176
- background: var(--bg-secondary) !important;
177
- border: 1px solid var(--border-color) !important;
178
- border-radius: 8px !important;
179
- margin: 16px 0 !important;
180
- }
181
-
182
- .gradio-container .accordion-header {
183
- background: var(--bg-tertiary) !important;
184
- color: var(--text-primary) !important;
185
- padding: 16px !important;
186
- border-bottom: 1px solid var(--border-color) !important;
187
- cursor: pointer !important;
188
- font-weight: 500 !important;
189
- }
190
-
191
- .gradio-container .accordion-header:hover {
192
- background: var(--bg-primary) !important;
193
- }
194
-
195
- /* Status messages */
196
- .gradio-container .success {
197
- background: rgba(255, 255, 255, 0.1) !important;
198
- color: var(--text-primary) !important;
199
- border: 1px solid var(--accent-color) !important;
200
- border-radius: 6px !important;
201
- padding: 12px 16px !important;
202
- margin: 8px 0 !important;
203
- }
204
-
205
- .gradio-container .error {
206
- background: rgba(218, 54, 51, 0.1) !important;
207
- color: var(--danger-color) !important;
208
- border: 1px solid var(--danger-color) !important;
209
- border-radius: 6px !important;
210
- padding: 12px 16px !important;
211
- margin: 8px 0 !important;
212
- }
213
-
214
- /* Responsive design */
215
- @media (max-width: 768px) {
216
- .gradio-container {
217
- padding: 16px !important;
218
- }
219
-
220
- .gradio-container .tab-nav button {
221
- padding: 8px 16px !important;
222
- font-size: 14px !important;
223
- }
224
-
225
- .gradio-container .dataframe {
226
- font-size: 14px !important;
227
- }
228
  }
229
  """
230
-
231
- # Custom JavaScript for enhanced functionality
232
- CUSTOM_JS = """
233
- // Enhanced table sorting and filtering
234
- function enhanceTable() {
235
- const tables = document.querySelectorAll('.dataframe table');
236
- tables.forEach(table => {
237
- // Add sorting functionality
238
- const headers = table.querySelectorAll('th');
239
- headers.forEach((header, index) => {
240
- header.style.cursor = 'pointer';
241
- header.addEventListener('click', () => sortTable(table, index));
242
- });
243
- });
244
- }
245
-
246
- function sortTable(table, columnIndex) {
247
- const tbody = table.querySelector('tbody');
248
- const rows = Array.from(tbody.querySelectorAll('tr'));
249
-
250
- rows.sort((a, b) => {
251
- const aText = a.cells[columnIndex].textContent.trim();
252
- const bText = b.cells[columnIndex].textContent.trim();
253
-
254
- // Try to parse as numbers first
255
- const aNum = parseFloat(aText);
256
- const bNum = parseFloat(bText);
257
-
258
- if (!isNaN(aNum) && !isNaN(bNum)) {
259
- return bNum - aNum; // Descending for numbers
260
- }
261
-
262
- return aText.localeCompare(bText); // Ascending for text
263
- });
264
-
265
- rows.forEach(row => tbody.appendChild(row));
266
- }
267
-
268
- // Auto-refresh functionality
269
- function autoRefresh() {
270
- setInterval(() => {
271
- const refreshBtn = document.querySelector('button[aria-label="Refresh"]');
272
- if (refreshBtn) {
273
- refreshBtn.click();
274
- }
275
- }, 30000); // Refresh every 30 seconds
276
- }
277
-
278
- // Initialize enhancements
279
- document.addEventListener('DOMContentLoaded', function() {
280
- enhanceTable();
281
- autoRefresh();
282
- });
283
- """
284
-
285
- # HTML components
286
- HEADER_HTML = """
287
- <div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-bottom: 20px;">
288
- <h1 style="color: var(--text-primary); margin: 0; font-size: 2.5em; font-weight: 700;">
289
- 🏆 CodeReview Leaderboard
290
- </h1>
291
- <p style="color: var(--text-secondary); margin: 10px 0 0 0; font-size: 1.2em;">
292
- Benchmarking code review generation models across languages and categories
293
- </p>
294
- </div>
295
- """
296
-
297
- FOOTER_HTML = """
298
- <div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-top: 20px;">
299
- <p style="color: var(--text-secondary); margin: 0; font-size: 0.9em;">
300
- Built with ❤️ for the code review community |
301
- <a href="https://github.com/your-repo" style="color: var(--accent-color); text-decoration: none;">
302
- GitHub
303
- </a>
304
- </p>
305
- </div>
306
- """
 
1
  """
2
+ CSS and styling for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ custom_css = """
6
+ .markdown-text {
7
+ font-size: 16px !important;
8
+ text-align: justify !important;
9
+ line-height: 1.0 !important;
10
+ margin-top: 10px !important;
11
+ margin-bottom: 10px !important;
 
 
 
 
 
 
 
 
12
  }
13
 
14
+ .tab-buttons button.selected {
15
+ border-color: #f4f4f5 !important;
16
+ background: #3f3f46 !important;
17
+ color: #f4f4f5 !important;
18
  }
19
 
20
+ #citation-button textarea {
21
+ font-family: monospace !important;
 
22
  }
23
 
24
+ .leaderboard-container {
25
+ margin-top: 20px;
26
  }
27
 
28
+ .category-header {
29
+ font-weight: bold;
30
+ background-color: #f5f5f5;
31
+ padding: 10px;
32
+ margin-top: 15px;
33
+ border-radius: 5px;
34
  }
35
 
36
+ .metric-name {
37
+ font-weight: bold;
38
+ color: #a1a1aa !important;
 
 
 
39
  }
40
 
41
+ .model-name {
42
+ font-weight: bold;
 
43
  }
44
 
45
+ .model-link:hover {
46
+ text-decoration: underline;
47
+ color: #ffffff !important;
 
48
  }
49
 
50
+ .version-selector {
51
+ margin: 0 !important;
52
+ padding: 5px;
53
+ border-radius: 5px;
 
 
54
  }
55
 
56
+ .version-selector label {
57
+ font-weight: bold;
58
+ color: #f4f4f5 !important;
59
  }
60
 
61
+ .version-selector select {
62
+ border-color: #3f3f46 !important;
63
+ border-radius: 5px;
 
 
 
64
  }
65
 
66
+ /* Make sure the version selector is properly aligned with refresh button */
67
+ .version-selector > .block {
68
+ padding: 0 !important;
 
 
69
  }
70
 
71
+ .version-selector > .block > .wrap {
72
+ position: relative;
73
+ top: -5px;
74
  }
75
 
76
+ /* Force background/border for common layout containers */
77
+ .gradio-row > .block,
78
+ .gradio-column > .block,
79
+ .form,
80
+ .panel {
81
+ /* background: #18181b !important; */ /* Removed background override */
82
+ border-color: #27272a80 !important; /* Made border color semi-transparent */
83
+ border-width: 1px !important; /* Ensure border is visible */
84
+ border-style: solid !important;
85
  }
86
 
87
+ /* Target the specific file upload component area */
88
+ .gradio-file .wrap {
89
+ /* background: #18181b !important; */ /* Removed background override */
90
+ border-color: #27272a !important;
91
  }
92
 
93
+ #refresh-button {
94
+ margin-top: 5px !important;
95
+ margin-bottom: 5px !important;
 
 
96
  }
97
  """
 
 
src/display/formatting.py CHANGED
@@ -1,182 +1,71 @@
1
  """
2
- Formatting utilities for display components
3
  """
4
 
5
- import re
6
- from typing import List, Dict, Any, Optional
7
- from datetime import datetime, timezone
8
 
9
- def format_score(score: float, precision: int = 3) -> str:
10
- """Format a score with specified precision"""
11
- if isinstance(score, (int, float)):
12
- return f"{score:.{precision}f}"
13
- return str(score)
14
 
15
- def format_percentage(score: float, precision: int = 1) -> str:
16
- """Format a score as percentage"""
17
- if isinstance(score, (int, float)):
18
- return f"{score * 100:.{precision}f}%"
19
- return str(score)
20
 
21
- def format_model_name(name: str) -> str:
22
- """Format model name for display"""
23
- # Remove common prefixes and make more readable
24
- name = name.strip()
25
- if "/" in name:
26
- org, model = name.split("/", 1)
27
- return f"<span style='color: var(--text-secondary); font-size: 0.9em;'>{org}/</span><strong>{model}</strong>"
28
- return f"<strong>{name}</strong>"
29
 
30
- def format_timestamp(timestamp: str) -> str:
31
- """Format timestamp for display"""
32
- try:
33
- dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
34
- return dt.strftime("%Y-%m-%d %H:%M UTC")
35
- except:
36
- return timestamp
37
 
38
- def format_ip_address(ip: str) -> str:
39
- """Format IP address for display (partial masking)"""
40
- if not ip:
41
- return "Unknown"
42
-
43
- # Mask part of IP for privacy
44
- parts = ip.split(".")
45
- if len(parts) == 4:
46
- return f"{parts[0]}.{parts[1]}.{parts[2]}.xxx"
47
- return "xxx.xxx.xxx.xxx"
48
 
49
- def format_metric_score(score: int, metric_name: str) -> str:
50
- """Format metric score with color coding"""
51
- if not isinstance(score, (int, float)):
52
- return str(score)
53
-
54
- # Color coding based on score
55
- if score >= 8:
56
- color = "#ffffff" # White
57
- elif score >= 6:
58
- color = "#d0d0d0" # Light gray
59
- elif score >= 4:
60
- color = "#a0a0a0" # Gray
61
- else:
62
- color = "#707070" # Dark gray
63
-
64
- return f"<span style='color: {color}; font-weight: 600;'>{score}</span>"
65
 
66
- def format_language_badge(language: str) -> str:
67
- """Format programming language as a badge"""
68
- if not language or language == "All":
69
- return language
70
-
71
- # Language-specific colors
72
- colors = {
73
- "Python": "#3776ab",
74
- "JavaScript": "#f7df1e",
75
- "Java": "#ed8b00",
76
- "C++": "#00599c",
77
- "C#": "#239120",
78
- "Go": "#00add8",
79
- "Rust": "#ce422b",
80
- "TypeScript": "#3178c6",
81
- "PHP": "#777bb4",
82
- "Ruby": "#cc342d",
83
- "Swift": "#fa7343",
84
- "Kotlin": "#7f52ff",
85
- "Scala": "#dc322f",
86
- "R": "#276dc3",
87
- "MATLAB": "#e16737"
88
- }
89
-
90
- color = colors.get(language, "#6c757d")
91
- return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{language}</span>"
92
 
93
- def format_taxonomy_badge(category: str) -> str:
94
- """Format taxonomy category as a badge"""
95
- if not category or category == "All":
96
- return category
97
-
98
- # Category-specific colors
99
- colors = {
100
- "Bug Detection": "#dc3545",
101
- "Code Style": "#6f42c1",
102
- "Performance": "#fd7e14",
103
- "Security": "#e83e8c",
104
- "Maintainability": "#ffffff",
105
- "Documentation": "#17a2b8",
106
- "Testing": "#ffffff",
107
- "Architecture": "#6c757d",
108
- "Best Practices": "#007bff",
109
- "Refactoring": "#ffc107"
110
- }
111
-
112
- color = colors.get(category, "#6c757d")
113
- return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{category}</span>"
114
 
115
- def format_comment_language_flag(language: str) -> str:
116
- """Format comment language with flag emoji"""
117
- if not language or language == "All":
118
- return language
119
-
120
- # Language-specific flags
121
- flags = {
122
- "English": "🇺🇸",
123
- "Chinese": "🇨🇳",
124
- "Spanish": "🇪🇸",
125
- "French": "🇫🇷",
126
- "German": "🇩🇪",
127
- "Japanese": "🇯🇵",
128
- "Korean": "🇰🇷",
129
- "Russian": "🇷🇺",
130
- "Portuguese": "🇵🇹",
131
- "Italian": "🇮🇹",
132
- "Dutch": "🇳🇱"
133
- }
134
-
135
- flag = flags.get(language, "🌐")
136
- return f"{flag} {language}"
137
 
138
- def sanitize_html(text: str) -> str:
139
- """Sanitize HTML content to prevent XSS"""
140
- if not isinstance(text, str):
141
- return str(text)
142
-
143
- # Remove potentially dangerous HTML tags
144
- text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
145
- text = re.sub(r'<iframe[^>]*>.*?</iframe>', '', text, flags=re.DOTALL | re.IGNORECASE)
146
- text = re.sub(r'on\w+="[^"]*"', '', text, flags=re.IGNORECASE)
147
- text = re.sub(r'on\w+=\'[^\']*\'', '', text, flags=re.IGNORECASE)
148
-
149
- return text
150
 
151
- def truncate_text(text: str, max_length: int = 50) -> str:
152
- """Truncate text with ellipsis"""
153
- if not isinstance(text, str):
154
- text = str(text)
155
-
156
- if len(text) <= max_length:
157
- return text
158
-
159
- return text[:max_length-3] + "..."
160
 
161
- def format_table_cell(value: Any, column_name: str) -> str:
162
- """Format table cell based on column type"""
163
- if value is None:
164
- return "N/A"
165
-
166
- # Handle different column types
167
- if column_name.lower() in ["bleu", "pass@1", "pass@5", "pass@10"]:
168
- return format_percentage(value)
169
- elif column_name.lower() == "model":
170
- return format_model_name(str(value))
171
- elif column_name.lower() == "programming language":
172
- return format_language_badge(str(value))
173
- elif column_name.lower() == "comment language":
174
- return format_comment_language_flag(str(value))
175
- elif column_name.lower() == "taxonomy":
176
- return format_taxonomy_badge(str(value))
177
- elif column_name.lower() in ["readability", "relevance", "explanation clarity",
178
- "problem identification", "actionability", "completeness",
179
- "specificity", "contextual adequacy", "consistency", "brevity"]:
180
- return format_metric_score(value, column_name.lower())
181
- else:
182
- return sanitize_html(str(value))
 
1
  """
2
+ Formatting utilities for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ import pandas as pd
6
+ import numpy as np
 
7
 
 
 
 
 
 
8
 
9
+ def make_clickable_model(model_name: str) -> str:
10
+ """
11
+ Create a clickable link for a model name.
12
+ """
13
+ return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
14
 
 
 
 
 
 
 
 
 
15
 
16
+ def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
17
+ """
18
+ Check if a row has no NaN values in the specified columns.
19
+ """
20
+ return ~df[columns].isna().any(axis=1)
 
 
21
 
 
 
 
 
 
 
 
 
 
 
22
 
23
+ def format_percentage(value: float) -> str:
24
+ """
25
+ Format a value as a percentage.
26
+ """
27
+ if pd.isna(value):
28
+ return "N/A"
29
+ return f"{value * 100:.2f}%"
 
 
 
 
 
 
 
 
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ def format_number(value: float, precision: int = 2) -> str:
33
+ """
34
+ Format a number with specified precision.
35
+ """
36
+ if pd.isna(value):
37
+ return "N/A"
38
+ return f"{value:.{precision}f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ def styled_message(message: str) -> str:
42
+ """
43
+ Format a success message with styling.
44
+ """
45
+ return f"""
46
+ <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
47
+ {message}
48
+ </div>
49
+ """
 
 
 
50
 
 
 
 
 
 
 
 
 
 
51
 
52
+ def styled_warning(message: str) -> str:
53
+ """
54
+ Format a warning message with styling.
55
+ """
56
+ return f"""
57
+ <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
58
+ ⚠️ {message}
59
+ </div>
60
+ """
61
+
62
+
63
+ def styled_error(message: str) -> str:
64
+ """
65
+ Format an error message with styling.
66
+ """
67
+ return f"""
68
+ <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
69
+ {message}
70
+ </div>
71
+ """
 
 
src/display/utils.py CHANGED
@@ -1,292 +1,417 @@
1
  """
2
- Display utilities for the CodeReview Leaderboard
3
  """
4
 
5
- from typing import List, Dict, Any, Optional, Tuple
6
- import json
7
- from datetime import datetime, timezone
8
- from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
9
- from src.display.formatting import format_table_cell, format_timestamp
10
-
11
- def filter_leaderboard_data(
12
- data: List[Dict],
13
- programming_language: str = "All",
14
- comment_language: str = "All",
15
- taxonomy_category: str = "All",
16
- sort_by: str = "llm_pass_1",
17
- sort_order: str = "desc"
18
- ) -> List[Dict]:
19
- """Filter and sort leaderboard data based on criteria"""
20
-
21
- if not data:
22
- return []
23
-
24
- # Apply filters
25
- filtered_data = data.copy()
26
-
27
- if programming_language != "All":
28
- filtered_data = [
29
- entry for entry in filtered_data
30
- if entry.get("programming_language", "").lower() == programming_language.lower()
31
- ]
32
-
33
- if comment_language != "All":
34
- filtered_data = [
35
- entry for entry in filtered_data
36
- if entry.get("comment_language", "").lower() == comment_language.lower()
37
- ]
38
-
39
- if taxonomy_category != "All":
40
- filtered_data = [
41
- entry for entry in filtered_data
42
- if entry.get("taxonomy_category", "").lower() == taxonomy_category.lower()
43
- ]
44
-
45
- # Sort data
46
- reverse = sort_order.lower() == "desc"
47
-
48
- try:
49
- if sort_by in ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]:
50
- filtered_data.sort(key=lambda x: x.get(sort_by, 0), reverse=reverse)
51
- elif sort_by in QUALITY_METRICS:
52
- filtered_data.sort(key=lambda x: x.get("metrics", {}).get(sort_by, 0), reverse=reverse)
53
- else:
54
- filtered_data.sort(key=lambda x: str(x.get(sort_by, "")), reverse=reverse)
55
- except Exception as e:
56
- print(f"Error sorting data: {e}")
57
- # Default sort by pass@1
58
- filtered_data.sort(key=lambda x: x.get("llm_pass_1", 0), reverse=True)
59
-
60
- return filtered_data
61
-
62
- def get_main_leaderboard_data(
63
- data: List[Dict],
64
- programming_language: str = "All",
65
- comment_language: str = "All",
66
- taxonomy_category: str = "All",
67
- sort_by: str = "llm_pass_1"
68
- ) -> List[List[str]]:
69
- """Get formatted main leaderboard table data"""
70
-
71
- filtered_data = filter_leaderboard_data(
72
- data, programming_language, comment_language, taxonomy_category, sort_by
73
- )
74
-
75
- table_rows = []
76
- for entry in filtered_data:
77
- row = [
78
- format_table_cell(entry.get("model_name", ""), "model"),
79
- format_table_cell(entry.get("programming_language", ""), "programming language"),
80
- format_table_cell(entry.get("comment_language", ""), "comment language"),
81
- format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
82
- format_table_cell(entry.get("bleu", 0), "bleu"),
83
- format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
84
- format_table_cell(entry.get("llm_pass_5", 0), "pass@5"),
85
- format_table_cell(entry.get("llm_pass_10", 0), "pass@10"),
86
- ]
87
- table_rows.append(row)
88
-
89
- return table_rows
90
-
91
- def get_quality_metrics_data(
92
- data: List[Dict],
93
- programming_language: str = "All",
94
- comment_language: str = "All",
95
- taxonomy_category: str = "All",
96
- sort_by: str = "llm_pass_1"
97
- ) -> List[List[str]]:
98
- """Get formatted quality metrics table data"""
99
-
100
- filtered_data = filter_leaderboard_data(
101
- data, programming_language, comment_language, taxonomy_category, sort_by
102
- )
103
-
104
- table_rows = []
105
- for entry in filtered_data:
106
- metrics = entry.get("metrics", {})
107
- row = [format_table_cell(entry.get("model_name", ""), "model")]
108
-
109
- for metric in QUALITY_METRICS:
110
- formatted_value = format_table_cell(metrics.get(metric, 0), metric.replace("_", " "))
111
- row.append(formatted_value)
112
-
113
- table_rows.append(row)
114
-
115
- return table_rows
116
-
117
- def get_submission_history_data(
118
- data: List[Dict],
119
- programming_language: str = "All",
120
- comment_language: str = "All",
121
- taxonomy_category: str = "All",
122
- limit: int = 50
123
- ) -> List[List[str]]:
124
- """Get formatted submission history data"""
125
-
126
- filtered_data = filter_leaderboard_data(
127
- data, programming_language, comment_language, taxonomy_category, "submission_date", "desc"
128
- )
129
-
130
- # Limit results
131
- filtered_data = filtered_data[:limit]
132
-
133
- table_rows = []
134
- for entry in filtered_data:
135
- row = [
136
- format_table_cell(entry.get("model_name", ""), "model"),
137
- format_table_cell(entry.get("programming_language", ""), "programming language"),
138
- format_table_cell(entry.get("comment_language", ""), "comment language"),
139
- format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
140
- format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
141
- format_timestamp(entry.get("submission_date", "")),
142
- entry.get("submission_ip", "").split(".")[0] + ".xxx.xxx.xxx" if entry.get("submission_ip") else "Unknown"
143
- ]
144
- table_rows.append(row)
145
-
146
- return table_rows
147
 
148
- def get_statistics_summary(data: List[Dict]) -> Dict[str, Any]:
149
- """Get summary statistics for the leaderboard"""
150
-
151
- if not data:
152
- return {
153
- "total_models": 0,
154
- "total_submissions": 0,
155
- "avg_pass_1": 0,
156
- "best_model": "None",
157
- "languages_covered": 0,
158
- "categories_covered": 0
159
- }
160
-
161
- # Calculate statistics
162
- total_models = len(set(entry.get("model_name", "") for entry in data))
163
- total_submissions = len(data)
164
-
165
- pass_1_scores = [entry.get("llm_pass_1", 0) for entry in data if entry.get("llm_pass_1") is not None]
166
- avg_pass_1 = sum(pass_1_scores) / len(pass_1_scores) if pass_1_scores else 0
167
-
168
- best_entry = max(data, key=lambda x: x.get("llm_pass_1", 0)) if data else None
169
- best_model = best_entry.get("model_name", "None") if best_entry else "None"
170
-
171
- languages_covered = len(set(entry.get("programming_language", "") for entry in data if entry.get("programming_language")))
172
- categories_covered = len(set(entry.get("taxonomy_category", "") for entry in data if entry.get("taxonomy_category")))
173
-
174
- return {
175
- "total_models": total_models,
176
- "total_submissions": total_submissions,
177
- "avg_pass_1": avg_pass_1,
178
- "best_model": best_model,
179
- "languages_covered": languages_covered,
180
- "categories_covered": categories_covered
181
- }
182
-
183
- def validate_submission_data(data: Dict[str, Any]) -> Tuple[bool, str]:
184
- """Validate submission data"""
185
-
186
- required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
187
-
188
- # Check required fields
189
- for field in required_fields:
190
- if not data.get(field):
191
- return False, f"Missing required field: {field}"
192
-
193
- # Validate scores
194
- score_fields = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
195
- for field in score_fields:
196
- value = data.get(field)
197
- if value is None:
198
- return False, f"Missing score: {field}"
199
- if not isinstance(value, (int, float)):
200
- return False, f"Invalid score format: {field}"
201
- if not 0 <= value <= 1:
202
- return False, f"Score out of range (0-1): {field}"
203
-
204
- # Validate metrics
205
- metrics = data.get("metrics", {})
206
- for metric in QUALITY_METRICS:
207
- value = metrics.get(metric)
208
- if value is None:
209
- return False, f"Missing metric: {metric}"
210
- if not isinstance(value, (int, float)):
211
- return False, f"Invalid metric format: {metric}"
212
- if not 0 <= value <= 10:
213
- return False, f"Metric out of range (0-10): {metric}"
214
-
215
- # Validate language and category choices
216
- if data.get("programming_language") not in PROGRAMMING_LANGUAGES:
217
- return False, "Invalid programming language"
218
-
219
- if data.get("comment_language") not in COMMENT_LANGUAGES:
220
- return False, "Invalid comment language"
221
-
222
- if data.get("taxonomy_category") not in TAXONOMY_CATEGORIES:
223
- return False, "Invalid taxonomy category"
224
-
225
- return True, "Valid submission"
226
 
227
- def get_leaderboard_insights(data: List[Dict]) -> Dict[str, Any]:
228
- """Get insights and trends from leaderboard data"""
229
-
230
- if not data:
231
- return {}
232
-
233
- # Language performance analysis
234
- lang_performance = {}
235
- for lang in PROGRAMMING_LANGUAGES[1:]: # Skip "All"
236
- lang_data = [entry for entry in data if entry.get("programming_language") == lang]
237
- if lang_data:
238
- avg_score = sum(entry.get("llm_pass_1", 0) for entry in lang_data) / len(lang_data)
239
- lang_performance[lang] = {
240
- "avg_score": avg_score,
241
- "model_count": len(lang_data),
242
- "best_model": max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
243
- }
244
-
245
- # Category performance analysis
246
- category_performance = {}
247
- for category in TAXONOMY_CATEGORIES[1:]: # Skip "All"
248
- cat_data = [entry for entry in data if entry.get("taxonomy_category") == category]
249
- if cat_data:
250
- avg_score = sum(entry.get("llm_pass_1", 0) for entry in cat_data) / len(cat_data)
251
- category_performance[category] = {
252
- "avg_score": avg_score,
253
- "model_count": len(cat_data),
254
- "best_model": max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
255
- }
256
-
257
- return {
258
- "language_performance": lang_performance,
259
- "category_performance": category_performance,
260
- "top_performers": sorted(data, key=lambda x: x.get("llm_pass_1", 0), reverse=True)[:5]
261
- }
262
-
263
- def export_leaderboard_data(data: List[Dict], format_type: str = "json") -> str:
264
- """Export leaderboard data in specified format"""
265
-
266
- if format_type.lower() == "json":
267
- return json.dumps(data, indent=2, ensure_ascii=False)
268
- elif format_type.lower() == "csv":
269
- # Simple CSV export
270
- if not data:
271
- return ""
272
-
273
- # Get headers
274
- headers = ["model_name", "programming_language", "comment_language", "taxonomy_category",
275
- "bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
276
- headers.extend(QUALITY_METRICS)
277
-
278
- lines = [",".join(headers)]
279
-
280
- for entry in data:
281
- row = []
282
- for header in headers:
283
- if header in QUALITY_METRICS:
284
- value = entry.get("metrics", {}).get(header, "")
285
- else:
286
- value = entry.get(header, "")
287
- row.append(str(value))
288
- lines.append(",".join(row))
289
-
290
- return "\n".join(lines)
291
- else:
292
- return "Unsupported format"
 
 
 
1
  """
2
+ Utility classes and functions for the CodeReview Bench Leaderboard display.
3
  """
4
 
5
+ from dataclasses import dataclass, field, fields
6
+ from enum import Enum, auto
7
+ from typing import List, Optional
 
 
 
8
 
 
 
9
 
10
+ class Mode(Enum):
11
+ """Inference mode for the review model."""
12
+ CoT = auto() # Chain of Thought
13
+ Strict = auto()
14
+
15
+ def __str__(self):
16
+ """String representation of the mode."""
17
+ return self.name
18
+
19
+
20
+ class ModelType(Enum):
21
+ """Model types for the leaderboard."""
22
+ Unknown = auto()
23
+ OpenSource = auto()
24
+ ClosedSource = auto()
25
+ API = auto()
26
+
27
+ def to_str(self, separator: str = "-") -> str:
28
+ """Convert enum to string with separator."""
29
+ if self == ModelType.Unknown:
30
+ return "Unknown"
31
+ elif self == ModelType.OpenSource:
32
+ return f"Open{separator}Source"
33
+ elif self == ModelType.ClosedSource:
34
+ return f"Closed{separator}Source"
35
+ elif self == ModelType.API:
36
+ return "API"
37
+ return "Unknown"
38
+
39
+
40
+ class ReviewModelType(str, Enum):
41
+ """Review model types for the leaderboard."""
42
+ GPT_4 = "gpt-4"
43
+ GPT_3_5 = "gpt-3.5-turbo"
44
+ CLAUDE = "claude"
45
+ LLAMA = "llama"
46
+ GEMINI = "gemini"
47
+ CUSTOM = "custom"
48
+
49
+ def __str__(self):
50
+ """String representation of the review model type."""
51
+ return self.value
52
+
53
+
54
+ class Precision(Enum):
55
+ """Model precision types."""
56
+ Unknown = auto()
57
+ float16 = auto()
58
+ bfloat16 = auto()
59
+ float32 = auto()
60
+ int8 = auto()
61
+ int4 = auto()
62
+ NA = auto()
63
+
64
+ def __str__(self):
65
+ """String representation of the precision type."""
66
+ return self.name
67
+
68
+
69
+ class WeightType(Enum):
70
+ """Model weight types."""
71
+ Original = auto()
72
+ Delta = auto()
73
+ Adapter = auto()
74
+
75
+ def __str__(self):
76
+ """String representation of the weight type."""
77
+ return self.name
78
+
79
+
80
+ @dataclass
81
+ class ColumnInfo:
82
+ """Information about a column in the leaderboard."""
83
+ name: str
84
+ display_name: str
85
+ type: str = "text"
86
+ hidden: bool = False
87
+ never_hidden: bool = False
88
+ displayed_by_default: bool = True
89
+
90
+
91
+ @dataclass
92
+ class CodeReviewBenchColumn:
93
+ """Columns for the CodeReview Bench leaderboard."""
94
+ # Core metadata
95
+ model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
96
+ name="model_name",
97
+ display_name="Model",
98
+ never_hidden=True,
99
+ displayed_by_default=True
100
+ ))
101
+ mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
102
+ name="mode",
103
+ display_name="Mode",
104
+ displayed_by_default=True
105
+ ))
106
+ model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
107
+ name="model_type",
108
+ display_name="Access_Type",
109
+ displayed_by_default=True
110
+ ))
111
+ submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
112
+ name="submission_date",
113
+ display_name="Submission_Date",
114
+ displayed_by_default=False
115
+ ))
116
+ version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
117
+ name="version",
118
+ display_name="Version",
119
+ displayed_by_default=False
120
+ ))
121
+ review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
122
+ name="review_model_type",
123
+ display_name="Type",
124
+ displayed_by_default=False
125
+ ))
126
+ base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
127
+ name="base_model",
128
+ display_name="Base Model",
129
+ displayed_by_default=False
130
+ ))
131
+ revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
132
+ name="revision",
133
+ display_name="Revision",
134
+ displayed_by_default=False
135
+ ))
136
+ precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
137
+ name="precision",
138
+ display_name="Precision",
139
+ displayed_by_default=False
140
+ ))
141
+ weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
142
+ name="weight_type",
143
+ display_name="Weight Type",
144
+ displayed_by_default=False
145
+ ))
146
+ topic: ColumnInfo = field(default_factory=lambda: ColumnInfo(
147
+ name="topic",
148
+ display_name="Topic",
149
+ displayed_by_default=True
150
+ ))
151
+
152
+ # LLM-based multimetric scores
153
+ readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
154
+ name="readability",
155
+ display_name="Readability",
156
+ type="number",
157
+ displayed_by_default=True
158
+ ))
159
+ relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
160
+ name="relevance",
161
+ display_name="Relevance",
162
+ type="number",
163
+ displayed_by_default=True
164
+ ))
165
+ explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
166
+ name="explanation_clarity",
167
+ display_name="Explanation_Clarity",
168
+ type="number",
169
+ displayed_by_default=True
170
+ ))
171
+ problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
172
+ name="problem_identification",
173
+ display_name="Problem_Identification",
174
+ type="number",
175
+ displayed_by_default=True
176
+ ))
177
+ actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
178
+ name="actionability",
179
+ display_name="Actionability",
180
+ type="number",
181
+ displayed_by_default=True
182
+ ))
183
+ completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
184
+ name="completeness",
185
+ display_name="Completeness",
186
+ type="number",
187
+ displayed_by_default=True
188
+ ))
189
+ specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
190
+ name="specificity",
191
+ display_name="Specificity",
192
+ type="number",
193
+ displayed_by_default=True
194
+ ))
195
+ contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
196
+ name="contextual_adequacy",
197
+ display_name="Contextual_Adequacy",
198
+ type="number",
199
+ displayed_by_default=True
200
+ ))
201
+ consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
202
+ name="consistency",
203
+ display_name="Consistency",
204
+ type="number",
205
+ displayed_by_default=True
206
+ ))
207
+ brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
208
+ name="brevity",
209
+ display_name="Brevity",
210
+ type="number",
211
+ displayed_by_default=True
212
+ ))
213
+
214
+ # LLM-based-exact-match metrics
215
+ pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
216
+ name="pass_at_1",
217
+ display_name="Pass@1",
218
+ type="number",
219
+ displayed_by_default=True
220
+ ))
221
+ pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
222
+ name="pass_at_5",
223
+ display_name="Pass@5",
224
+ type="number",
225
+ displayed_by_default=True
226
+ ))
227
+ pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
228
+ name="pass_at_10",
229
+ display_name="Pass@10",
230
+ type="number",
231
+ displayed_by_default=True
232
+ ))
233
+ bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
234
+ name="bleu_at_10",
235
+ display_name="BLEU@10",
236
+ type="number",
237
+ displayed_by_default=True
238
+ ))
239
+
240
+ # Overall aggregated metrics
241
+ overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
242
+ name="overall_score",
243
+ display_name="Overall_Score",
244
+ type="number",
245
+ displayed_by_default=True
246
+ ))
247
+ multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
248
+ name="multimetric_average",
249
+ display_name="Multimetric_Average",
250
+ type="number",
251
+ displayed_by_default=True
252
+ ))
253
+ exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
254
+ name="exact_match_average",
255
+ display_name="Exact_Match_Average",
256
+ type="number",
257
+ displayed_by_default=True
258
+ ))
259
+ total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
260
+ name="total_evaluations",
261
+ display_name="Total_Evaluations",
262
+ type="number",
263
+ displayed_by_default=True
264
+ ))
265
+
266
+ # Language-specific metrics (Russian)
267
+ ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
268
+ name="ru_readability",
269
+ display_name="RU_Readability",
270
+ type="number",
271
+ displayed_by_default=False
272
+ ))
273
+ ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
274
+ name="ru_relevance",
275
+ display_name="RU_Relevance",
276
+ type="number",
277
+ displayed_by_default=False
278
+ ))
279
+ ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
280
+ name="ru_overall_score",
281
+ display_name="RU_Overall_Score",
282
+ type="number",
283
+ displayed_by_default=False
284
+ ))
285
+
286
+ # Language-specific metrics (English)
287
+ en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
288
+ name="en_readability",
289
+ display_name="EN_Readability",
290
+ type="number",
291
+ displayed_by_default=False
292
+ ))
293
+ en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
294
+ name="en_relevance",
295
+ display_name="EN_Relevance",
296
+ type="number",
297
+ displayed_by_default=False
298
+ ))
299
+ en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
300
+ name="en_overall_score",
301
+ display_name="EN_Overall_Score",
302
+ type="number",
303
+ displayed_by_default=False
304
+ ))
305
+
306
+
307
+ # Create instances for easy access
308
+ CODEREVIEW_COLUMN = CodeReviewBenchColumn()
309
+
310
+ # Extract column lists for different views
311
+ COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
312
+ DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
313
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
314
+
315
+ # Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
316
+ def reorder_display_cols():
317
+ cols = list(DISPLAY_COLS)  # copy so the module-level list is not mutated in place
318
+ if 'model_name' in cols and 'mode' in cols:
319
+ cols.remove('mode')
320
+ model_name_index = cols.index('model_name')
321
+ cols.insert(model_name_index + 1, 'mode')
322
+ return cols
323
+ DISPLAY_COLS = reorder_display_cols()
324
+
325
+ METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
326
+ if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
327
+ HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
328
+ if getattr(CODEREVIEW_COLUMN, f.name).hidden]
329
+ NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
330
+ if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
331
+
332
+ # Categories for CodeReview Bench (Programming Languages)
333
+ CATEGORIES = [
334
+ 'Python',
335
+ 'Java',
336
+ 'Scala',
337
+ 'Go'
338
+ ]
339
+
340
+ # Language taxonomies for CodeReview Bench
341
+ COMMENT_LANGUAGES = [
342
+ 'ru', # Russian
343
+ 'en' # English
344
+ ]
345
+
346
+ # Topics for CodeReview Bench
347
+ TOPICS = [
348
+ 'Code Reliability',
349
+ 'Coding Standards',
350
+ 'Code Organization',
351
+ 'Performance Issues',
352
+ 'Validation',
353
+ 'Variables'
354
+ ]
355
+
356
+ # Example categories
357
+ EXAMPLE_CATEGORIES = [
358
+ 'Bug_Fix',
359
+ 'Code_Style',
360
+ 'Performance',
361
+ 'Security',
362
+ 'Refactoring',
363
+ 'Documentation',
364
+ 'Testing',
365
+ 'Architecture',
366
+ 'Other'
367
+ ]
368
+
369
+ # Metrics for CodeReview Bench
370
+ MULTIMETRIC_METRICS = [
371
+ "readability",
372
+ "relevance",
373
+ "explanation_clarity",
374
+ "problem_identification",
375
+ "actionability",
376
+ "completeness",
377
+ "specificity",
378
+ "contextual_adequacy",
379
+ "consistency",
380
+ "brevity"
381
+ ]
382
+
383
+ EXACT_MATCH_METRICS = [
384
+ "pass_at_1",
385
+ "pass_at_5",
386
+ "pass_at_10",
387
+ "bleu_at_10"
388
+ ]
389
+
390
+ def get_all_column_choices():
391
+ """
392
+ Get the column choices offered in the multiselect dropdown (columns hidden by default).
393
+
394
+ Returns:
395
+ List of (column_name, display_name) tuples for columns that are not displayed by default.
396
+ """
397
+ column_choices = []
398
+
399
+ default_visible_columns = get_default_visible_columns()
400
+
401
+ for f in fields(CODEREVIEW_COLUMN):
402
+ column_info = getattr(CODEREVIEW_COLUMN, f.name)
403
+ # Create a tuple with both the internal name and display name
404
+ if column_info.name not in default_visible_columns:
405
+ column_choices.append((column_info.name, column_info.display_name))
406
+
407
+ return column_choices
408
+
409
+ def get_default_visible_columns():
410
+ """
411
+ Get the list of column names that should be visible by default.
412
+
413
+ Returns:
414
+ List of column names that are displayed by default.
415
+ """
416
+ return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
417
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
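
A quick illustration of how the derived lists above might be consumed (a minimal sketch; the printed calls are for inspection only and are not part of this commit):

from src.display.utils import (
    COLS, DISPLAY_COLS, METRIC_COLS,
    get_all_column_choices, get_default_visible_columns,
)

print(COLS)                           # every declared column name
print(DISPLAY_COLS)                   # default view, with 'mode' placed right after 'model_name'
print(METRIC_COLS)                    # all columns typed as "number"
print(get_default_visible_columns())  # same membership as DISPLAY_COLS, in original field order
print(get_all_column_choices())       # (internal_name, display_name) pairs hidden by default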
src/envs.py CHANGED
@@ -1,106 +1,27 @@
1
- """
2
- Environment configuration and constants
3
- """
4
-
5
  import os
6
- from pathlib import Path
7
-
8
- # Data paths
9
- DATA_DIR = Path("data")
10
- LEADERBOARD_PATH = DATA_DIR / "leaderboard_data.json"
11
- SUBMISSIONS_PATH = DATA_DIR / "submissions.json"
12
-
13
- # Create data directory if it doesn't exist
14
- DATA_DIR.mkdir(exist_ok=True)
15
-
16
- # Programming languages supported
17
- PROGRAMMING_LANGUAGES = [
18
- "All",
19
- "Python",
20
- "JavaScript",
21
- "Java",
22
- "C++",
23
- "C#",
24
- "Go",
25
- "Rust",
26
- "TypeScript",
27
- "PHP",
28
- "Ruby",
29
- "Swift",
30
- "Kotlin",
31
- "Scala",
32
- "R",
33
- "MATLAB",
34
- "Other"
35
- ]
36
 
37
- # Comment languages supported
38
- COMMENT_LANGUAGES = [
39
- "All",
40
- "English",
41
- "Chinese",
42
- "Spanish",
43
- "French",
44
- "German",
45
- "Japanese",
46
- "Korean",
47
- "Russian",
48
- "Portuguese",
49
- "Italian",
50
- "Dutch",
51
- "Other"
52
- ]
53
 
54
- # Taxonomy categories
55
- TAXONOMY_CATEGORIES = [
56
- "All",
57
- "Bug Detection",
58
- "Code Style",
59
- "Performance",
60
- "Security",
61
- "Maintainability",
62
- "Documentation",
63
- "Testing",
64
- "Architecture",
65
- "Best Practices",
66
- "Refactoring",
67
- "Other"
68
- ]
69
 
70
- # Quality metrics
71
- QUALITY_METRICS = [
72
- "readability",
73
- "relevance",
74
- "explanation_clarity",
75
- "problem_identification",
76
- "actionability",
77
- "completeness",
78
- "specificity",
79
- "contextual_adequacy",
80
- "consistency",
81
- "brevity"
82
- ]
83
 
84
- # Table headers
85
- MAIN_HEADERS = ["Model", "Programming Language", "Comment Language", "Taxonomy", "BLEU", "Pass@1", "Pass@5", "Pass@10"]
 
86
 
87
- QUALITY_HEADERS = ["Model"] + [metric.replace("_", " ").title() for metric in QUALITY_METRICS]
 
88
 
89
- # Default data
90
- DEFAULT_DATA = [{
91
- "model_name": "example/model",
92
- "programming_language": "Python",
93
- "comment_language": "English",
94
- "taxonomy_category": "Bug Detection",
95
- "bleu": 0.5,
96
- "llm_pass_1": 0.5,
97
- "llm_pass_5": 0.5,
98
- "llm_pass_10": 0.5,
99
- "metrics": {
100
- "readability": 5, "relevance": 5, "explanation_clarity": 5,
101
- "problem_identification": 5, "actionability": 5, "completeness": 5,
102
- "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
103
- },
104
- "submission_ip": "127.0.0.1",
105
- "submission_date": "2024-01-01T00:00:00Z"
106
- }]
 
 
 
 
 
1
  import os
2
+ from huggingface_hub import HfApi
3
+ from dotenv import load_dotenv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ # Load environment variables
6
+ load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Hugging Face configuration
9
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
10
+ OWNER = os.environ.get("OWNER", "codereview-bench") # Change to your org
11
+ SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
12
+ ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
13
+ ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
 
 
 
 
 
 
 
 
 
14
 
15
+ # Repository IDs
16
+ REPO_ID = f"{OWNER}/codereview-bench"
17
+ RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results")
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Cache paths
20
+ CACHE_PATH = os.getenv("HF_HOME", ".")
21
+ DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
22
 
23
+ # Local data paths
24
+ LEADERBOARD_FILE = os.path.join(DATA_PATH, "leaderboard.json")
25
 
26
+ # HF API instance
27
+ API = HfApi(token=TOKEN)
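
A minimal sketch of how these constants resolve and get used downstream (the OWNER value below is hypothetical, and listing files requires a valid HF_TOKEN):

# With OWNER="my-org" and no RESULTS_DATASET_ID override:
#   REPO_ID            -> "my-org/codereview-bench"
#   RESULTS_DATASET_ID -> "my-org/codereview-bench-results"
from src.envs import API, RESULTS_DATASET_ID

files = API.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
print(files[:5])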
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard/processor.py CHANGED
@@ -1,306 +1,271 @@
1
  """
2
- Leaderboard data processor for CodeReview Leaderboard
3
  """
4
 
5
  import json
6
- import traceback
7
- from typing import List, Dict, Any, Optional
8
- from datetime import datetime, timezone, timedelta
9
- from pathlib import Path
10
- from src.envs import LEADERBOARD_PATH, SUBMISSIONS_PATH, DEFAULT_DATA
11
- from src.display.utils import validate_submission_data, get_statistics_summary
12
-
13
- class LeaderboardProcessor:
14
- """Handles all leaderboard data operations"""
 
 
 
 
 
 
15
 
16
- def __init__(self):
17
- self.leaderboard_path = LEADERBOARD_PATH
18
- self.submissions_path = SUBMISSIONS_PATH
19
- self._ensure_data_files()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- def _ensure_data_files(self):
22
- """Ensure data files exist with default data"""
23
- if not self.leaderboard_path.exists():
24
- self.save_leaderboard_data(DEFAULT_DATA)
25
 
26
- if not self.submissions_path.exists():
27
- self.save_submission_log([])
 
 
 
 
 
 
28
 
29
- def load_leaderboard_data(self) -> List[Dict]:
30
- """Load leaderboard data from storage"""
31
- try:
32
- with open(self.leaderboard_path, 'r', encoding='utf-8') as f:
33
- data = json.load(f)
34
- return data.get("leaderboard", [])
35
- except Exception as e:
36
- print(f"Error loading leaderboard: {e}")
37
- return DEFAULT_DATA.copy()
38
 
39
- def save_leaderboard_data(self, data: List[Dict]) -> bool:
40
- """Save leaderboard data to storage"""
41
- try:
42
- to_store = {
43
- "leaderboard": data,
44
- "last_updated": datetime.now(timezone.utc).isoformat(),
45
- "total_entries": len(data)
46
- }
47
-
48
- with open(self.leaderboard_path, 'w', encoding='utf-8') as f:
49
- json.dump(to_store, f, indent=2, ensure_ascii=False)
50
-
51
- return True
52
- except Exception as e:
53
- print(f"Error saving leaderboard: {e}")
54
- return False
55
 
56
- def load_submission_log(self) -> List[Dict]:
57
- """Load submission log from storage"""
58
- try:
59
- with open(self.submissions_path, 'r', encoding='utf-8') as f:
60
- data = json.load(f)
61
- return data.get("submissions", [])
62
- except Exception as e:
63
- print(f"Error loading submission log: {e}")
64
- return []
65
 
66
- def save_submission_log(self, submissions: List[Dict]) -> bool:
67
- """Save submission log to storage"""
68
- try:
69
- to_store = {
70
- "submissions": submissions,
71
- "last_updated": datetime.now(timezone.utc).isoformat(),
72
- "total_submissions": len(submissions)
73
- }
74
-
75
- with open(self.submissions_path, 'w', encoding='utf-8') as f:
76
- json.dump(to_store, f, indent=2, ensure_ascii=False)
77
-
78
- return True
79
- except Exception as e:
80
- print(f"Error saving submission log: {e}")
81
- return False
82
 
83
- def add_submission(self, submission_data: Dict[str, Any], ip_address: str) -> tuple[bool, str]:
84
- """Add a new submission to the leaderboard"""
85
- try:
86
- # Validate submission data
87
- is_valid, message = validate_submission_data(submission_data)
88
- if not is_valid:
89
- return False, message
90
-
91
- # Add metadata
92
- submission_data["submission_ip"] = ip_address
93
- submission_data["submission_date"] = datetime.now(timezone.utc).isoformat()
94
-
95
- # Load current data
96
- current_data = self.load_leaderboard_data()
97
-
98
- # Check for existing model and replace if found
99
- model_name = submission_data.get("model_name", "")
100
- current_data = [entry for entry in current_data if entry.get("model_name") != model_name]
101
-
102
- # Add new submission
103
- current_data.append(submission_data)
104
-
105
- # Save updated data
106
- if self.save_leaderboard_data(current_data):
107
- # Log the submission
108
- self._log_submission(submission_data, ip_address)
109
- return True, "✅ Submission recorded successfully!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  else:
111
- return False, "❌ Failed to save submission"
112
-
113
- except Exception as e:
114
- print(f"Error adding submission: {e}")
115
- traceback.print_exc()
116
- return False, f"❌ Submission failed: {str(e)}"
117
-
118
- def _log_submission(self, submission_data: Dict[str, Any], ip_address: str):
119
- """Log submission for audit trail"""
120
- try:
121
- submissions = self.load_submission_log()
122
-
123
- log_entry = {
124
- "model_name": submission_data.get("model_name"),
125
- "programming_language": submission_data.get("programming_language"),
126
- "comment_language": submission_data.get("comment_language"),
127
- "taxonomy_category": submission_data.get("taxonomy_category"),
128
- "scores": {
129
- "bleu": submission_data.get("bleu"),
130
- "llm_pass_1": submission_data.get("llm_pass_1"),
131
- "llm_pass_5": submission_data.get("llm_pass_5"),
132
- "llm_pass_10": submission_data.get("llm_pass_10")
133
- },
134
- "submission_ip": ip_address,
135
- "submission_date": submission_data.get("submission_date"),
136
- "status": "accepted"
137
- }
138
-
139
- submissions.append(log_entry)
140
-
141
- # Keep only last 1000 submissions
142
- submissions = submissions[-1000:]
143
-
144
- self.save_submission_log(submissions)
145
-
146
- except Exception as e:
147
- print(f"Error logging submission: {e}")
148
-
149
- def get_model_history(self, model_name: str) -> List[Dict]:
150
- """Get submission history for a specific model"""
151
- try:
152
- submissions = self.load_submission_log()
153
- return [
154
- sub for sub in submissions
155
- if sub.get("model_name") == model_name
156
- ]
157
- except Exception as e:
158
- print(f"Error getting model history: {e}")
159
- return []
160
-
161
- def get_ip_submissions(self, ip_address: str, limit: int = 10) -> List[Dict]:
162
- """Get recent submissions from a specific IP"""
163
- try:
164
- submissions = self.load_submission_log()
165
- ip_submissions = [
166
- sub for sub in submissions
167
- if sub.get("submission_ip") == ip_address
168
- ]
169
-
170
- # Sort by date and limit
171
- ip_submissions.sort(key=lambda x: x.get("submission_date", ""), reverse=True)
172
- return ip_submissions[:limit]
173
-
174
- except Exception as e:
175
- print(f"Error getting IP submissions: {e}")
176
- return []
177
-
178
- def check_rate_limit(self, ip_address: str, max_submissions: int = 5, hours: int = 24) -> tuple[bool, str]:
179
- """Check if IP has exceeded rate limit"""
180
- try:
181
- submissions = self.get_ip_submissions(ip_address, max_submissions * 2)
182
-
183
- # Count submissions within the time window
184
- cutoff_time = datetime.now(timezone.utc) - timedelta(hours=hours)
185
- recent_submissions = [
186
- sub for sub in submissions
187
- if datetime.fromisoformat(sub.get("submission_date", "")).replace(tzinfo=timezone.utc) > cutoff_time
188
- ]
189
-
190
- if len(recent_submissions) >= max_submissions:
191
- return False, f"Rate limit exceeded: {len(recent_submissions)}/{max_submissions} submissions in {hours} hours"
192
-
193
- return True, f"Rate limit OK: {len(recent_submissions)}/{max_submissions} submissions in {hours} hours"
194
-
195
- except Exception as e:
196
- print(f"Error checking rate limit: {e}")
197
- return True, "Rate limit check failed, allowing submission"
198
-
199
- def get_leaderboard_stats(self) -> Dict[str, Any]:
200
- """Get comprehensive leaderboard statistics"""
201
- try:
202
- data = self.load_leaderboard_data()
203
- submissions = self.load_submission_log()
204
-
205
- basic_stats = get_statistics_summary(data)
206
-
207
- # Additional stats
208
- recent_submissions = len([
209
- sub for sub in submissions
210
- if datetime.fromisoformat(sub.get("submission_date", "")).replace(tzinfo=timezone.utc) >
211
- datetime.now(timezone.utc) - timedelta(days=7)
212
- ])
213
-
214
- return {
215
- **basic_stats,
216
- "recent_submissions_7d": recent_submissions,
217
- "total_logged_submissions": len(submissions),
218
- "last_updated": datetime.now(timezone.utc).isoformat()
219
- }
220
-
221
- except Exception as e:
222
- print(f"Error getting leaderboard stats: {e}")
223
- return {}
224
-
225
- def backup_data(self) -> bool:
226
- """Create backup of current data"""
227
- try:
228
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
229
- backup_dir = Path("backups")
230
- backup_dir.mkdir(exist_ok=True)
231
-
232
- # Backup leaderboard
233
- if self.leaderboard_path.exists():
234
- backup_path = backup_dir / f"leaderboard_{timestamp}.json"
235
- with open(self.leaderboard_path, 'r') as src, open(backup_path, 'w') as dst:
236
- dst.write(src.read())
237
-
238
- # Backup submissions
239
- if self.submissions_path.exists():
240
- backup_path = backup_dir / f"submissions_{timestamp}.json"
241
- with open(self.submissions_path, 'r') as src, open(backup_path, 'w') as dst:
242
- dst.write(src.read())
243
-
244
- return True
245
-
246
- except Exception as e:
247
- print(f"Error creating backup: {e}")
248
- return False
249
-
250
- def export_data(self, format_type: str = "json") -> str:
251
- """Export leaderboard data in specified format"""
252
- try:
253
- from src.display.utils import export_leaderboard_data
254
-
255
- data = self.load_leaderboard_data()
256
- return export_leaderboard_data(data, format_type)
257
-
258
- except Exception as e:
259
- print(f"Error exporting data: {e}")
260
- return f"Export failed: {str(e)}"
261
-
262
- def validate_data_integrity(self) -> Dict[str, Any]:
263
- """Validate data integrity and return report"""
264
- try:
265
- data = self.load_leaderboard_data()
266
- submissions = self.load_submission_log()
267
-
268
- issues = []
269
-
270
- # Check for duplicate models
271
- model_names = [entry.get("model_name") for entry in data]
272
- duplicates = [name for name in model_names if model_names.count(name) > 1]
273
- if duplicates:
274
- issues.append(f"Duplicate models found: {set(duplicates)}")
275
-
276
- # Check for missing required fields
277
- required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
278
- for i, entry in enumerate(data):
279
- missing = [field for field in required_fields if not entry.get(field)]
280
- if missing:
281
- issues.append(f"Entry {i}: Missing fields {missing}")
282
-
283
- # Check score ranges
284
- for i, entry in enumerate(data):
285
- scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
286
- for score in scores:
287
- value = entry.get(score)
288
- if value is not None and (value < 0 or value > 1):
289
- issues.append(f"Entry {i}: {score} out of range: {value}")
290
-
291
- return {
292
- "is_valid": len(issues) == 0,
293
- "issues": issues,
294
- "total_entries": len(data),
295
- "total_submissions": len(submissions),
296
- "check_date": datetime.now(timezone.utc).isoformat()
297
- }
298
-
299
- except Exception as e:
300
- return {
301
- "is_valid": False,
302
- "issues": [f"Validation failed: {str(e)}"],
303
- "total_entries": 0,
304
- "total_submissions": 0,
305
- "check_date": datetime.now(timezone.utc).isoformat()
306
- }
 
1
  """
2
+ Process CodeReview Bench leaderboard data and submissions.
3
  """
4
 
5
  import json
6
+ import os
7
+ import pandas as pd
8
+ from datetime import datetime
9
+ from typing import Dict, List, Tuple, Optional
10
+ import numpy as np
11
+
12
+ from src.display.utils import (
13
+ CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
14
+ MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
15
+ )
16
+
17
+
18
+ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
19
+ """
20
+ Process a JSONL submission file for CodeReview Bench.
21
 
22
+ Args:
23
+ file_path: Path to the JSONL submission file
24
+
25
+ Returns:
26
+ Tuple of (entries_list, message)
27
+ """
28
+ try:
29
+ entries = []
30
+ with open(file_path, 'r', encoding='utf-8') as f:
31
+ for line_num, line in enumerate(f, 1):
32
+ line = line.strip()
33
+ if not line:
34
+ continue
35
+
36
+ try:
37
+ entry = json.loads(line)
38
+
39
+ # Validate required fields
40
+ required_fields = ['model_name', 'programming_language', 'comment_language']
41
+ missing_fields = [field for field in required_fields if field not in entry]
42
+ if missing_fields:
43
+ return [], f"Missing required fields {missing_fields} in line {line_num}"
44
+
45
+ # Validate metrics exist
46
+ has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
47
+ has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
48
+
49
+ if not has_multimetric and not has_exact_match:
50
+ return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
51
+
52
+ entries.append(entry)
53
+
54
+ except json.JSONDecodeError as e:
55
+ return [], f"Invalid JSON in line {line_num}: {e}"
56
+
57
+ if not entries:
58
+ return [], "No valid entries found in submission file"
59
+
60
+ return entries, f"Successfully processed {len(entries)} entries"
61
+
62
+ except Exception as e:
63
+ return [], f"Error processing submission: {e}"
64
+
65
+
66
+ def calculate_overall_score(entry: Dict) -> float:
67
+ """
68
+ Calculate overall score for a CodeReview Bench entry.
69
 
70
+ Args:
71
+ entry: Dictionary containing model evaluation results
 
 
72
 
73
+ Returns:
74
+ Overall score as float
75
+ """
76
+ # Calculate multimetric average
77
+ multimetric_scores = []
78
+ for metric in MULTIMETRIC_METRICS:
79
+ if metric in entry and isinstance(entry[metric], (int, float)):
80
+ multimetric_scores.append(entry[metric])
81
 
82
+ multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
 
 
 
 
 
 
 
 
83
 
84
+ # Calculate exact match average
85
+ exact_match_scores = []
86
+ for metric in EXACT_MATCH_METRICS:
87
+ if metric in entry and isinstance(entry[metric], (int, float)):
88
+ exact_match_scores.append(entry[metric])
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
 
 
 
 
 
 
 
 
91
 
92
+ # Weighted combination (can be adjusted based on requirements)
93
+ overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ return overall_score
96
+
97
+
98
+ def load_leaderboard_data(file_path: str) -> Dict:
99
+ """
100
+ Load the leaderboard data from a JSON file.
101
+ """
102
+ if not os.path.exists(file_path):
103
+ version = "v0"
104
+ if "_v" in file_path:
105
+ version = file_path.split("_")[-1].split(".")[0]
106
+ return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
107
+
108
+ with open(file_path, 'r') as f:
109
+ data = json.load(f)
110
+
111
+ # Ensure version field exists
112
+ if "version" not in data:
113
+ version = "v0"
114
+ if "_v" in file_path:
115
+ version = file_path.split("_")[-1].split(".")[0]
116
+ data["version"] = version
117
+
118
+ return data
119
+
120
+
121
+ def save_leaderboard_data(data: Dict, file_path: str) -> None:
122
+ """
123
+ Save the leaderboard data to a JSON file.
124
+ """
125
+ # Ensure the directory exists
126
+ os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True)
127
+
128
+ # Update the last_updated timestamp
129
+ data["last_updated"] = datetime.now().isoformat()
130
+
131
+ # Ensure version is set
132
+ if "version" not in data:
133
+ version = "v0"
134
+ if "_v" in file_path:
135
+ version = file_path.split("_")[-1].split(".")[0]
136
+ data["version"] = version
137
+
138
+ with open(file_path, 'w') as f:
139
+ json.dump(data, f, indent=2)
140
+
141
+
142
+ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
143
+ """
144
+ Convert leaderboard data to a pandas DataFrame for display.
145
+ """
146
+ rows = []
147
+
148
+ for entry in leaderboard_data.get("entries", []):
149
+ model_name = entry.get("model_name", "Unknown Model")
150
+
151
+ # Extract basic metadata
152
+ row = {
153
+ "model_name": model_name,
154
+ "model_type": entry.get("model_type", "Unknown"),
155
+ "mode": entry.get("mode", "Strict"),
156
+ "submission_date": entry.get("submission_date", ""),
157
+ "version": entry.get("version", "v0"),
158
+ "review_model_type": entry.get("review_model_type", "custom").lower()
159
+ }
160
+
161
+ # Add additional metadata fields if present
162
+ for key in ["base_model", "revision", "precision", "weight_type", "topic", "programming_language", "comment_language"]:
163
+ if key in entry:
164
+ row[key] = entry[key]
165
+
166
+ # Add multimetric scores
167
+ for metric in MULTIMETRIC_METRICS:
168
+ if metric in entry:
169
+ row[metric] = entry[metric]
170
  else:
171
+ row[metric] = pd.NA
172
+
173
+ # Add exact match metrics
174
+ for metric in EXACT_MATCH_METRICS:
175
+ if metric in entry:
176
+ row[metric] = entry[metric]
177
+ else:
178
+ row[metric] = pd.NA
179
+
180
+ # Calculate aggregated metrics
181
+ multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
182
+ exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
183
+
184
+ if multimetric_scores:
185
+ row["multimetric_average"] = np.mean(multimetric_scores)
186
+ else:
187
+ row["multimetric_average"] = pd.NA
188
+
189
+ if exact_match_scores:
190
+ row["exact_match_average"] = np.mean(exact_match_scores)
191
+ else:
192
+ row["exact_match_average"] = pd.NA
193
+
194
+ # Calculate overall score
195
+ row["overall_score"] = calculate_overall_score(entry)
196
+
197
+ # Add language-specific metrics if available
198
+ for lang in COMMENT_LANGUAGES:
199
+ for metric in ["readability", "relevance", "overall_score"]:
200
+ lang_key = f"{lang}_{metric}"
201
+ if lang_key in entry:
202
+ row[lang_key] = entry[lang_key]
203
+ else:
204
+ row[lang_key] = pd.NA
205
+
206
+ # Add evaluation count
207
+ row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
208
+
209
+ rows.append(row)
210
+
211
+ # Create DataFrame and sort by overall score
212
+ df = pd.DataFrame(rows)
213
+
214
+ # Ensure all expected columns exist
215
+ for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
216
+ if metric not in df.columns:
217
+ df[metric] = pd.NA
218
+
219
+ # Sort by overall score (descending)
220
+ if not df.empty:
221
+ df = df.sort_values(by="overall_score", ascending=False, na_position='last')
222
+
223
+ # Ensure summary columns exist
224
+ summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
225
+ for col in summary_cols:
226
+ if col not in df.columns:
227
+ df[col] = pd.NA
228
+
229
+ return df
230
+
231
+
232
+ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
233
+ """
234
+ Add new entries to the leaderboard, replacing any with the same model name.
235
+ """
236
+ # Create a mapping of existing entries by model name and version
237
+ existing_entries = {
238
+ (entry["model_name"], entry.get("version", "v0")): i
239
+ for i, entry in enumerate(leaderboard_data.get("entries", []))
240
+ }
241
+
242
+ # Process each new entry
243
+ for new_entry in new_entries:
244
+ model_name = new_entry.get("model_name")
245
+ version = new_entry.get("version", "v0")
246
+
247
+ # Add calculated metrics
248
+ new_entry["overall_score"] = calculate_overall_score(new_entry)
249
+
250
+ # Calculate averages
251
+ multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
252
+ exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
253
+
254
+ if multimetric_scores:
255
+ new_entry["multimetric_average"] = np.mean(multimetric_scores)
256
+ if exact_match_scores:
257
+ new_entry["exact_match_average"] = np.mean(exact_match_scores)
258
+
259
+ if (model_name, version) in existing_entries:
260
+ # Replace existing entry
261
+ leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
262
+ else:
263
+ # Add new entry
264
+ if "entries" not in leaderboard_data:
265
+ leaderboard_data["entries"] = []
266
+ leaderboard_data["entries"].append(new_entry)
267
+
268
+ # Update the last_updated timestamp
269
+ leaderboard_data["last_updated"] = datetime.now().isoformat()
270
+
271
+ return leaderboard_data
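
A worked example of the aggregation above, with made-up scores (the model name and values are hypothetical). Note that calculate_overall_score averages the multimetric scores and the exact-match rates separately before the 0.7/0.3 weighting, so the two operands are generally on different scales (e.g. 0-10 vs 0-1):

from src.leaderboard.processor import calculate_overall_score, add_entries_to_leaderboard

entry = {
    "model_name": "example-org/example-model",   # hypothetical
    "programming_language": "Python",
    "comment_language": "en",
    "readability": 7.0, "relevance": 6.0,        # multimetric scores
    "pass_at_1": 0.25, "pass_at_10": 0.40,       # exact-match metrics
}

# mean(7.0, 6.0) * 0.7 + mean(0.25, 0.40) * 0.3 = 4.55 + 0.0975 = 4.6475
print(calculate_overall_score(entry))

board = add_entries_to_leaderboard({"entries": [], "version": "v0"}, [entry])
print(board["entries"][0]["overall_score"], board["last_updated"])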
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/populate.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Populate the CodeReview Bench leaderboard from HuggingFace datasets.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import pandas as pd
8
+ import tempfile
9
+ from typing import Dict, List, Optional
10
+ from datetime import datetime
11
+ import numpy as np
12
+
13
+ from huggingface_hub import hf_hub_download, HfApi
14
+ from datasets import load_dataset
15
+
16
+ from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
17
+ from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
18
+ from src.leaderboard.processor import leaderboard_to_dataframe
19
+
20
+
21
+ def get_latest_leaderboard(version="v0") -> Optional[Dict]:
22
+ """
23
+ Get the latest leaderboard data from HuggingFace dataset.
24
+ Fallback to local JSON file if HF download fails or is unavailable.
25
+ """
26
+ # First try to fetch from HuggingFace Hub
27
+ try:
28
+ leaderboard_path = hf_hub_download(
29
+ repo_id=RESULTS_DATASET_ID,
30
+ filename=f"leaderboards/leaderboard_{version}.json",
31
+ repo_type="dataset",
32
+ token=TOKEN
33
+ )
34
+ with open(leaderboard_path, 'r') as f:
35
+ return json.load(f)
36
+ except Exception as hf_err:
37
+ print(f"HF download failed or unavailable: {hf_err}. Trying local fallback...")
38
+
39
+ # Fallback: attempt to load a local leaderboard_data.json located at the project root
40
+ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
41
+ local_path_candidates = [
42
+ os.path.join(project_root, "leaderboard_data.json"), # legacy path in root
43
+ os.path.join(project_root, "data", "leaderboard.json"), # path defined in envs.py
44
+ ]
45
+
46
+ for local_path in local_path_candidates:
47
+ if os.path.exists(local_path):
48
+ try:
49
+ with open(local_path, 'r') as f:
50
+ return json.load(f)
51
+ except Exception as local_err:
52
+ print(f"Error loading local leaderboard file {local_path}: {local_err}")
53
+
54
+ # If nothing found, return None
55
+ return None
56
+
57
+
58
+ def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
59
+ """
60
+ Get a specific model's entry from the entries folder, uniquely identified by model_name, mode, and version.
61
+ """
62
+ try:
63
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
64
+ mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
65
+ entry_path = hf_hub_download(
66
+ repo_id=RESULTS_DATASET_ID,
67
+ filename=f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json",
68
+ repo_type="dataset",
69
+ token=TOKEN
70
+ )
71
+ with open(entry_path, 'r') as f:
72
+ return json.load(f)
73
+ except Exception as e:
74
+ print(f"Error downloading model entry: {e}")
75
+ return None
76
+
77
+
78
+ def get_all_entries(version="v0") -> List[Dict]:
79
+ """
80
+ Get all entries from the HuggingFace dataset.
81
+ """
82
+ try:
83
+ api = HfApi(token=TOKEN)
84
+ files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
85
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
86
+
87
+ all_entries = []
88
+ for entry_file in entry_files:
89
+ try:
90
+ entry_path = hf_hub_download(
91
+ repo_id=RESULTS_DATASET_ID,
92
+ filename=entry_file,
93
+ repo_type="dataset",
94
+ token=TOKEN
95
+ )
96
+ with open(entry_path, 'r') as f:
97
+ entry_data = json.load(f)
98
+ all_entries.append(entry_data)
99
+ except Exception as e:
100
+ print(f"Error loading entry {entry_file}: {e}")
101
+
102
+ return all_entries
103
+ except Exception as e:
104
+ print(f"Error getting all entries: {e}")
105
+ return []
106
+
107
+
108
+ def get_leaderboard_df(version="v0") -> pd.DataFrame:
109
+ """
110
+ Get the leaderboard data as a DataFrame.
111
+ """
112
+ # Get latest leaderboard data
113
+ leaderboard_data = get_latest_leaderboard(version)
114
+
115
+ if not leaderboard_data:
116
+ # If no leaderboard exists, try to build it from entries
117
+ entries = get_all_entries(version)
118
+ if entries:
119
+ leaderboard_data = {
120
+ "entries": entries,
121
+ "last_updated": datetime.now().isoformat(),
122
+ "version": version
123
+ }
124
+ else:
125
+ # Return empty DataFrame if no data available
126
+ return pd.DataFrame(columns=DISPLAY_COLS)
127
+
128
+ # Convert to DataFrame
129
+ return leaderboard_to_dataframe(leaderboard_data)
130
+
131
+
132
+ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
133
+ """
134
+ Get the leaderboard data filtered by a specific programming language category.
135
+ """
136
+ # Get latest leaderboard data
137
+ leaderboard_data = get_latest_leaderboard(version)
138
+
139
+ if not leaderboard_data:
140
+ # If no leaderboard exists, try to build it from entries
141
+ entries = get_all_entries(version)
142
+ if entries:
143
+ leaderboard_data = {
144
+ "entries": entries,
145
+ "last_updated": datetime.now().isoformat(),
146
+ "version": version
147
+ }
148
+ else:
149
+ # Return empty DataFrame if no data available
150
+ return pd.DataFrame(columns=DISPLAY_COLS)
151
+
152
+ # Filter entries to only include those with data for the specified programming language
153
+ filtered_entries = []
154
+ for entry in leaderboard_data.get("entries", []):
155
+ # Check if entry has data for this programming language
156
+ programming_language = entry.get("programming_language", "").lower()
157
+ if programming_language == category.lower() or category.lower() == "other":
158
+ # For "other" category, include entries that don't match any specific language
159
+ if category.lower() == "other":
160
+ if programming_language not in [cat.lower() for cat in CATEGORIES]:  # CATEGORIES lists only concrete languages, so no slice is needed
161
+ filtered_entries.append(entry)
162
+ else:
163
+ filtered_entries.append(entry)
164
+
165
+ # Create a new leaderboard data structure with the filtered entries
166
+ filtered_leaderboard = {
167
+ "entries": filtered_entries,
168
+ "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
169
+ "version": version
170
+ }
171
+
172
+ # Convert to DataFrame
173
+ return leaderboard_to_dataframe(filtered_leaderboard)
174
+
175
+
176
+ def get_detailed_model_data(model_name: str, mode: str, version="v0") -> Dict:
177
+ """
178
+ Get detailed data for a specific model and mode.
179
+ """
180
+ entry = get_model_entry(model_name, mode, version)
181
+ if entry:
182
+ return entry
183
+ leaderboard_data = get_latest_leaderboard(version)
184
+ if leaderboard_data:
185
+ for entry in leaderboard_data.get("entries", []):
186
+ if entry.get("model_name") == model_name and str(entry.get("mode")).lower() == str(mode).lower():
187
+ return entry
188
+ return {}
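
A sketch of pulling the leaderboard views defined above (assumes RESULTS_DATASET_ID and HF_TOKEN are configured, or that a local leaderboard JSON exists for the fallback path; the model name and mode are hypothetical):

from src.populate import (
    get_leaderboard_df, get_category_leaderboard_df, get_detailed_model_data,
)

df = get_leaderboard_df(version="v0")
print(df.shape, list(df.columns)[:5])

python_df = get_category_leaderboard_df("Python", version="v0")
print(python_df[["model_name", "overall_score"]].head() if not python_df.empty else "no Python entries")

details = get_detailed_model_data("example-org/example-model", mode="strict", version="v0")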
src/submission/submit.py CHANGED
@@ -1,386 +1,184 @@
1
  """
2
- Submission system for CodeReview Leaderboard
3
  """
4
 
5
- import gradio as gr
6
- import re
7
- from typing import Dict, Any, List, Tuple
8
- from datetime import datetime, timezone
9
- from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
10
- from src.leaderboard.processor import LeaderboardProcessor
11
- from src.display.utils import get_main_leaderboard_data, get_quality_metrics_data
12
 
13
- class SubmissionHandler:
14
- """Handles model submissions with validation and rate limiting"""
15
-
16
- def __init__(self):
17
- self.processor = LeaderboardProcessor()
18
-
19
- def get_client_ip(self, request: gr.Request) -> str:
20
- """Extract client IP address from request"""
21
- try:
22
- # Check for forwarded headers first
23
- forwarded_for = request.headers.get('X-Forwarded-For')
24
- if forwarded_for:
25
- # Take the first IP if multiple
26
- ip = forwarded_for.split(',')[0].strip()
27
- return ip
28
-
29
- # Check for real IP header
30
- real_ip = request.headers.get('X-Real-IP')
31
- if real_ip:
32
- return real_ip.strip()
33
-
34
- # Fall back to client host
35
- if hasattr(request, 'client') and hasattr(request.client, 'host'):
36
- return request.client.host
37
-
38
- # Default fallback
39
- return "127.0.0.1"
40
-
41
- except Exception as e:
42
- print(f"Error getting client IP: {e}")
43
- return "127.0.0.1"
44
-
45
- def validate_model_name(self, model_name: str) -> Tuple[bool, str]:
46
- """Validate model name format"""
47
- if not model_name or not model_name.strip():
48
- return False, "Model name cannot be empty"
49
-
50
- model_name = model_name.strip()
51
-
52
- # Check length
53
- if len(model_name) > 100:
54
- return False, "Model name too long (max 100 characters)"
55
-
56
- # Check for valid characters
57
- if not re.match(r'^[a-zA-Z0-9._/-]+$', model_name):
58
- return False, "Model name contains invalid characters (only letters, numbers, dots, hyphens, underscores, and slashes allowed)"
59
-
60
- # Check for organization/model format
61
- if "/" in model_name:
62
- parts = model_name.split("/")
63
- if len(parts) != 2:
64
- return False, "Model name should be in format 'organization/model'"
65
- if not parts[0] or not parts[1]:
66
- return False, "Both organization and model name must be specified"
67
-
68
- return True, "Valid model name"
69
-
70
- def validate_scores(self, scores: Dict[str, float]) -> Tuple[bool, str]:
71
- """Validate score values"""
72
- required_scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
73
-
74
- for score_name in required_scores:
75
- value = scores.get(score_name)
76
-
77
- if value is None:
78
- return False, f"Missing score: {score_name}"
79
-
80
- if not isinstance(value, (int, float)):
81
- return False, f"Invalid score format for {score_name}: must be a number"
82
-
83
- if not (0 <= value <= 1):
84
- return False, f"Score {score_name} out of range: {value} (must be between 0 and 1)"
85
-
86
- # Check logical consistency
87
- if scores["llm_pass_1"] > scores["llm_pass_5"]:
88
- return False, "Pass@1 score cannot be higher than Pass@5"
89
-
90
- if scores["llm_pass_5"] > scores["llm_pass_10"]:
91
- return False, "Pass@5 score cannot be higher than Pass@10"
92
-
93
- return True, "Valid scores"
94
-
95
- def validate_metrics(self, metrics: Dict[str, int]) -> Tuple[bool, str]:
96
- """Validate quality metrics"""
97
- for metric_name in QUALITY_METRICS:
98
- value = metrics.get(metric_name)
99
-
100
- if value is None:
101
- return False, f"Missing metric: {metric_name}"
102
-
103
- if not isinstance(value, (int, float)):
104
- return False, f"Invalid metric format for {metric_name}: must be a number"
105
-
106
- if not (0 <= value <= 10):
107
- return False, f"Metric {metric_name} out of range: {value} (must be between 0 and 10)"
108
-
109
- return True, "Valid metrics"
110
-
111
- def submit_model(
112
- self,
113
- request: gr.Request,
114
- current_data: List[Dict],
115
- model_name: str,
116
- programming_language: str,
117
- comment_language: str,
118
- taxonomy_category: str,
119
- bleu: float,
120
- llm_pass_1: float,
121
- llm_pass_5: float,
122
- llm_pass_10: float,
123
- readability: int,
124
- relevance: int,
125
- explanation_clarity: int,
126
- problem_identification: int,
127
- actionability: int,
128
- completeness: int,
129
- specificity: int,
130
- contextual_adequacy: int,
131
- consistency: int,
132
- brevity: int,
133
- ) -> Tuple[List[Dict], List[List[str]], List[List[str]], str]:
134
- """Handle model submission with full validation"""
135
-
136
- try:
137
- # Get client IP
138
- client_ip = self.get_client_ip(request)
139
-
140
- # Check rate limiting
141
- rate_ok, rate_msg = self.processor.check_rate_limit(client_ip)
142
- if not rate_ok:
143
- return current_data, [], [], f"❌ {rate_msg}"
144
-
145
- # Validate model name
146
- name_valid, name_msg = self.validate_model_name(model_name)
147
- if not name_valid:
148
- return current_data, [], [], f"❌ {name_msg}"
149
-
150
- # Validate scores
151
- scores = {
152
- "bleu": bleu,
153
- "llm_pass_1": llm_pass_1,
154
- "llm_pass_5": llm_pass_5,
155
- "llm_pass_10": llm_pass_10
156
- }
157
- scores_valid, scores_msg = self.validate_scores(scores)
158
- if not scores_valid:
159
- return current_data, [], [], f"❌ {scores_msg}"
160
-
161
- # Validate metrics
162
- metrics = {
163
- "readability": readability,
164
- "relevance": relevance,
165
- "explanation_clarity": explanation_clarity,
166
- "problem_identification": problem_identification,
167
- "actionability": actionability,
168
- "completeness": completeness,
169
- "specificity": specificity,
170
- "contextual_adequacy": contextual_adequacy,
171
- "consistency": consistency,
172
- "brevity": brevity,
173
- }
174
- metrics_valid, metrics_msg = self.validate_metrics(metrics)
175
- if not metrics_valid:
176
- return current_data, [], [], f"❌ {metrics_msg}"
177
-
178
- # Create submission data
179
- submission_data = {
180
- "model_name": model_name.strip(),
181
- "programming_language": programming_language,
182
- "comment_language": comment_language,
183
- "taxonomy_category": taxonomy_category,
184
- "bleu": bleu,
185
- "llm_pass_1": llm_pass_1,
186
- "llm_pass_5": llm_pass_5,
187
- "llm_pass_10": llm_pass_10,
188
- "metrics": metrics
189
- }
190
-
191
- # Submit to processor
192
- success, message = self.processor.add_submission(submission_data, client_ip)
193
-
194
- if success:
195
- # Load updated data
196
- updated_data = self.processor.load_leaderboard_data()
197
-
198
- # Format tables
199
- main_table = get_main_leaderboard_data(updated_data)
200
- quality_table = get_quality_metrics_data(updated_data)
201
-
202
- return updated_data, main_table, quality_table, message
203
- else:
204
- return current_data, [], [], message
205
-
206
- except Exception as e:
207
- print(f"Error in submission: {e}")
208
- return current_data, [], [], f"❌ Submission failed: {str(e)}"
209
-
210
- def get_submission_form_components(self):
211
- """Create gradio components for submission form"""
212
 
213
- with gr.Accordion("📝 Submit New Model Results", open=False):
214
- gr.Markdown("""
215
- ### Submission Guidelines
216
- - Provide accurate scores based on proper evaluation
217
- - Model name should follow 'organization/model' format
218
- - All metrics are required
219
- - Submissions are rate-limited per IP address
220
- """)
221
-
222
- with gr.Row():
223
- model_name = gr.Textbox(
224
- label="Model Name",
225
- placeholder="e.g., microsoft/CodeT5-base",
226
- info="Use organization/model format"
227
- )
228
- programming_language = gr.Dropdown(
229
- choices=PROGRAMMING_LANGUAGES,
230
- value="All",
231
- label="Programming Language",
232
- info="Primary programming language evaluated"
233
- )
234
- comment_language = gr.Dropdown(
235
- choices=COMMENT_LANGUAGES,
236
- value="English",
237
- label="Comment Language",
238
- info="Natural language of code comments"
239
- )
240
- taxonomy_category = gr.Dropdown(
241
- choices=TAXONOMY_CATEGORIES,
242
- value="All",
243
- label="Taxonomy Category",
244
- info="Primary review category focus"
245
- )
246
-
247
- gr.Markdown("### 📊 Performance Scores (0.0 - 1.0)")
248
- with gr.Row():
249
- bleu = gr.Number(
250
- label="BLEU Score",
251
- value=0.0,
252
- minimum=0.0,
253
- maximum=1.0,
254
- step=0.001,
255
- info="BLEU similarity score"
256
- )
257
- pass1 = gr.Number(
258
- label="Pass@1",
259
- value=0.0,
260
- minimum=0.0,
261
- maximum=1.0,
262
- step=0.001,
263
- info="Success rate in 1 attempt"
264
- )
265
- pass5 = gr.Number(
266
- label="Pass@5",
267
- value=0.0,
268
- minimum=0.0,
269
- maximum=1.0,
270
- step=0.001,
271
- info="Success rate in 5 attempts"
272
- )
273
- pass10 = gr.Number(
274
- label="Pass@10",
275
- value=0.0,
276
- minimum=0.0,
277
- maximum=1.0,
278
- step=0.001,
279
- info="Success rate in 10 attempts"
280
- )
281
-
282
- gr.Markdown("### 📋 Quality Metrics (0 - 10)")
283
- with gr.Row():
284
- readability = gr.Slider(
285
- minimum=0, maximum=10, value=5, step=1,
286
- label="Readability",
287
- info="How readable are the generated reviews?"
288
- )
289
- relevance = gr.Slider(
290
- minimum=0, maximum=10, value=5, step=1,
291
- label="Relevance",
292
- info="How relevant to the code changes?"
293
- )
294
- explanation_clarity = gr.Slider(
295
- minimum=0, maximum=10, value=5, step=1,
296
- label="Explanation Clarity",
297
- info="How clear are the explanations?"
298
- )
299
- problem_identification = gr.Slider(
300
- minimum=0, maximum=10, value=5, step=1,
301
- label="Problem Identification",
302
- info="How well does it identify issues?"
303
- )
304
- actionability = gr.Slider(
305
- minimum=0, maximum=10, value=5, step=1,
306
- label="Actionability",
307
- info="How actionable are the suggestions?"
308
- )
309
-
310
- with gr.Row():
311
- completeness = gr.Slider(
312
- minimum=0, maximum=10, value=5, step=1,
313
- label="Completeness",
314
- info="How complete are the reviews?"
315
- )
316
- specificity = gr.Slider(
317
- minimum=0, maximum=10, value=5, step=1,
318
- label="Specificity",
319
- info="How specific are the comments?"
320
- )
321
- contextual_adequacy = gr.Slider(
322
- minimum=0, maximum=10, value=5, step=1,
323
- label="Contextual Adequacy",
324
- info="How well does it understand context?"
325
- )
326
- consistency = gr.Slider(
327
- minimum=0, maximum=10, value=5, step=1,
328
- label="Consistency",
329
- info="How consistent across reviews?"
330
- )
331
- brevity = gr.Slider(
332
- minimum=0, maximum=10, value=5, step=1,
333
- label="Brevity",
334
- info="How concise are the reviews?"
335
- )
336
-
337
- submit_btn = gr.Button("🚀 Submit Model", variant="primary")
338
- status_msg = gr.Markdown("")
339
-
340
- # Return all components for use in the main app
341
- return {
342
- "model_name": model_name,
343
- "programming_language": programming_language,
344
- "comment_language": comment_language,
345
- "taxonomy_category": taxonomy_category,
346
- "bleu": bleu,
347
- "pass1": pass1,
348
- "pass5": pass5,
349
- "pass10": pass10,
350
- "readability": readability,
351
- "relevance": relevance,
352
- "explanation_clarity": explanation_clarity,
353
- "problem_identification": problem_identification,
354
- "actionability": actionability,
355
- "completeness": completeness,
356
- "specificity": specificity,
357
- "contextual_adequacy": contextual_adequacy,
358
- "consistency": consistency,
359
- "brevity": brevity,
360
- "submit_btn": submit_btn,
361
- "status_msg": status_msg,
362
- }
363
-
364
- def get_submission_history(self, ip_address: str) -> List[List[str]]:
365
- """Get submission history for display"""
366
  try:
367
- submissions = self.processor.get_ip_submissions(ip_address)
368
-
369
- table_data = []
370
- for sub in submissions:
371
- row = [
372
- sub.get("model_name", ""),
373
- sub.get("programming_language", ""),
374
- sub.get("comment_language", ""),
375
- sub.get("taxonomy_category", ""),
376
- f"{sub.get('scores', {}).get('llm_pass_1', 0):.3f}",
377
- sub.get("submission_date", "").split("T")[0] if sub.get("submission_date") else "",
378
- sub.get("status", "")
379
- ]
380
- table_data.append(row)
381
-
382
- return table_data
383
-
384
- except Exception as e:
385
- print(f"Error getting submission history: {e}")
386
- return []
 
1
  """
2
+ Handle submissions to the CodeReview Bench leaderboard.
3
  """
4
 
5
+ import json
6
+ import os
7
+ import tempfile
8
+ from datetime import datetime
9
+ from typing import Dict, List, Tuple
 
 
10
 
11
+ from huggingface_hub import HfApi
12
+ from datasets import load_dataset
13
+
14
+ from src.display.formatting import styled_error, styled_message
15
+ from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
16
+ from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard
17
+
18
+
19
+ def validate_submission(file_path: str) -> Tuple[bool, str]:
20
+ """
21
+ Validate a submission file.
22
+ """
23
+ try:
24
+ entries, message = process_jsonl_submission(file_path)
25
+ if not entries:
26
+ return False, message
27
+ return True, "Submission is valid"
28
+ except Exception as e:
29
+ return False, f"Error validating submission: {e}"
30
+
31
+
32
+ def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
33
+ """
34
+ Submit a model's evaluation entry to the HuggingFace dataset. The entry is uniquely identified by model_name, mode, and version.
35
+ """
36
+ try:
37
+ # Create safe model name for file path
38
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
39
+ mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
40
+
41
+ # Create entry path in entries folder
42
+ entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
43
+
44
+ # Save entry to temporary file
45
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
46
+ json.dump(entry, temp_file, indent=2)
47
+ temp_path = temp_file.name
48
+
49
+ # Upload file
50
+ api = HfApi(token=TOKEN)
51
+ api.upload_file(
52
+ path_or_fileobj=temp_path,
53
+ path_in_repo=entry_path,
54
+ repo_id=RESULTS_DATASET_ID,
55
+ repo_type="dataset",
56
+ commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
57
+ )
58
+
59
+ os.unlink(temp_path)
60
+ return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
61
+ except Exception as e:
62
+ return False, f"Error submitting entry to dataset: {e}"
63
+
64
+
65
+ def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
66
+ """
67
+ Submit updated leaderboard to the HuggingFace dataset.
68
+ """
69
+ try:
70
+ # Create leaderboard data
71
+ leaderboard_data = {
72
+ "entries": entries,
73
+ "last_updated": datetime.now().isoformat(),
74
+ "version": version
75
+ }
76
+
77
+ # Save to temporary file
78
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
79
+ json.dump(leaderboard_data, temp_file, indent=2)
80
+ temp_path = temp_file.name
81
+
82
+ # Upload file
83
+ api = HfApi(token=TOKEN)
84
+ api.upload_file(
85
+ path_or_fileobj=temp_path,
86
+ path_in_repo=f"leaderboards/leaderboard_{version}.json",
87
+ repo_id=RESULTS_DATASET_ID,
88
+ repo_type="dataset",
89
+ commit_message=f"Update leaderboard for version {version}"
90
+ )
91
+
92
+ os.unlink(temp_path)
93
+ return True, "Leaderboard updated successfully"
94
+ except Exception as e:
95
+ return False, f"Error updating leaderboard: {e}"
96
+
97
+
98
+ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
99
+ """
100
+ Process a submission to the CodeReview Bench leaderboard.
101
+ """
102
+ try:
103
+ # Validate submission
104
+ is_valid, validation_message = validate_submission(file_path)
105
+ if not is_valid:
106
+ return styled_error(validation_message)
107
+
108
+ # Process the submission entries
109
+ entries, message = process_jsonl_submission(file_path)
110
+ if not entries:
111
+ return styled_error(f"Failed to process submission: {message}")
112
+
113
+ # Upload raw submission file
114
+ model_name = metadata.get("model_name", "unknown")
115
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
+ api = HfApi(token=TOKEN)
118
+ submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
119
+ api.upload_file(
120
+ path_or_fileobj=file_path,
121
+ path_in_repo=submission_path,
122
+ repo_id=RESULTS_DATASET_ID,
123
+ repo_type="dataset",
124
+ commit_message=f"Add raw submission for {model_name}"
125
+ )
126
+
127
+ # Process entries and add metadata
128
+ processed_entries = []
129
+ for entry in entries:
130
+ # Add metadata to entry
131
+ entry.update({
132
+ "model_name": metadata.get("model_name"),
133
+ "model_type": metadata.get("model_type"),
134
+ "review_model_type": str(metadata.get("review_model_type", "custom")).lower(),
135
+ "mode": metadata.get("mode"),
136
+ "base_model": metadata.get("base_model"),
137
+ "revision": metadata.get("revision"),
138
+ "precision": metadata.get("precision"),
139
+ "weight_type": metadata.get("weight_type"),
140
+ "version": version,
141
+ "submission_date": datetime.now().isoformat()
142
+ })
143
+ processed_entries.append(entry)
144
+
145
+ # Submit entries to entries folder
146
+ for entry in processed_entries:
147
+ success, message = submit_entry_to_hub(entry, model_name, metadata.get("mode"), version)
148
+ if not success:
149
+ return styled_error(message)
150
+
151
+ # Get all entries from HF dataset and update leaderboard
152
+ files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
153
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
154
+
155
+ all_entries = []
156
+ for entry_file in entry_files:
157
+ try:
158
+ entry_path = api.hf_hub_download(
159
+ repo_id=RESULTS_DATASET_ID,
160
+ filename=entry_file,
161
+ repo_type="dataset",
162
+ )
163
+ with open(entry_path, 'r') as f:
164
+ entry_data = json.load(f)
165
+ all_entries.append(entry_data)
166
+ except Exception as e:
167
+ print(f"Error loading entry {entry_file}: {e}")
168
+
169
+ # Update leaderboard with all entries
170
+ success, message = submit_leaderboard_to_hub(all_entries, version)
171
+ if not success:
172
+ return styled_error(message)
173
+
174
+ return styled_message("Submission successful! Model evaluated and leaderboard updated.")
175
+
176
+ except Exception as e:
177
+ return styled_error(f"Error processing submission: {e}")
178
+ finally:
179
+ # Clean up temporary files if they exist
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  try:
181
+ if os.path.exists(file_path):
182
+ os.remove(file_path)
183
+ except OSError:
184
+ pass
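
Finally, a hedged end-to-end sketch of submitting results with the functions above. The record contents, metadata values, and metric numbers are illustrative only, and the call only succeeds with write access to RESULTS_DATASET_ID:

import json
import tempfile

from src.submission.submit import process_submission

record = {
    "model_name": "example-org/example-model",   # hypothetical
    "programming_language": "Python",
    "comment_language": "en",
    "readability": 7.5, "relevance": 8.0,        # at least one multimetric or exact-match metric is required
    "pass_at_1": 0.31,
}

with tempfile.NamedTemporaryFile("w", suffix=".jsonl", delete=False) as f:
    f.write(json.dumps(record) + "\n")
    submission_path = f.name

metadata = {
    "model_name": "example-org/example-model",
    "model_type": "open-source",     # free-form; not validated by process_submission
    "review_model_type": "custom",
    "mode": "strict",
}

# process_submission validates the JSONL, uploads the raw file and per-entry JSONs,
# rebuilds the versioned leaderboard, and removes submission_path when done.
print(process_submission(submission_path, metadata, version="v0"))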