Spaces:
Sleeping
Sleeping
Alex
commited on
Commit
·
982b341
1
Parent(s):
2812333
zalupa1
Browse files- README.md +196 -16
- app.py +347 -257
- data/.gitkeep +1 -0
- data/leaderboard_data.json +30 -0
- data/submissions.json +5 -0
- requirements.txt +4 -3
- src/__init__.py +1 -0
- src/about.py +48 -0
- src/display/__init__.py +1 -0
- src/display/css_html_js.py +305 -0
- src/display/formatting.py +182 -0
- src/display/utils.py +292 -0
- src/envs.py +106 -0
- src/leaderboard/__init__.py +1 -0
- src/leaderboard/processor.py +306 -0
- src/submission/__init__.py +1 -0
- src/submission/submit.py +386 -0
README.md
CHANGED
@@ -12,31 +12,211 @@ sdk_version: 5.19.0
|
|
12 |
storage: persistent
|
13 |
---
|
14 |
|
15 |
-
# CodeReview Leaderboard
|
16 |
|
17 |
-
A leaderboard for
|
18 |
|
19 |
-
##
|
20 |
|
21 |
-
###
|
22 |
|
23 |
-
- **
|
24 |
-
- **
|
|
|
|
|
25 |
|
26 |
-
###
|
27 |
|
28 |
-
-
|
29 |
-
-
|
30 |
-
-
|
|
|
31 |
|
32 |
-
|
33 |
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
```bash
|
37 |
-
|
38 |
-
-H "Content-Type: application/json" \
|
39 |
-
-d '{"data": ["org/model", 0.68, 0.73, 0.82, 0.87, 8, 7, 8, 7, 6, 7, 6, 7, 6, 5]}'
|
40 |
```
|
41 |
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
storage: persistent
|
13 |
---
|
14 |
|
15 |
+
# 🏆 CodeReview Leaderboard
|
16 |
|
17 |
+
A comprehensive benchmark and leaderboard for code review generation models, inspired by [circle-guard-bench](https://huggingface.co/spaces/whitecircle-ai/circle-guard-bench).
|
18 |
|
19 |
+
## ✨ Features
|
20 |
|
21 |
+
### 🎯 Core Functionality
|
22 |
|
23 |
+
- **Multi-dimensional Evaluation**: Track models across BLEU scores, Pass@1/5/10 metrics, and 10 quality dimensions
|
24 |
+
- **Advanced Filtering**: Filter results by programming language, comment language, and taxonomy category
|
25 |
+
- **Real-time Updates**: Dynamic leaderboard updates with instant filtering
|
26 |
+
- **Dark Theme**: Modern, eye-friendly interface with GitHub-inspired dark theme
|
27 |
|
28 |
+
### 🔍 Advanced Analytics
|
29 |
|
30 |
+
- **Language Performance**: Compare model performance across programming languages
|
31 |
+
- **Category Analysis**: Analyze performance by review type (bug detection, security, etc.)
|
32 |
+
- **Submission History**: Track all submissions with IP-based logging
|
33 |
+
- **Statistical Insights**: Comprehensive statistics and trend analysis
|
34 |
|
35 |
+
### 🛡️ Security & Quality
|
36 |
|
37 |
+
- **IP-based Rate Limiting**: Prevent spam submissions (5 per 24 hours per IP)
|
38 |
+
- **Comprehensive Validation**: Multi-layer validation for all submissions
|
39 |
+
- **Audit Trail**: Complete submission logging for transparency
|
40 |
+
- **Data Integrity**: Automatic data validation and backup systems
|
41 |
+
|
42 |
+
### 🌐 Multi-Language Support
|
43 |
+
|
44 |
+
- **Programming Languages**: Python, JavaScript, Java, C++, Go, Rust, and more
|
45 |
+
- **Comment Languages**: English, Chinese, Spanish, French, German, Japanese, and more
|
46 |
+
- **Taxonomy Categories**: Bug Detection, Security, Performance, Style, and more
|
47 |
+
|
48 |
+
## 🚀 Quick Start
|
49 |
+
|
50 |
+
### Installation
|
51 |
+
|
52 |
+
```bash
|
53 |
+
pip install -r requirements.txt
|
54 |
+
```
|
55 |
+
|
56 |
+
### Run Locally
|
57 |
|
58 |
```bash
|
59 |
+
python app.py
|
|
|
|
|
60 |
```
|
61 |
|
62 |
+
### Access the Interface
|
63 |
+
|
64 |
+
Open your browser to `http://localhost:7860`
|
65 |
+
|
66 |
+
## 📊 Usage Guide
|
67 |
+
|
68 |
+
### 1. Viewing the Leaderboard
|
69 |
+
|
70 |
+
- Navigate to the **🏆 Leaderboard** tab
|
71 |
+
- Use the filter dropdowns to narrow results:
|
72 |
+
- **Programming Language**: Filter by specific programming languages
|
73 |
+
- **Comment Language**: Filter by natural language of comments
|
74 |
+
- **Taxonomy Category**: Filter by review category type
|
75 |
+
- Click **🔄 Refresh** to update data
|
76 |
+
|
77 |
+
### 2. Submitting Models
|
78 |
+
|
79 |
+
- Go to the **📝 Submit Model** tab
|
80 |
+
- Fill in the submission form:
|
81 |
+
- **Model Name**: Use `organization/model` format
|
82 |
+
- **Languages & Category**: Select appropriate filters
|
83 |
+
- **Performance Scores**: Provide BLEU and Pass@k scores (0.0-1.0)
|
84 |
+
- **Quality Metrics**: Rate across 10 dimensions (0-10)
|
85 |
+
- Click **🚀 Submit Model** to add your results
|
86 |
+
|
87 |
+
### 3. Analytics & Insights
|
88 |
+
|
89 |
+
- Visit the **📈 Analytics** tab to see:
|
90 |
+
- Recent submission history
|
91 |
+
- Language performance comparisons
|
92 |
+
- Category performance analysis
|
93 |
+
- Trends and patterns
|
94 |
+
|
95 |
+
### 4. Data Export
|
96 |
+
|
97 |
+
- Use the **ℹ️ About** tab to export data in JSON or CSV format
|
98 |
+
- Full leaderboard data available for research and analysis
|
99 |
+
|
100 |
+
## 🏗️ Architecture
|
101 |
+
|
102 |
+
### Directory Structure
|
103 |
+
|
104 |
+
```
|
105 |
+
├── src/
|
106 |
+
│ ├── about.py # About page content
|
107 |
+
│ ├── envs.py # Environment configuration
|
108 |
+
│ ├── display/ # Display utilities
|
109 |
+
│ │ ├── css_html_js.py # Styling and themes
|
110 |
+
│ │ ├── formatting.py # Data formatting
|
111 |
+
│ │ └── utils.py # Display utilities
|
112 |
+
│ ├── leaderboard/ # Leaderboard processing
|
113 |
+
│ │ └── processor.py # Data operations
|
114 |
+
│ └── submission/ # Submission handling
|
115 |
+
│ └── submit.py # Submission validation
|
116 |
+
├── data/ # Data storage
|
117 |
+
│ ├── leaderboard_data.json # Main leaderboard
|
118 |
+
│ └── submissions.json # Submission log
|
119 |
+
├── app.py # Main application
|
120 |
+
└── requirements.txt # Dependencies
|
121 |
+
```
|
122 |
+
|
123 |
+
### Key Components
|
124 |
+
|
125 |
+
- **LeaderboardProcessor**: Handles all data operations, validation, and persistence
|
126 |
+
- **SubmissionHandler**: Manages model submissions with IP tracking and validation
|
127 |
+
- **Display Utils**: Provides filtering, formatting, and table generation
|
128 |
+
- **Dark Theme**: Custom CSS for modern, accessible interface
|
129 |
+
|
130 |
+
## 🎨 Features Inspired by circle-guard-bench
|
131 |
+
|
132 |
+
### ✅ Implemented Features
|
133 |
+
|
134 |
+
- **Multi-tab Interface**: Organized navigation with dedicated sections
|
135 |
+
- **Advanced Filtering**: Real-time filtering by multiple criteria
|
136 |
+
- **Dark Theme**: Modern, GitHub-inspired dark interface
|
137 |
+
- **IP-based Submissions**: Secure submission tracking
|
138 |
+
- **Comprehensive Analytics**: Detailed performance insights
|
139 |
+
- **Data Export**: Multiple export formats
|
140 |
+
- **Rate Limiting**: Anti-spam protection
|
141 |
+
|
142 |
+
### 🔧 Technical Improvements
|
143 |
+
|
144 |
+
- **Modular Architecture**: Clean separation of concerns
|
145 |
+
- **Type Safety**: Full type annotations throughout
|
146 |
+
- **Error Handling**: Comprehensive error handling and logging
|
147 |
+
- **Data Validation**: Multi-layer validation with Pydantic
|
148 |
+
- **Performance**: Optimized data processing and display
|
149 |
+
|
150 |
+
## 📈 Metrics & Evaluation
|
151 |
+
|
152 |
+
### Performance Metrics
|
153 |
+
|
154 |
+
- **BLEU**: Text similarity score (0.0-1.0)
|
155 |
+
- **Pass@1**: Success rate in single attempt (0.0-1.0)
|
156 |
+
- **Pass@5**: Success rate in 5 attempts (0.0-1.0)
|
157 |
+
- **Pass@10**: Success rate in 10 attempts (0.0-1.0)
|
158 |
+
|
159 |
+
### Quality Dimensions
|
160 |
+
|
161 |
+
1. **Readability**: How clear and readable are the reviews?
|
162 |
+
2. **Relevance**: How relevant to the code changes?
|
163 |
+
3. **Explanation Clarity**: How well does it explain issues?
|
164 |
+
4. **Problem Identification**: How effectively does it identify problems?
|
165 |
+
5. **Actionability**: How actionable are the suggestions?
|
166 |
+
6. **Completeness**: How thorough are the reviews?
|
167 |
+
7. **Specificity**: How specific are the comments?
|
168 |
+
8. **Contextual Adequacy**: How well does it understand context?
|
169 |
+
9. **Consistency**: How consistent across different reviews?
|
170 |
+
10. **Brevity**: How concise without losing important information?
|
171 |
+
|
172 |
+
## 🔒 Security Features
|
173 |
+
|
174 |
+
### Rate Limiting
|
175 |
+
|
176 |
+
- **5 submissions per IP per 24 hours**
|
177 |
+
- **Automatic IP tracking and logging**
|
178 |
+
- **Graceful error handling for rate limits**
|
179 |
+
|
180 |
+
### Data Validation
|
181 |
+
|
182 |
+
- **Model name format validation**
|
183 |
+
- **Score range validation (0.0-1.0 for performance, 0-10 for quality)**
|
184 |
+
- **Logical consistency checks (Pass@1 ≤ Pass@5 ≤ Pass@10)**
|
185 |
+
- **Required field validation**
|
186 |
+
|
187 |
+
### Audit Trail
|
188 |
+
|
189 |
+
- **Complete submission logging**
|
190 |
+
- **IP address tracking (partially masked for privacy)**
|
191 |
+
- **Timestamp recording**
|
192 |
+
- **Data integrity checks**
|
193 |
+
|
194 |
+
## 🤝 Contributing
|
195 |
+
|
196 |
+
1. Fork the repository
|
197 |
+
2. Create a feature branch
|
198 |
+
3. Make your changes
|
199 |
+
4. Add tests if applicable
|
200 |
+
5. Submit a pull request
|
201 |
+
|
202 |
+
## 📄 License
|
203 |
+
|
204 |
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
205 |
+
|
206 |
+
## 🙏 Acknowledgments
|
207 |
+
|
208 |
+
- Inspired by [circle-guard-bench](https://huggingface.co/spaces/whitecircle-ai/circle-guard-bench)
|
209 |
+
- Built with [Gradio](https://gradio.app/) for the web interface
|
210 |
+
- Thanks to the open-source community for tools and inspiration
|
211 |
+
|
212 |
+
## 📞 Support
|
213 |
+
|
214 |
+
For questions, issues, or contributions:
|
215 |
+
|
216 |
+
- Open an issue on GitHub
|
217 |
+
- Check the documentation
|
218 |
+
- Contact the maintainers
|
219 |
+
|
220 |
+
---
|
221 |
+
|
222 |
+
**Built with ❤️ for the code review research community**
|
app.py
CHANGED
@@ -1,275 +1,365 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
|
6 |
import gradio as gr
|
7 |
-
from
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
}
|
24 |
-
}]
|
25 |
-
|
26 |
-
# --------------- Data models ---------------
|
27 |
-
class Metrics(BaseModel):
|
28 |
-
readability: int
|
29 |
-
relevance: int
|
30 |
-
explanation_clarity: int = Field(alias="explanation_clarity")
|
31 |
-
problem_identification: int
|
32 |
-
actionability: int
|
33 |
-
completeness: int
|
34 |
-
specificity: int
|
35 |
-
contextual_adequacy: int
|
36 |
-
consistency: int
|
37 |
-
brevity: int
|
38 |
-
|
39 |
-
@field_validator("readability", "relevance", "explanation_clarity", "problem_identification", "actionability", "completeness", "specificity", "contextual_adequacy", "consistency", "brevity")
|
40 |
-
def metric_range(cls, v: int):
|
41 |
-
if not 0 <= v <= 10:
|
42 |
-
raise ValueError("Multi-metrics should be between 0 and 10")
|
43 |
-
return v
|
44 |
-
|
45 |
-
|
46 |
-
class LeaderboardEntry(BaseModel):
|
47 |
-
model_name: str
|
48 |
-
bleu: float
|
49 |
-
llm_pass_1: float
|
50 |
-
llm_pass_5: float
|
51 |
-
llm_pass_10: float
|
52 |
-
metrics: Metrics
|
53 |
-
|
54 |
-
@field_validator("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10")
|
55 |
-
def score_range(cls, v: float):
|
56 |
-
if not 0.0 <= v <= 1.0:
|
57 |
-
raise ValueError("Scores should be between 0 and 1")
|
58 |
-
return v
|
59 |
-
|
60 |
-
|
61 |
-
# --------------- Persistence helpers ---------------
|
62 |
-
|
63 |
-
def _load_leaderboard() -> List[Dict]:
|
64 |
-
"""Load leaderboard data with persistent storage support."""
|
65 |
-
if not LEADERBOARD_PATH.exists():
|
66 |
-
# Create default example data
|
67 |
-
_save_leaderboard(DEFAULT_DATA)
|
68 |
-
return DEFAULT_DATA
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
data = json.load(f)
|
73 |
-
return data.get("leaderboard", [])
|
74 |
-
except Exception as e:
|
75 |
-
print(f"Error loading leaderboard: {e}")
|
76 |
-
return []
|
77 |
-
|
78 |
-
|
79 |
-
def _save_leaderboard(data: List[Dict]):
|
80 |
-
"""Save leaderboard data to persistent storage."""
|
81 |
-
try:
|
82 |
-
to_store = {"leaderboard": data}
|
83 |
-
with LEADERBOARD_PATH.open("w", encoding="utf-8") as f:
|
84 |
-
json.dump(to_store, f, indent=2)
|
85 |
-
except Exception as e:
|
86 |
-
print(f"Error saving leaderboard: {e}")
|
87 |
-
|
88 |
-
|
89 |
-
# --------------- Table data functions ---------------
|
90 |
-
|
91 |
-
def _table_data(data: List[Dict] = None) -> List[List]:
|
92 |
-
"""Get main metrics table data."""
|
93 |
-
if data is None:
|
94 |
-
data = _load_leaderboard()
|
95 |
-
if not data:
|
96 |
-
return []
|
97 |
-
data.sort(key=lambda x: x["llm_pass_1"], reverse=True)
|
98 |
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
entry["bleu"],
|
104 |
-
entry["llm_pass_1"],
|
105 |
-
entry["llm_pass_5"],
|
106 |
-
entry["llm_pass_10"],
|
107 |
-
]
|
108 |
-
table_rows.append(row)
|
109 |
-
return table_rows
|
110 |
-
|
111 |
-
|
112 |
-
def _multimetric_table_data(data: List[Dict] = None) -> List[List]:
|
113 |
-
"""Get multi-metric table data."""
|
114 |
-
if data is None:
|
115 |
-
data = _load_leaderboard()
|
116 |
-
if not data:
|
117 |
-
return []
|
118 |
-
data.sort(key=lambda x: x["llm_pass_1"], reverse=True)
|
119 |
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
-
def
|
142 |
-
|
143 |
-
|
144 |
-
bleu: float,
|
145 |
-
llm_pass_1: float,
|
146 |
-
llm_pass_5: float,
|
147 |
-
llm_pass_10: float,
|
148 |
-
readability: int,
|
149 |
-
relevance: int,
|
150 |
-
explanation_clarity: int,
|
151 |
-
problem_identification: int,
|
152 |
-
actionability: int,
|
153 |
-
completeness: int,
|
154 |
-
specificity: int,
|
155 |
-
contextual_adequacy: int,
|
156 |
-
consistency: int,
|
157 |
-
brevity: int,
|
158 |
):
|
159 |
-
"""
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
"actionability": actionability,
|
173 |
-
"completeness": completeness,
|
174 |
-
"specificity": specificity,
|
175 |
-
"contextual_adequacy": contextual_adequacy,
|
176 |
-
"consistency": consistency,
|
177 |
-
"brevity": brevity,
|
178 |
-
},
|
179 |
)
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
#
|
194 |
-
|
195 |
-
gr.Markdown("""# 🏆 CodeReview Leaderboard\nSubmit your model results below. Leaderboard is sorted by **Pass@1**. """)
|
196 |
-
|
197 |
-
# Initialize table data
|
198 |
-
initial_leaderboard_data = _load_leaderboard()
|
199 |
-
initial_data = _table_data(initial_leaderboard_data)
|
200 |
-
initial_multimetric_data = _multimetric_table_data(initial_leaderboard_data)
|
201 |
|
202 |
# State to store leaderboard data
|
203 |
-
leaderboard_state = gr.State(value=
|
204 |
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
217 |
)
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
readability_inp,
|
255 |
-
relevance_inp,
|
256 |
-
explanation_inp,
|
257 |
-
problem_inp,
|
258 |
-
actionability_inp,
|
259 |
-
completeness_inp,
|
260 |
-
specificity_inp,
|
261 |
-
contextual_inp,
|
262 |
-
consistency_inp,
|
263 |
-
brevity_inp,
|
264 |
-
],
|
265 |
-
outputs=[leaderboard_state, leaderboard_df, multimetric_df, status_markdown],
|
266 |
-
api_name="submit_model",
|
267 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
268 |
|
269 |
-
#
|
270 |
-
|
271 |
if __name__ == "__main__":
|
272 |
-
demo.queue().launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
|
274 |
-
# For
|
275 |
app = demo
|
|
|
1 |
+
"""
|
2 |
+
CodeReview Leaderboard - Inspired by circle-guard-bench
|
3 |
+
A comprehensive leaderboard for code review generation models
|
4 |
+
"""
|
5 |
|
6 |
import gradio as gr
|
7 |
+
from typing import List, Dict, Any
|
8 |
+
from datetime import datetime, timezone
|
9 |
+
|
10 |
+
# Import our modules
|
11 |
+
from src.envs import (
|
12 |
+
PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES,
|
13 |
+
MAIN_HEADERS, QUALITY_HEADERS
|
14 |
+
)
|
15 |
+
from src.about import TITLE, INTRODUCTION_TEXT
|
16 |
+
from src.display.css_html_js import DARK_THEME_CSS, CUSTOM_JS, HEADER_HTML, FOOTER_HTML
|
17 |
+
from src.display.utils import (
|
18 |
+
get_main_leaderboard_data, get_quality_metrics_data,
|
19 |
+
get_submission_history_data, get_statistics_summary
|
20 |
+
)
|
21 |
+
from src.leaderboard.processor import LeaderboardProcessor
|
22 |
+
from src.submission.submit import SubmissionHandler
|
23 |
+
|
24 |
+
# Initialize processors
|
25 |
+
processor = LeaderboardProcessor()
|
26 |
+
submission_handler = SubmissionHandler()
|
27 |
+
|
28 |
+
# Global state
|
29 |
+
current_filters = {
|
30 |
+
"programming_language": "All",
|
31 |
+
"comment_language": "All",
|
32 |
+
"taxonomy_category": "All"
|
33 |
+
}
|
34 |
+
|
35 |
+
def update_leaderboard_tables(
|
36 |
+
programming_language: str = "All",
|
37 |
+
comment_language: str = "All",
|
38 |
+
taxonomy_category: str = "All"
|
39 |
+
):
|
40 |
+
"""Update leaderboard tables with filters"""
|
41 |
+
global current_filters
|
42 |
+
current_filters = {
|
43 |
+
"programming_language": programming_language,
|
44 |
+
"comment_language": comment_language,
|
45 |
+
"taxonomy_category": taxonomy_category
|
46 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
+
# Load current data
|
49 |
+
data = processor.load_leaderboard_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
+
# Get filtered tables
|
52 |
+
main_table = get_main_leaderboard_data(
|
53 |
+
data, programming_language, comment_language, taxonomy_category
|
54 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
+
quality_table = get_quality_metrics_data(
|
57 |
+
data, programming_language, comment_language, taxonomy_category
|
58 |
+
)
|
59 |
+
|
60 |
+
# Get statistics
|
61 |
+
stats = get_statistics_summary(data)
|
62 |
+
|
63 |
+
# Format statistics display
|
64 |
+
stats_text = f"""
|
65 |
+
## 📊 Current Statistics
|
66 |
+
- **Total Models**: {stats['total_models']}
|
67 |
+
- **Total Submissions**: {stats['total_submissions']}
|
68 |
+
- **Average Pass@1**: {stats['avg_pass_1']:.3f}
|
69 |
+
- **Best Model**: {stats['best_model']}
|
70 |
+
- **Languages Covered**: {stats['languages_covered']}
|
71 |
+
- **Categories Covered**: {stats['categories_covered']}
|
72 |
+
"""
|
73 |
+
|
74 |
+
return main_table, quality_table, stats_text
|
75 |
+
|
76 |
+
def refresh_data():
|
77 |
+
"""Refresh all data from storage"""
|
78 |
+
return update_leaderboard_tables(
|
79 |
+
current_filters["programming_language"],
|
80 |
+
current_filters["comment_language"],
|
81 |
+
current_filters["taxonomy_category"]
|
82 |
+
)
|
83 |
|
84 |
+
def handle_submission(
|
85 |
+
request: gr.Request,
|
86 |
+
*args
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
):
|
88 |
+
"""Handle model submission"""
|
89 |
+
# Get current data
|
90 |
+
current_data = processor.load_leaderboard_data()
|
91 |
+
|
92 |
+
# Call submission handler
|
93 |
+
result = submission_handler.submit_model(request, current_data, *args)
|
94 |
+
|
95 |
+
# If submission was successful, refresh tables
|
96 |
+
if result[0] != current_data: # Data was updated
|
97 |
+
main_table, quality_table, stats_text = update_leaderboard_tables(
|
98 |
+
current_filters["programming_language"],
|
99 |
+
current_filters["comment_language"],
|
100 |
+
current_filters["taxonomy_category"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
)
|
102 |
+
return result[0], main_table, quality_table, result[3], stats_text
|
103 |
+
else:
|
104 |
+
return result[0], result[1], result[2], result[3], None
|
105 |
+
|
106 |
+
# Create the Gradio interface
|
107 |
+
with gr.Blocks(
|
108 |
+
theme=gr.themes.Base(),
|
109 |
+
css=DARK_THEME_CSS,
|
110 |
+
js=CUSTOM_JS,
|
111 |
+
title=TITLE,
|
112 |
+
head="<meta name='viewport' content='width=device-width, initial-scale=1'>"
|
113 |
+
) as demo:
|
114 |
+
|
115 |
+
# Header
|
116 |
+
gr.HTML(HEADER_HTML)
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
|
118 |
# State to store leaderboard data
|
119 |
+
leaderboard_state = gr.State(value=processor.load_leaderboard_data())
|
120 |
|
121 |
+
# Main content tabs
|
122 |
+
with gr.Tabs():
|
123 |
+
|
124 |
+
# Leaderboard Tab
|
125 |
+
with gr.Tab("🏆 Leaderboard"):
|
126 |
+
|
127 |
+
# Filters
|
128 |
+
with gr.Row():
|
129 |
+
prog_lang_filter = gr.Dropdown(
|
130 |
+
choices=PROGRAMMING_LANGUAGES,
|
131 |
+
value="All",
|
132 |
+
label="🔍 Programming Language",
|
133 |
+
info="Filter by programming language"
|
134 |
+
)
|
135 |
+
comment_lang_filter = gr.Dropdown(
|
136 |
+
choices=COMMENT_LANGUAGES,
|
137 |
+
value="All",
|
138 |
+
label="🌍 Comment Language",
|
139 |
+
info="Filter by comment language"
|
140 |
+
)
|
141 |
+
taxonomy_filter = gr.Dropdown(
|
142 |
+
choices=TAXONOMY_CATEGORIES,
|
143 |
+
value="All",
|
144 |
+
label="🏷️ Taxonomy Category",
|
145 |
+
info="Filter by review category"
|
146 |
+
)
|
147 |
+
refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
|
148 |
+
|
149 |
+
# Statistics
|
150 |
+
stats_display = gr.Markdown("")
|
151 |
+
|
152 |
+
# Main leaderboard table
|
153 |
+
with gr.Row():
|
154 |
+
main_leaderboard = gr.Dataframe(
|
155 |
+
headers=MAIN_HEADERS,
|
156 |
+
label="🏅 Main Leaderboard",
|
157 |
+
interactive=False,
|
158 |
+
wrap=True,
|
159 |
+
max_height=600
|
160 |
+
)
|
161 |
+
|
162 |
+
# Quality metrics table
|
163 |
+
with gr.Row():
|
164 |
+
quality_metrics = gr.Dataframe(
|
165 |
+
headers=QUALITY_HEADERS,
|
166 |
+
label="📊 Quality Metrics",
|
167 |
+
interactive=False,
|
168 |
+
wrap=True,
|
169 |
+
max_height=600
|
170 |
+
)
|
171 |
+
|
172 |
+
# Submission Tab
|
173 |
+
with gr.Tab("📝 Submit Model"):
|
174 |
+
|
175 |
+
# Create submission form
|
176 |
+
form_components = submission_handler.get_submission_form_components()
|
177 |
+
|
178 |
+
# Connect submission handler
|
179 |
+
form_components["submit_btn"].click(
|
180 |
+
fn=handle_submission,
|
181 |
+
inputs=[
|
182 |
+
leaderboard_state,
|
183 |
+
form_components["model_name"],
|
184 |
+
form_components["programming_language"],
|
185 |
+
form_components["comment_language"],
|
186 |
+
form_components["taxonomy_category"],
|
187 |
+
form_components["bleu"],
|
188 |
+
form_components["pass1"],
|
189 |
+
form_components["pass5"],
|
190 |
+
form_components["pass10"],
|
191 |
+
form_components["readability"],
|
192 |
+
form_components["relevance"],
|
193 |
+
form_components["explanation_clarity"],
|
194 |
+
form_components["problem_identification"],
|
195 |
+
form_components["actionability"],
|
196 |
+
form_components["completeness"],
|
197 |
+
form_components["specificity"],
|
198 |
+
form_components["contextual_adequacy"],
|
199 |
+
form_components["consistency"],
|
200 |
+
form_components["brevity"],
|
201 |
+
],
|
202 |
+
outputs=[
|
203 |
+
leaderboard_state,
|
204 |
+
main_leaderboard,
|
205 |
+
quality_metrics,
|
206 |
+
form_components["status_msg"],
|
207 |
+
stats_display
|
208 |
+
]
|
209 |
+
)
|
210 |
+
|
211 |
+
# Analytics Tab
|
212 |
+
with gr.Tab("📈 Analytics"):
|
213 |
+
|
214 |
+
with gr.Row():
|
215 |
+
analytics_prog_lang = gr.Dropdown(
|
216 |
+
choices=PROGRAMMING_LANGUAGES,
|
217 |
+
value="All",
|
218 |
+
label="Programming Language"
|
219 |
+
)
|
220 |
+
analytics_comment_lang = gr.Dropdown(
|
221 |
+
choices=COMMENT_LANGUAGES,
|
222 |
+
value="All",
|
223 |
+
label="Comment Language"
|
224 |
+
)
|
225 |
+
analytics_taxonomy = gr.Dropdown(
|
226 |
+
choices=TAXONOMY_CATEGORIES,
|
227 |
+
value="All",
|
228 |
+
label="Taxonomy Category"
|
229 |
+
)
|
230 |
+
|
231 |
+
# Submission history
|
232 |
+
submission_history = gr.Dataframe(
|
233 |
+
headers=["Model", "Programming Language", "Comment Language", "Taxonomy", "Pass@1", "Date", "IP"],
|
234 |
+
label="📋 Recent Submissions",
|
235 |
+
interactive=False,
|
236 |
+
max_height=400
|
237 |
+
)
|
238 |
+
|
239 |
+
# Language performance analysis
|
240 |
+
with gr.Row():
|
241 |
+
with gr.Column():
|
242 |
+
gr.Markdown("### 🗣️ Language Performance Analysis")
|
243 |
+
language_analysis = gr.Dataframe(
|
244 |
+
headers=["Language", "Avg Pass@1", "Model Count", "Best Model"],
|
245 |
+
label="Programming Language Performance",
|
246 |
+
interactive=False
|
247 |
+
)
|
248 |
+
|
249 |
+
with gr.Column():
|
250 |
+
gr.Markdown("### 🏷️ Category Performance Analysis")
|
251 |
+
category_analysis = gr.Dataframe(
|
252 |
+
headers=["Category", "Avg Pass@1", "Model Count", "Best Model"],
|
253 |
+
label="Taxonomy Category Performance",
|
254 |
+
interactive=False
|
255 |
+
)
|
256 |
+
|
257 |
+
# About Tab
|
258 |
+
with gr.Tab("ℹ️ About"):
|
259 |
+
gr.Markdown(INTRODUCTION_TEXT)
|
260 |
+
|
261 |
+
# Export functionality
|
262 |
+
with gr.Row():
|
263 |
+
export_format = gr.Dropdown(
|
264 |
+
choices=["JSON", "CSV"],
|
265 |
+
value="JSON",
|
266 |
+
label="Export Format"
|
267 |
+
)
|
268 |
+
export_btn = gr.Button("📥 Export Data")
|
269 |
+
|
270 |
+
export_output = gr.Textbox(
|
271 |
+
label="Export Output",
|
272 |
+
lines=10,
|
273 |
+
max_lines=20,
|
274 |
+
show_copy_button=True
|
275 |
+
)
|
276 |
+
|
277 |
+
# Footer
|
278 |
+
gr.HTML(FOOTER_HTML)
|
279 |
+
|
280 |
+
# Initialize with data
|
281 |
+
initial_main, initial_quality, initial_stats = update_leaderboard_tables()
|
282 |
+
|
283 |
+
# Update tables when filters change
|
284 |
+
filter_inputs = [prog_lang_filter, comment_lang_filter, taxonomy_filter]
|
285 |
+
filter_outputs = [main_leaderboard, quality_metrics, stats_display]
|
286 |
+
|
287 |
+
for filter_input in filter_inputs:
|
288 |
+
filter_input.change(
|
289 |
+
fn=update_leaderboard_tables,
|
290 |
+
inputs=filter_inputs,
|
291 |
+
outputs=filter_outputs
|
292 |
+
)
|
293 |
+
|
294 |
+
# Refresh button
|
295 |
+
refresh_btn.click(
|
296 |
+
fn=refresh_data,
|
297 |
+
outputs=filter_outputs
|
298 |
)
|
299 |
+
|
300 |
+
# Analytics updates
|
301 |
+
analytics_inputs = [analytics_prog_lang, analytics_comment_lang, analytics_taxonomy]
|
302 |
+
|
303 |
+
def update_analytics(prog_lang, comment_lang, taxonomy):
|
304 |
+
"""Update analytics tables"""
|
305 |
+
data = processor.load_leaderboard_data()
|
306 |
+
|
307 |
+
# Get submission history
|
308 |
+
history = get_submission_history_data(data, prog_lang, comment_lang, taxonomy)
|
309 |
+
|
310 |
+
# Get language performance
|
311 |
+
lang_perf = []
|
312 |
+
for lang in PROGRAMMING_LANGUAGES[1:]:
|
313 |
+
lang_data = [d for d in data if d.get("programming_language") == lang]
|
314 |
+
if lang_data:
|
315 |
+
avg_score = sum(d.get("llm_pass_1", 0) for d in lang_data) / len(lang_data)
|
316 |
+
best_model = max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
|
317 |
+
lang_perf.append([lang, f"{avg_score:.3f}", len(lang_data), best_model])
|
318 |
+
|
319 |
+
# Get category performance
|
320 |
+
cat_perf = []
|
321 |
+
for cat in TAXONOMY_CATEGORIES[1:]:
|
322 |
+
cat_data = [d for d in data if d.get("taxonomy_category") == cat]
|
323 |
+
if cat_data:
|
324 |
+
avg_score = sum(d.get("llm_pass_1", 0) for d in cat_data) / len(cat_data)
|
325 |
+
best_model = max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
|
326 |
+
cat_perf.append([cat, f"{avg_score:.3f}", len(cat_data), best_model])
|
327 |
+
|
328 |
+
return history, lang_perf, cat_perf
|
329 |
+
|
330 |
+
for analytics_input in analytics_inputs:
|
331 |
+
analytics_input.change(
|
332 |
+
fn=update_analytics,
|
333 |
+
inputs=analytics_inputs,
|
334 |
+
outputs=[submission_history, language_analysis, category_analysis]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
335 |
)
|
336 |
+
|
337 |
+
# Export functionality
|
338 |
+
def export_data(format_type):
|
339 |
+
"""Export leaderboard data"""
|
340 |
+
return processor.export_data(format_type.lower())
|
341 |
+
|
342 |
+
export_btn.click(
|
343 |
+
fn=export_data,
|
344 |
+
inputs=[export_format],
|
345 |
+
outputs=[export_output]
|
346 |
+
)
|
347 |
+
|
348 |
+
# Set initial values
|
349 |
+
demo.load(
|
350 |
+
fn=lambda: (initial_main, initial_quality, initial_stats),
|
351 |
+
outputs=[main_leaderboard, quality_metrics, stats_display]
|
352 |
+
)
|
353 |
|
354 |
+
# Launch configuration
|
|
|
355 |
if __name__ == "__main__":
|
356 |
+
demo.queue(max_size=20).launch(
|
357 |
+
server_name="0.0.0.0",
|
358 |
+
server_port=7860,
|
359 |
+
share=False,
|
360 |
+
show_error=True,
|
361 |
+
debug=True
|
362 |
+
)
|
363 |
|
364 |
+
# For deployment (HuggingFace Spaces, etc.)
|
365 |
app = demo
|
data/.gitkeep
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Keep this directory in git
|
data/leaderboard_data.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"leaderboard": [
|
3 |
+
{
|
4 |
+
"model_name": "example/model",
|
5 |
+
"programming_language": "Python",
|
6 |
+
"comment_language": "English",
|
7 |
+
"taxonomy_category": "Bug Detection",
|
8 |
+
"bleu": 0.5,
|
9 |
+
"llm_pass_1": 0.5,
|
10 |
+
"llm_pass_5": 0.5,
|
11 |
+
"llm_pass_10": 0.5,
|
12 |
+
"metrics": {
|
13 |
+
"readability": 5,
|
14 |
+
"relevance": 5,
|
15 |
+
"explanation_clarity": 5,
|
16 |
+
"problem_identification": 5,
|
17 |
+
"actionability": 5,
|
18 |
+
"completeness": 5,
|
19 |
+
"specificity": 5,
|
20 |
+
"contextual_adequacy": 5,
|
21 |
+
"consistency": 5,
|
22 |
+
"brevity": 5
|
23 |
+
},
|
24 |
+
"submission_ip": "127.0.0.1",
|
25 |
+
"submission_date": "2024-01-01T00:00:00Z"
|
26 |
+
}
|
27 |
+
],
|
28 |
+
"last_updated": "2025-07-03T13:10:47.434623+00:00",
|
29 |
+
"total_entries": 1
|
30 |
+
}
|
data/submissions.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"submissions": [],
|
3 |
+
"last_updated": "2025-07-03T13:10:47.435548+00:00",
|
4 |
+
"total_submissions": 0
|
5 |
+
}
|
requirements.txt
CHANGED
@@ -1,18 +1,19 @@
|
|
1 |
APScheduler
|
2 |
black
|
3 |
datasets
|
4 |
-
gradio
|
5 |
gradio[oauth]
|
6 |
gradio_leaderboard==0.0.13
|
7 |
gradio_client
|
8 |
huggingface-hub>=0.18.0
|
9 |
matplotlib
|
10 |
numpy
|
11 |
-
pandas
|
12 |
python-dateutil
|
13 |
tqdm
|
14 |
transformers
|
15 |
tokenizers>=0.15.0
|
16 |
sentencepiece
|
17 |
fastapi
|
18 |
-
uvicorn
|
|
|
|
1 |
APScheduler
|
2 |
black
|
3 |
datasets
|
4 |
+
gradio>=4.0.0
|
5 |
gradio[oauth]
|
6 |
gradio_leaderboard==0.0.13
|
7 |
gradio_client
|
8 |
huggingface-hub>=0.18.0
|
9 |
matplotlib
|
10 |
numpy
|
11 |
+
pandas>=1.3.0
|
12 |
python-dateutil
|
13 |
tqdm
|
14 |
transformers
|
15 |
tokenizers>=0.15.0
|
16 |
sentencepiece
|
17 |
fastapi
|
18 |
+
uvicorn
|
19 |
+
pydantic>=2.0.0
|
src/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# CodeReview Leaderboard - Source Module
|
src/about.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
About page content for CodeReview Leaderboard
|
3 |
+
"""
|
4 |
+
|
5 |
+
TITLE = "🏆 CodeReview Leaderboard"
|
6 |
+
|
7 |
+
INTRODUCTION_TEXT = """
|
8 |
+
# CodeReview Leaderboard
|
9 |
+
|
10 |
+
A comprehensive benchmark for evaluating code review generation models across multiple programming languages and comment types.
|
11 |
+
|
12 |
+
## Overview
|
13 |
+
|
14 |
+
This leaderboard tracks the performance of various models on code review tasks, providing insights into:
|
15 |
+
- **Programming Language Performance**: How well models perform across different programming languages
|
16 |
+
- **Comment Language Support**: Effectiveness in generating reviews in different natural languages
|
17 |
+
- **Taxonomy Categories**: Performance across different types of code review feedback
|
18 |
+
|
19 |
+
## Metrics
|
20 |
+
|
21 |
+
- **BLEU**: Measures similarity between generated and reference reviews
|
22 |
+
- **Pass@1/5/10**: Percentage of reviews that pass quality checks in 1, 5, or 10 attempts
|
23 |
+
- **Multi-dimensional Quality Scores**: Detailed evaluation across 10 quality dimensions
|
24 |
+
|
25 |
+
## Features
|
26 |
+
|
27 |
+
✨ **Filter by Programming Language**: View results for specific programming languages (Python, JavaScript, Java, etc.)
|
28 |
+
✨ **Comment Language Support**: Filter by the natural language of code comments
|
29 |
+
✨ **Taxonomy Categories**: Browse results by review type (bug detection, style, performance, etc.)
|
30 |
+
✨ **IP-based Submissions**: Secure submission system with IP tracking
|
31 |
+
✨ **Dark Theme**: Modern, eye-friendly interface
|
32 |
+
"""
|
33 |
+
|
34 |
+
SUBMISSION_GUIDELINES = """
|
35 |
+
## Submission Guidelines
|
36 |
+
|
37 |
+
1. **Model Requirements**: Submit results for at least 100 test cases
|
38 |
+
2. **Format**: Provide scores in the specified format ranges
|
39 |
+
3. **Reproducibility**: Include model details and evaluation setup
|
40 |
+
4. **Quality Metrics**: Rate your model across all 10 quality dimensions
|
41 |
+
5. **Metadata**: Specify programming language, comment language, and taxonomy focus
|
42 |
+
"""
|
43 |
+
|
44 |
+
CONTACT_INFO = """
|
45 |
+
## Contact & Support
|
46 |
+
|
47 |
+
For questions, issues, or contributions, please reach out through our repository or contact the maintainers.
|
48 |
+
"""
|
src/display/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Display utilities module
|
src/display/css_html_js.py
ADDED
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Custom CSS, HTML, and JavaScript for the CodeReview Leaderboard
|
3 |
+
"""
|
4 |
+
|
5 |
+
# Dark theme CSS
|
6 |
+
DARK_THEME_CSS = """
|
7 |
+
/* Dark Theme Styling */
|
8 |
+
:root {
|
9 |
+
--bg-primary: #0d1117;
|
10 |
+
--bg-secondary: #161b22;
|
11 |
+
--bg-tertiary: #21262d;
|
12 |
+
--text-primary: #e6edf3;
|
13 |
+
--text-secondary: #7d8590;
|
14 |
+
--border-color: #30363d;
|
15 |
+
--accent-color: #238636;
|
16 |
+
--accent-hover: #2ea043;
|
17 |
+
--danger-color: #da3633;
|
18 |
+
--warning-color: #d29922;
|
19 |
+
--info-color: #1f6feb;
|
20 |
+
}
|
21 |
+
|
22 |
+
/* Global dark theme */
|
23 |
+
.gradio-container {
|
24 |
+
background: var(--bg-primary) !important;
|
25 |
+
color: var(--text-primary) !important;
|
26 |
+
}
|
27 |
+
|
28 |
+
/* Headers and text */
|
29 |
+
.gradio-container h1, .gradio-container h2, .gradio-container h3 {
|
30 |
+
color: var(--text-primary) !important;
|
31 |
+
}
|
32 |
+
|
33 |
+
.gradio-container p, .gradio-container span {
|
34 |
+
color: var(--text-secondary) !important;
|
35 |
+
}
|
36 |
+
|
37 |
+
/* Tabs */
|
38 |
+
.gradio-container .tab-nav {
|
39 |
+
background: var(--bg-secondary) !important;
|
40 |
+
border-bottom: 1px solid var(--border-color) !important;
|
41 |
+
}
|
42 |
+
|
43 |
+
.gradio-container .tab-nav button {
|
44 |
+
background: transparent !important;
|
45 |
+
color: var(--text-secondary) !important;
|
46 |
+
border: none !important;
|
47 |
+
padding: 12px 24px !important;
|
48 |
+
transition: all 0.2s ease !important;
|
49 |
+
}
|
50 |
+
|
51 |
+
.gradio-container .tab-nav button:hover {
|
52 |
+
color: var(--text-primary) !important;
|
53 |
+
background: var(--bg-tertiary) !important;
|
54 |
+
}
|
55 |
+
|
56 |
+
.gradio-container .tab-nav button.selected {
|
57 |
+
color: var(--text-primary) !important;
|
58 |
+
background: var(--bg-tertiary) !important;
|
59 |
+
border-bottom: 2px solid var(--accent-color) !important;
|
60 |
+
}
|
61 |
+
|
62 |
+
/* Tables */
|
63 |
+
.gradio-container .dataframe {
|
64 |
+
background: var(--bg-secondary) !important;
|
65 |
+
border: 1px solid var(--border-color) !important;
|
66 |
+
border-radius: 8px !important;
|
67 |
+
overflow: hidden !important;
|
68 |
+
}
|
69 |
+
|
70 |
+
.gradio-container .dataframe table {
|
71 |
+
background: var(--bg-secondary) !important;
|
72 |
+
}
|
73 |
+
|
74 |
+
.gradio-container .dataframe th {
|
75 |
+
background: var(--bg-tertiary) !important;
|
76 |
+
color: var(--text-primary) !important;
|
77 |
+
border-bottom: 2px solid var(--border-color) !important;
|
78 |
+
padding: 12px !important;
|
79 |
+
font-weight: 600 !important;
|
80 |
+
}
|
81 |
+
|
82 |
+
.gradio-container .dataframe td {
|
83 |
+
background: var(--bg-secondary) !important;
|
84 |
+
color: var(--text-primary) !important;
|
85 |
+
border-bottom: 1px solid var(--border-color) !important;
|
86 |
+
padding: 10px 12px !important;
|
87 |
+
}
|
88 |
+
|
89 |
+
.gradio-container .dataframe tr:hover td {
|
90 |
+
background: var(--bg-tertiary) !important;
|
91 |
+
}
|
92 |
+
|
93 |
+
/* Form inputs */
|
94 |
+
.gradio-container input, .gradio-container select, .gradio-container textarea {
|
95 |
+
background: var(--bg-tertiary) !important;
|
96 |
+
color: var(--text-primary) !important;
|
97 |
+
border: 1px solid var(--border-color) !important;
|
98 |
+
border-radius: 6px !important;
|
99 |
+
padding: 8px 12px !important;
|
100 |
+
}
|
101 |
+
|
102 |
+
.gradio-container input:focus, .gradio-container select:focus, .gradio-container textarea:focus {
|
103 |
+
border-color: var(--accent-color) !important;
|
104 |
+
box-shadow: 0 0 0 2px rgba(35, 134, 54, 0.2) !important;
|
105 |
+
}
|
106 |
+
|
107 |
+
/* Buttons */
|
108 |
+
.gradio-container button {
|
109 |
+
background: var(--accent-color) !important;
|
110 |
+
color: white !important;
|
111 |
+
border: none !important;
|
112 |
+
border-radius: 6px !important;
|
113 |
+
padding: 8px 16px !important;
|
114 |
+
font-weight: 500 !important;
|
115 |
+
transition: all 0.2s ease !important;
|
116 |
+
}
|
117 |
+
|
118 |
+
.gradio-container button:hover {
|
119 |
+
background: var(--accent-hover) !important;
|
120 |
+
transform: translateY(-1px) !important;
|
121 |
+
}
|
122 |
+
|
123 |
+
.gradio-container button:active {
|
124 |
+
transform: translateY(0) !important;
|
125 |
+
}
|
126 |
+
|
127 |
+
/* Dropdowns */
|
128 |
+
.gradio-container .dropdown {
|
129 |
+
background: var(--bg-tertiary) !important;
|
130 |
+
border: 1px solid var(--border-color) !important;
|
131 |
+
border-radius: 6px !important;
|
132 |
+
}
|
133 |
+
|
134 |
+
.gradio-container .dropdown-menu {
|
135 |
+
background: var(--bg-secondary) !important;
|
136 |
+
border: 1px solid var(--border-color) !important;
|
137 |
+
border-radius: 6px !important;
|
138 |
+
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
|
139 |
+
}
|
140 |
+
|
141 |
+
.gradio-container .dropdown-menu .dropdown-item {
|
142 |
+
color: var(--text-primary) !important;
|
143 |
+
padding: 8px 12px !important;
|
144 |
+
}
|
145 |
+
|
146 |
+
.gradio-container .dropdown-menu .dropdown-item:hover {
|
147 |
+
background: var(--bg-tertiary) !important;
|
148 |
+
}
|
149 |
+
|
150 |
+
/* Sliders */
|
151 |
+
.gradio-container .slider {
|
152 |
+
background: var(--bg-tertiary) !important;
|
153 |
+
}
|
154 |
+
|
155 |
+
.gradio-container .slider input[type="range"] {
|
156 |
+
background: var(--bg-tertiary) !important;
|
157 |
+
}
|
158 |
+
|
159 |
+
.gradio-container .slider input[type="range"]::-webkit-slider-thumb {
|
160 |
+
background: var(--accent-color) !important;
|
161 |
+
border: 2px solid var(--bg-secondary) !important;
|
162 |
+
border-radius: 50% !important;
|
163 |
+
width: 18px !important;
|
164 |
+
height: 18px !important;
|
165 |
+
}
|
166 |
+
|
167 |
+
.gradio-container .slider input[type="range"]::-webkit-slider-track {
|
168 |
+
background: var(--border-color) !important;
|
169 |
+
border-radius: 4px !important;
|
170 |
+
height: 6px !important;
|
171 |
+
}
|
172 |
+
|
173 |
+
/* Accordions */
|
174 |
+
.gradio-container .accordion {
|
175 |
+
background: var(--bg-secondary) !important;
|
176 |
+
border: 1px solid var(--border-color) !important;
|
177 |
+
border-radius: 8px !important;
|
178 |
+
margin: 16px 0 !important;
|
179 |
+
}
|
180 |
+
|
181 |
+
.gradio-container .accordion-header {
|
182 |
+
background: var(--bg-tertiary) !important;
|
183 |
+
color: var(--text-primary) !important;
|
184 |
+
padding: 16px !important;
|
185 |
+
border-bottom: 1px solid var(--border-color) !important;
|
186 |
+
cursor: pointer !important;
|
187 |
+
font-weight: 500 !important;
|
188 |
+
}
|
189 |
+
|
190 |
+
.gradio-container .accordion-header:hover {
|
191 |
+
background: var(--bg-primary) !important;
|
192 |
+
}
|
193 |
+
|
194 |
+
/* Status messages */
|
195 |
+
.gradio-container .success {
|
196 |
+
background: rgba(35, 134, 54, 0.1) !important;
|
197 |
+
color: var(--accent-color) !important;
|
198 |
+
border: 1px solid var(--accent-color) !important;
|
199 |
+
border-radius: 6px !important;
|
200 |
+
padding: 12px 16px !important;
|
201 |
+
margin: 8px 0 !important;
|
202 |
+
}
|
203 |
+
|
204 |
+
.gradio-container .error {
|
205 |
+
background: rgba(218, 54, 51, 0.1) !important;
|
206 |
+
color: var(--danger-color) !important;
|
207 |
+
border: 1px solid var(--danger-color) !important;
|
208 |
+
border-radius: 6px !important;
|
209 |
+
padding: 12px 16px !important;
|
210 |
+
margin: 8px 0 !important;
|
211 |
+
}
|
212 |
+
|
213 |
+
/* Responsive design */
|
214 |
+
@media (max-width: 768px) {
|
215 |
+
.gradio-container {
|
216 |
+
padding: 16px !important;
|
217 |
+
}
|
218 |
+
|
219 |
+
.gradio-container .tab-nav button {
|
220 |
+
padding: 8px 16px !important;
|
221 |
+
font-size: 14px !important;
|
222 |
+
}
|
223 |
+
|
224 |
+
.gradio-container .dataframe {
|
225 |
+
font-size: 14px !important;
|
226 |
+
}
|
227 |
+
}
|
228 |
+
"""
|
229 |
+
|
230 |
+
# Custom JavaScript for enhanced functionality
|
231 |
+
CUSTOM_JS = """
|
232 |
+
// Enhanced table sorting and filtering
|
233 |
+
function enhanceTable() {
|
234 |
+
const tables = document.querySelectorAll('.dataframe table');
|
235 |
+
tables.forEach(table => {
|
236 |
+
// Add sorting functionality
|
237 |
+
const headers = table.querySelectorAll('th');
|
238 |
+
headers.forEach((header, index) => {
|
239 |
+
header.style.cursor = 'pointer';
|
240 |
+
header.addEventListener('click', () => sortTable(table, index));
|
241 |
+
});
|
242 |
+
});
|
243 |
+
}
|
244 |
+
|
245 |
+
function sortTable(table, columnIndex) {
|
246 |
+
const tbody = table.querySelector('tbody');
|
247 |
+
const rows = Array.from(tbody.querySelectorAll('tr'));
|
248 |
+
|
249 |
+
rows.sort((a, b) => {
|
250 |
+
const aText = a.cells[columnIndex].textContent.trim();
|
251 |
+
const bText = b.cells[columnIndex].textContent.trim();
|
252 |
+
|
253 |
+
// Try to parse as numbers first
|
254 |
+
const aNum = parseFloat(aText);
|
255 |
+
const bNum = parseFloat(bText);
|
256 |
+
|
257 |
+
if (!isNaN(aNum) && !isNaN(bNum)) {
|
258 |
+
return bNum - aNum; // Descending for numbers
|
259 |
+
}
|
260 |
+
|
261 |
+
return aText.localeCompare(bText); // Ascending for text
|
262 |
+
});
|
263 |
+
|
264 |
+
rows.forEach(row => tbody.appendChild(row));
|
265 |
+
}
|
266 |
+
|
267 |
+
// Auto-refresh functionality
|
268 |
+
function autoRefresh() {
|
269 |
+
setInterval(() => {
|
270 |
+
const refreshBtn = document.querySelector('button[aria-label="Refresh"]');
|
271 |
+
if (refreshBtn) {
|
272 |
+
refreshBtn.click();
|
273 |
+
}
|
274 |
+
}, 30000); // Refresh every 30 seconds
|
275 |
+
}
|
276 |
+
|
277 |
+
// Initialize enhancements
|
278 |
+
document.addEventListener('DOMContentLoaded', function() {
|
279 |
+
enhanceTable();
|
280 |
+
autoRefresh();
|
281 |
+
});
|
282 |
+
"""
|
283 |
+
|
284 |
+
# HTML components
|
285 |
+
HEADER_HTML = """
|
286 |
+
<div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-bottom: 20px;">
|
287 |
+
<h1 style="color: var(--text-primary); margin: 0; font-size: 2.5em; font-weight: 700;">
|
288 |
+
🏆 CodeReview Leaderboard
|
289 |
+
</h1>
|
290 |
+
<p style="color: var(--text-secondary); margin: 10px 0 0 0; font-size: 1.2em;">
|
291 |
+
Benchmarking code review generation models across languages and categories
|
292 |
+
</p>
|
293 |
+
</div>
|
294 |
+
"""
|
295 |
+
|
296 |
+
FOOTER_HTML = """
|
297 |
+
<div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-top: 20px;">
|
298 |
+
<p style="color: var(--text-secondary); margin: 0; font-size: 0.9em;">
|
299 |
+
Built with ❤️ for the code review community |
|
300 |
+
<a href="https://github.com/your-repo" style="color: var(--accent-color); text-decoration: none;">
|
301 |
+
GitHub
|
302 |
+
</a>
|
303 |
+
</p>
|
304 |
+
</div>
|
305 |
+
"""
|
src/display/formatting.py
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Formatting utilities for display components
|
3 |
+
"""
|
4 |
+
|
5 |
+
import re
|
6 |
+
from typing import List, Dict, Any, Optional
|
7 |
+
from datetime import datetime, timezone
|
8 |
+
|
9 |
+
def format_score(score: float, precision: int = 3) -> str:
    """Render *score* as a fixed-precision decimal string.

    Non-numeric inputs are passed through via ``str()`` unchanged.
    """
    if not isinstance(score, (int, float)):
        return str(score)
    return f"{score:.{precision}f}"
|
14 |
+
|
15 |
+
def format_percentage(score: float, precision: int = 1) -> str:
    """Render *score* (a 0-1 fraction) as a percentage string like ``"87.0%"``.

    Non-numeric inputs are passed through via ``str()`` unchanged.
    """
    if not isinstance(score, (int, float)):
        return str(score)
    return f"{score * 100:.{precision}f}%"
|
20 |
+
|
21 |
+
def format_model_name(name: str) -> str:
    """Return an HTML snippet that de-emphasises the org prefix of an org/model id."""
    name = name.strip()
    if "/" not in name:
        return f"<strong>{name}</strong>"
    # Split only on the first slash so model ids containing '/' stay intact.
    org, model = name.split("/", 1)
    return f"<span style='color: var(--text-secondary); font-size: 0.9em;'>{org}/</span><strong>{model}</strong>"
|
29 |
+
|
30 |
+
def format_timestamp(timestamp: str) -> str:
    """Format an ISO-8601 timestamp (optionally 'Z'-suffixed) for display.

    Args:
        timestamp: ISO-8601 string such as ``"2024-01-01T00:00:00Z"``.

    Returns:
        ``"YYYY-MM-DD HH:MM UTC"`` on success; the input unchanged when it
        cannot be parsed (bad format or non-string input).
    """
    try:
        # fromisoformat() does not accept the 'Z' suffix on older Pythons,
        # so normalize it to an explicit UTC offset first.
        dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
    except (ValueError, TypeError, AttributeError):
        # Narrowed from a bare `except:` — unparseable or non-string input
        # is shown as-is instead of silently swallowing every exception.
        return timestamp
    return dt.strftime("%Y-%m-%d %H:%M UTC")
|
37 |
+
|
38 |
+
def format_ip_address(ip: str) -> str:
    """Mask the last octet of a dotted-quad IPv4 address for privacy.

    Empty/None input yields "Unknown"; anything that is not a four-part
    dotted quad (e.g. IPv6) is fully masked.
    """
    if not ip:
        return "Unknown"

    octets = ip.split(".")
    if len(octets) != 4:
        # Not an IPv4 dotted quad — hide everything.
        return "xxx.xxx.xxx.xxx"
    return ".".join(octets[:3]) + ".xxx"
|
48 |
+
|
49 |
+
def format_metric_score(score: int, metric_name: str) -> str:
    """Wrap a 0-10 quality score in a colour-coded HTML span.

    Colour bands: green >= 8, yellow >= 6, orange >= 4, red below.
    ``metric_name`` is unused but retained for interface compatibility.
    """
    if not isinstance(score, (int, float)):
        return str(score)

    # Highest band first; the for/else falls through to red for scores < 4.
    for threshold, band_color in ((8, "#28a745"), (6, "#ffc107"), (4, "#fd7e14")):
        if score >= threshold:
            color = band_color
            break
    else:
        color = "#dc3545"  # Red

    return f"<span style='color: {color}; font-weight: 600;'>{score}</span>"
|
65 |
+
|
66 |
+
def format_language_badge(language: str) -> str:
    """Render a programming language as a coloured pill badge (HTML).

    Empty strings and the sentinel "All" are returned unchanged; unknown
    languages get a neutral grey badge.
    """
    if not language or language == "All":
        return language

    # Approximate brand colours per language; grey fallback below.
    brand_colors = {
        "Python": "#3776ab",
        "JavaScript": "#f7df1e",
        "Java": "#ed8b00",
        "C++": "#00599c",
        "C#": "#239120",
        "Go": "#00add8",
        "Rust": "#ce422b",
        "TypeScript": "#3178c6",
        "PHP": "#777bb4",
        "Ruby": "#cc342d",
        "Swift": "#fa7343",
        "Kotlin": "#7f52ff",
        "Scala": "#dc322f",
        "R": "#276dc3",
        "MATLAB": "#e16737",
    }

    badge_color = brand_colors.get(language, "#6c757d")
    return (
        f"<span style='background: {badge_color}; color: white; padding: 2px 8px; "
        f"border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{language}</span>"
    )
|
92 |
+
|
93 |
+
def format_taxonomy_badge(category: str) -> str:
    """Render a review-taxonomy category as a coloured pill badge (HTML).

    Empty strings and the sentinel "All" pass through unchanged; unknown
    categories get a neutral grey badge.
    """
    if not category or category == "All":
        return category

    # Fixed palette per known category; grey fallback below.
    palette = {
        "Bug Detection": "#dc3545",
        "Code Style": "#6f42c1",
        "Performance": "#fd7e14",
        "Security": "#e83e8c",
        "Maintainability": "#20c997",
        "Documentation": "#17a2b8",
        "Testing": "#28a745",
        "Architecture": "#6c757d",
        "Best Practices": "#007bff",
        "Refactoring": "#ffc107",
    }

    badge_color = palette.get(category, "#6c757d")
    return (
        f"<span style='background: {badge_color}; color: white; padding: 2px 8px; "
        f"border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{category}</span>"
    )
|
114 |
+
|
115 |
+
def format_comment_language_flag(language: str) -> str:
    """Prefix a natural-language name with a flag emoji (🌐 when unknown).

    Empty strings and the sentinel "All" are returned unchanged.
    """
    if not language or language == "All":
        return language

    flag_by_language = {
        "English": "🇺🇸",
        "Chinese": "🇨🇳",
        "Spanish": "🇪🇸",
        "French": "🇫🇷",
        "German": "🇩🇪",
        "Japanese": "🇯🇵",
        "Korean": "🇰🇷",
        "Russian": "🇷🇺",
        "Portuguese": "🇵🇹",
        "Italian": "🇮🇹",
        "Dutch": "🇳🇱",
    }

    emoji = flag_by_language.get(language, "🌐")
    return f"{emoji} {language}"
|
137 |
+
|
138 |
+
def sanitize_html(text: str) -> str:
    """Sanitize HTML content to prevent XSS.

    NOTE(review): this is blacklist-based stripping — it removes
    <script>/<iframe> elements and *quoted* on*= event handlers only.
    Unquoted handlers, javascript: URLs, <object>/<embed>, etc. are not
    covered; confirm whether a real sanitizer is needed wherever this
    receives untrusted input.
    """
    if not isinstance(text, str):
        return str(text)

    # Remove potentially dangerous HTML tags (case-insensitive; DOTALL lets
    # the match span newlines inside the element body).
    text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'<iframe[^>]*>.*?</iframe>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Strip double- and single-quoted inline event handlers (onclick="...", onload='...').
    text = re.sub(r'on\w+="[^"]*"', '', text, flags=re.IGNORECASE)
    text = re.sub(r'on\w+=\'[^\']*\'', '', text, flags=re.IGNORECASE)

    return text
|
150 |
+
|
151 |
+
def truncate_text(text: str, max_length: int = 50) -> str:
    """Truncate *text* to at most *max_length* characters, ellipsis included.

    Args:
        text: Value to truncate; non-strings are stringified first.
        max_length: Maximum length of the returned string.

    Returns:
        The original text if it fits, otherwise a prefix ending in "...".
        For limits smaller than 4 a plain prefix is returned, since the
        ellipsis alone would not fit.
    """
    if not isinstance(text, str):
        text = str(text)

    if len(text) <= max_length:
        return text

    if max_length < 4:
        # Bug fix: the old `text[:max_length - 3]` used a negative index for
        # max_length < 3, slicing from the wrong end and returning a string
        # LONGER than the limit (e.g. "hello", 2 -> "he..." was "hell...").
        return text[:max_length]

    return text[:max_length - 3] + "..."
|
160 |
+
|
161 |
+
def format_table_cell(value: Any, column_name: str) -> str:
    """Format a raw cell value according to its column's display rules.

    Dispatches on the (case-insensitive) column name: percentage columns,
    model names, language/taxonomy badges, 0-10 metric scores; everything
    else is sanitized plain text. ``None`` renders as "N/A".
    """
    if value is None:
        return "N/A"

    column = column_name.lower()
    percent_columns = {"bleu", "pass@1", "pass@5", "pass@10"}
    metric_columns = {
        "readability", "relevance", "explanation clarity",
        "problem identification", "actionability", "completeness",
        "specificity", "contextual adequacy", "consistency", "brevity",
    }

    if column in percent_columns:
        return format_percentage(value)
    if column == "model":
        return format_model_name(str(value))
    if column == "programming language":
        return format_language_badge(str(value))
    if column == "comment language":
        return format_comment_language_flag(str(value))
    if column == "taxonomy":
        return format_taxonomy_badge(str(value))
    if column in metric_columns:
        return format_metric_score(value, column)
    return sanitize_html(str(value))
|
src/display/utils.py
ADDED
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Display utilities for the CodeReview Leaderboard
|
3 |
+
"""
|
4 |
+
|
5 |
+
from typing import List, Dict, Any, Optional, Tuple
|
6 |
+
import json
|
7 |
+
from datetime import datetime, timezone
|
8 |
+
from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
|
9 |
+
from src.display.formatting import format_table_cell, format_timestamp
|
10 |
+
|
11 |
+
def filter_leaderboard_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1",
    sort_order: str = "desc"
) -> List[Dict]:
    """Filter and sort leaderboard data based on criteria.

    Args:
        data: Raw leaderboard entries (dicts as stored on disk).
        programming_language / comment_language / taxonomy_category:
            Case-insensitive exact-match filters; "All" disables a filter.
        sort_by: A top-level score key, a key inside entry["metrics"], or
            any other entry key (compared as strings, e.g. submission_date).
        sort_order: "desc" (default) or anything else for ascending.

    Returns:
        A new filtered, sorted list; the caller's list is never mutated.
    """

    if not data:
        return []

    # Apply filters
    filtered_data = data.copy()  # copy so the caller's list order is untouched

    if programming_language != "All":
        filtered_data = [
            entry for entry in filtered_data
            if entry.get("programming_language", "").lower() == programming_language.lower()
        ]

    if comment_language != "All":
        filtered_data = [
            entry for entry in filtered_data
            if entry.get("comment_language", "").lower() == comment_language.lower()
        ]

    if taxonomy_category != "All":
        filtered_data = [
            entry for entry in filtered_data
            if entry.get("taxonomy_category", "").lower() == taxonomy_category.lower()
        ]

    # Sort data
    reverse = sort_order.lower() == "desc"

    try:
        if sort_by in ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]:
            # Top-level numeric scores; entries missing the key sort as 0.
            filtered_data.sort(key=lambda x: x.get(sort_by, 0), reverse=reverse)
        elif sort_by in QUALITY_METRICS:
            # Per-dimension scores live in the nested "metrics" dict.
            filtered_data.sort(key=lambda x: x.get("metrics", {}).get(sort_by, 0), reverse=reverse)
        else:
            # Fallback: lexicographic comparison of the stringified value.
            filtered_data.sort(key=lambda x: str(x.get(sort_by, "")), reverse=reverse)
    except Exception as e:
        print(f"Error sorting data: {e}")
        # Default sort by pass@1
        filtered_data.sort(key=lambda x: x.get("llm_pass_1", 0), reverse=True)

    return filtered_data
|
61 |
+
|
62 |
+
def get_main_leaderboard_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1"
) -> List[List[str]]:
    """Build the formatted row data for the main leaderboard table."""

    entries = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, sort_by
    )

    # (entry key, formatting column name, default) in display order.
    column_spec = [
        ("model_name", "model", ""),
        ("programming_language", "programming language", ""),
        ("comment_language", "comment language", ""),
        ("taxonomy_category", "taxonomy", ""),
        ("bleu", "bleu", 0),
        ("llm_pass_1", "pass@1", 0),
        ("llm_pass_5", "pass@5", 0),
        ("llm_pass_10", "pass@10", 0),
    ]

    return [
        [format_table_cell(entry.get(key, default), col) for key, col, default in column_spec]
        for entry in entries
    ]
|
90 |
+
|
91 |
+
def get_quality_metrics_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    sort_by: str = "llm_pass_1"
) -> List[List[str]]:
    """Build the formatted row data for the per-dimension quality table."""

    entries = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, sort_by
    )

    rows = []
    for entry in entries:
        scores = entry.get("metrics", {})
        # First column is the model; one column per quality dimension follows.
        row = [format_table_cell(entry.get("model_name", ""), "model")]
        row.extend(
            format_table_cell(scores.get(metric, 0), metric.replace("_", " "))
            for metric in QUALITY_METRICS
        )
        rows.append(row)

    return rows
|
116 |
+
|
117 |
+
def get_submission_history_data(
    data: List[Dict],
    programming_language: str = "All",
    comment_language: str = "All",
    taxonomy_category: str = "All",
    limit: int = 50
) -> List[List[str]]:
    """Build formatted rows for the recent-submissions table (newest first)."""

    recent = filter_leaderboard_data(
        data, programming_language, comment_language, taxonomy_category, "submission_date", "desc"
    )[:limit]

    rows = []
    for entry in recent:
        raw_ip = entry.get("submission_ip")
        # Keep only the first octet: enough to spot duplicate submitters
        # without exposing the full address.
        masked_ip = raw_ip.split(".")[0] + ".xxx.xxx.xxx" if raw_ip else "Unknown"
        rows.append([
            format_table_cell(entry.get("model_name", ""), "model"),
            format_table_cell(entry.get("programming_language", ""), "programming language"),
            format_table_cell(entry.get("comment_language", ""), "comment language"),
            format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
            format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
            format_timestamp(entry.get("submission_date", "")),
            masked_ip,
        ])

    return rows
|
147 |
+
|
148 |
+
def get_statistics_summary(data: List[Dict]) -> Dict[str, Any]:
    """Compute headline statistics (counts, average pass@1, best model).

    Returns an all-zero summary for empty input; otherwise entries with a
    missing pass@1 score are excluded from the average only.
    """

    if not data:
        return {
            "total_models": 0,
            "total_submissions": 0,
            "avg_pass_1": 0,
            "best_model": "None",
            "languages_covered": 0,
            "categories_covered": 0,
        }

    unique_models = {entry.get("model_name", "") for entry in data}
    scores = [e.get("llm_pass_1", 0) for e in data if e.get("llm_pass_1") is not None]
    top_entry = max(data, key=lambda e: e.get("llm_pass_1", 0))
    languages = {e.get("programming_language", "") for e in data if e.get("programming_language")}
    categories = {e.get("taxonomy_category", "") for e in data if e.get("taxonomy_category")}

    return {
        "total_models": len(unique_models),
        "total_submissions": len(data),
        "avg_pass_1": sum(scores) / len(scores) if scores else 0,
        "best_model": top_entry.get("model_name", "None"),
        "languages_covered": len(languages),
        "categories_covered": len(categories),
    }
|
182 |
+
|
183 |
+
def validate_submission_data(data: Dict[str, Any]) -> Tuple[bool, str]:
    """Validate a submission payload.

    Checks required identity fields, score ranges (0-1), quality-metric
    ranges (0-10), and that the categorical values are among the
    configured choices. Returns (ok, message).
    """
    # Identity fields must all be present and non-empty.
    for field in ("model_name", "programming_language", "comment_language", "taxonomy_category"):
        if not data.get(field):
            return False, f"Missing required field: {field}"

    # Top-level benchmark scores are fractions in [0, 1].
    for field in ("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"):
        value = data.get(field)
        if value is None:
            return False, f"Missing score: {field}"
        if not isinstance(value, (int, float)):
            return False, f"Invalid score format: {field}"
        if not 0 <= value <= 1:
            return False, f"Score out of range (0-1): {field}"

    # Quality metrics are graded on a 0-10 scale inside the nested dict.
    metrics = data.get("metrics", {})
    for metric in QUALITY_METRICS:
        value = metrics.get(metric)
        if value is None:
            return False, f"Missing metric: {metric}"
        if not isinstance(value, (int, float)):
            return False, f"Invalid metric format: {metric}"
        if not 0 <= value <= 10:
            return False, f"Metric out of range (0-10): {metric}"

    # Categorical fields must come from the configured choice lists.
    if data.get("programming_language") not in PROGRAMMING_LANGUAGES:
        return False, "Invalid programming language"

    if data.get("comment_language") not in COMMENT_LANGUAGES:
        return False, "Invalid comment language"

    if data.get("taxonomy_category") not in TAXONOMY_CATEGORIES:
        return False, "Invalid taxonomy category"

    return True, "Valid submission"
|
226 |
+
|
227 |
+
def get_leaderboard_insights(data: List[Dict]) -> Dict[str, Any]:
    """Derive per-language and per-category performance breakdowns.

    For every configured programming language and taxonomy category that
    has at least one entry, reports the mean Pass@1 score, the entry
    count and the best model, plus the overall top-5 entries by Pass@1.
    Returns an empty dict when there is no data.
    """
    if not data:
        return {}

    def _breakdown(field: str, choices: List[str]) -> Dict[str, Dict[str, Any]]:
        # Skip the leading "All" pseudo-choice; group entries by field value.
        result: Dict[str, Dict[str, Any]] = {}
        for choice in choices[1:]:
            subset = [row for row in data if row.get(field) == choice]
            if not subset:
                continue
            best = max(subset, key=lambda row: row.get("llm_pass_1", 0))
            result[choice] = {
                "avg_score": sum(row.get("llm_pass_1", 0) for row in subset) / len(subset),
                "model_count": len(subset),
                "best_model": best.get("model_name", "")
            }
        return result

    ranked = sorted(data, key=lambda row: row.get("llm_pass_1", 0), reverse=True)

    return {
        "language_performance": _breakdown("programming_language", PROGRAMMING_LANGUAGES),
        "category_performance": _breakdown("taxonomy_category", TAXONOMY_CATEGORIES),
        "top_performers": ranked[:5]
    }
|
262 |
+
|
263 |
+
def export_leaderboard_data(data: List[Dict], format_type: str = "json") -> str:
    """Export leaderboard data as a string in the requested format.

    Args:
        data: Leaderboard entries (dicts, with a nested "metrics" dict of
            quality scores).
        format_type: "json" or "csv" (case-insensitive).

    Returns:
        The serialized data, "" for an empty CSV export, or the string
        "Unsupported format" for any other format name.
    """
    fmt = format_type.lower()

    if fmt == "json":
        return json.dumps(data, indent=2, ensure_ascii=False)

    if fmt == "csv":
        if not data:
            return ""

        # Flat columns first, then one column per nested quality metric.
        headers = ["model_name", "programming_language", "comment_language", "taxonomy_category",
                   "bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
        headers.extend(QUALITY_METRICS)

        # Use the csv module so values containing commas, quotes or
        # newlines are escaped correctly — the previous ",".join approach
        # produced malformed rows for such values.
        import csv
        import io

        buffer = io.StringIO()
        writer = csv.writer(buffer, lineterminator="\n")
        writer.writerow(headers)

        for entry in data:
            row = []
            for header in headers:
                if header in QUALITY_METRICS:
                    # Quality metrics live in the nested "metrics" dict.
                    value = entry.get("metrics", {}).get(header, "")
                else:
                    value = entry.get(header, "")
                row.append(str(value))
            writer.writerow(row)

        # Drop the trailing newline so output keeps the old "\n".join shape.
        return buffer.getvalue().rstrip("\n")

    return "Unsupported format"
|
src/envs.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Environment configuration and constants
|
3 |
+
"""
|
4 |
+
|
5 |
+
import os
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
# Data paths
|
9 |
+
DATA_DIR = Path("data")
|
10 |
+
LEADERBOARD_PATH = DATA_DIR / "leaderboard_data.json"
|
11 |
+
SUBMISSIONS_PATH = DATA_DIR / "submissions.json"
|
12 |
+
|
13 |
+
# Create data directory if it doesn't exist
|
14 |
+
DATA_DIR.mkdir(exist_ok=True)
|
15 |
+
|
16 |
+
# Programming languages supported
|
17 |
+
PROGRAMMING_LANGUAGES = [
|
18 |
+
"All",
|
19 |
+
"Python",
|
20 |
+
"JavaScript",
|
21 |
+
"Java",
|
22 |
+
"C++",
|
23 |
+
"C#",
|
24 |
+
"Go",
|
25 |
+
"Rust",
|
26 |
+
"TypeScript",
|
27 |
+
"PHP",
|
28 |
+
"Ruby",
|
29 |
+
"Swift",
|
30 |
+
"Kotlin",
|
31 |
+
"Scala",
|
32 |
+
"R",
|
33 |
+
"MATLAB",
|
34 |
+
"Other"
|
35 |
+
]
|
36 |
+
|
37 |
+
# Comment languages supported
|
38 |
+
COMMENT_LANGUAGES = [
|
39 |
+
"All",
|
40 |
+
"English",
|
41 |
+
"Chinese",
|
42 |
+
"Spanish",
|
43 |
+
"French",
|
44 |
+
"German",
|
45 |
+
"Japanese",
|
46 |
+
"Korean",
|
47 |
+
"Russian",
|
48 |
+
"Portuguese",
|
49 |
+
"Italian",
|
50 |
+
"Dutch",
|
51 |
+
"Other"
|
52 |
+
]
|
53 |
+
|
54 |
+
# Taxonomy categories
|
55 |
+
TAXONOMY_CATEGORIES = [
|
56 |
+
"All",
|
57 |
+
"Bug Detection",
|
58 |
+
"Code Style",
|
59 |
+
"Performance",
|
60 |
+
"Security",
|
61 |
+
"Maintainability",
|
62 |
+
"Documentation",
|
63 |
+
"Testing",
|
64 |
+
"Architecture",
|
65 |
+
"Best Practices",
|
66 |
+
"Refactoring",
|
67 |
+
"Other"
|
68 |
+
]
|
69 |
+
|
70 |
+
# Quality metrics
|
71 |
+
QUALITY_METRICS = [
|
72 |
+
"readability",
|
73 |
+
"relevance",
|
74 |
+
"explanation_clarity",
|
75 |
+
"problem_identification",
|
76 |
+
"actionability",
|
77 |
+
"completeness",
|
78 |
+
"specificity",
|
79 |
+
"contextual_adequacy",
|
80 |
+
"consistency",
|
81 |
+
"brevity"
|
82 |
+
]
|
83 |
+
|
84 |
+
# Table headers
|
85 |
+
MAIN_HEADERS = ["Model", "Programming Language", "Comment Language", "Taxonomy", "BLEU", "Pass@1", "Pass@5", "Pass@10"]
|
86 |
+
|
87 |
+
QUALITY_HEADERS = ["Model"] + [metric.replace("_", " ").title() for metric in QUALITY_METRICS]
|
88 |
+
|
89 |
+
# Default data
|
90 |
+
DEFAULT_DATA = [{
|
91 |
+
"model_name": "example/model",
|
92 |
+
"programming_language": "Python",
|
93 |
+
"comment_language": "English",
|
94 |
+
"taxonomy_category": "Bug Detection",
|
95 |
+
"bleu": 0.5,
|
96 |
+
"llm_pass_1": 0.5,
|
97 |
+
"llm_pass_5": 0.5,
|
98 |
+
"llm_pass_10": 0.5,
|
99 |
+
"metrics": {
|
100 |
+
"readability": 5, "relevance": 5, "explanation_clarity": 5,
|
101 |
+
"problem_identification": 5, "actionability": 5, "completeness": 5,
|
102 |
+
"specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
|
103 |
+
},
|
104 |
+
"submission_ip": "127.0.0.1",
|
105 |
+
"submission_date": "2024-01-01T00:00:00Z"
|
106 |
+
}]
|
src/leaderboard/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Leaderboard processing module
|
src/leaderboard/processor.py
ADDED
@@ -0,0 +1,306 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Leaderboard data processor for CodeReview Leaderboard
|
3 |
+
"""
|
4 |
+
|
5 |
+
import json
|
6 |
+
import traceback
|
7 |
+
from typing import List, Dict, Any, Optional
|
8 |
+
from datetime import datetime, timezone, timedelta
|
9 |
+
from pathlib import Path
|
10 |
+
from src.envs import LEADERBOARD_PATH, SUBMISSIONS_PATH, DEFAULT_DATA
|
11 |
+
from src.display.utils import validate_submission_data, get_statistics_summary
|
12 |
+
|
13 |
+
class LeaderboardProcessor:
    """Handles all leaderboard data operations.

    Owns two JSON files (paths from src.envs): the leaderboard data file
    and a submission audit log. All load/save methods swallow exceptions
    and report failures via return values and stdout prints.
    """

    def __init__(self):
        # Paths come from src.envs; files are seeded on first use.
        self.leaderboard_path = LEADERBOARD_PATH
        self.submissions_path = SUBMISSIONS_PATH
        self._ensure_data_files()

    def _ensure_data_files(self):
        """Ensure data files exist with default data"""
        if not self.leaderboard_path.exists():
            self.save_leaderboard_data(DEFAULT_DATA)

        if not self.submissions_path.exists():
            self.save_submission_log([])

    def load_leaderboard_data(self) -> List[Dict]:
        """Load leaderboard entries from storage.

        Returns the "leaderboard" list from the JSON file, or a copy of
        DEFAULT_DATA if the file is missing/unreadable.
        """
        try:
            with open(self.leaderboard_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                return data.get("leaderboard", [])
        except Exception as e:
            # NOTE: .copy() is shallow — nested dicts are shared with DEFAULT_DATA.
            print(f"Error loading leaderboard: {e}")
            return DEFAULT_DATA.copy()

    def save_leaderboard_data(self, data: List[Dict]) -> bool:
        """Save leaderboard entries to storage; returns True on success."""
        try:
            # Wrap entries with bookkeeping metadata before persisting.
            to_store = {
                "leaderboard": data,
                "last_updated": datetime.now(timezone.utc).isoformat(),
                "total_entries": len(data)
            }

            with open(self.leaderboard_path, 'w', encoding='utf-8') as f:
                json.dump(to_store, f, indent=2, ensure_ascii=False)

            return True
        except Exception as e:
            print(f"Error saving leaderboard: {e}")
            return False

    def load_submission_log(self) -> List[Dict]:
        """Load the submission audit log; returns [] on any failure."""
        try:
            with open(self.submissions_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                return data.get("submissions", [])
        except Exception as e:
            print(f"Error loading submission log: {e}")
            return []

    def save_submission_log(self, submissions: List[Dict]) -> bool:
        """Save the submission audit log; returns True on success."""
        try:
            to_store = {
                "submissions": submissions,
                "last_updated": datetime.now(timezone.utc).isoformat(),
                "total_submissions": len(submissions)
            }

            with open(self.submissions_path, 'w', encoding='utf-8') as f:
                json.dump(to_store, f, indent=2, ensure_ascii=False)

            return True
        except Exception as e:
            print(f"Error saving submission log: {e}")
            return False

    def add_submission(self, submission_data: Dict[str, Any], ip_address: str) -> tuple[bool, str]:
        """Validate and record a new submission.

        Stamps the submission with the caller's IP and a UTC timestamp,
        replaces any existing entry for the same model name, persists the
        leaderboard, and appends an audit-log record. Returns (ok, message).
        """
        try:
            # Validate submission data
            is_valid, message = validate_submission_data(submission_data)
            if not is_valid:
                return False, message

            # Add metadata (mutates the caller's dict)
            submission_data["submission_ip"] = ip_address
            submission_data["submission_date"] = datetime.now(timezone.utc).isoformat()

            # Load current data
            current_data = self.load_leaderboard_data()

            # Check for existing model and replace if found — one entry per model name.
            model_name = submission_data.get("model_name", "")
            current_data = [entry for entry in current_data if entry.get("model_name") != model_name]

            # Add new submission
            current_data.append(submission_data)

            # Save updated data
            if self.save_leaderboard_data(current_data):
                # Log the submission (best-effort; failures are printed, not raised)
                self._log_submission(submission_data, ip_address)
                return True, "✅ Submission recorded successfully!"
            else:
                return False, "❌ Failed to save submission"

        except Exception as e:
            print(f"Error adding submission: {e}")
            traceback.print_exc()
            return False, f"❌ Submission failed: {str(e)}"

    def _log_submission(self, submission_data: Dict[str, Any], ip_address: str):
        """Append an audit-trail record for an accepted submission (best-effort)."""
        try:
            submissions = self.load_submission_log()

            # Only identity + scores are logged, not the full metrics dict.
            log_entry = {
                "model_name": submission_data.get("model_name"),
                "programming_language": submission_data.get("programming_language"),
                "comment_language": submission_data.get("comment_language"),
                "taxonomy_category": submission_data.get("taxonomy_category"),
                "scores": {
                    "bleu": submission_data.get("bleu"),
                    "llm_pass_1": submission_data.get("llm_pass_1"),
                    "llm_pass_5": submission_data.get("llm_pass_5"),
                    "llm_pass_10": submission_data.get("llm_pass_10")
                },
                "submission_ip": ip_address,
                "submission_date": submission_data.get("submission_date"),
                "status": "accepted"
            }

            submissions.append(log_entry)

            # Keep only last 1000 submissions
            submissions = submissions[-1000:]

            self.save_submission_log(submissions)

        except Exception as e:
            print(f"Error logging submission: {e}")

    def get_model_history(self, model_name: str) -> List[Dict]:
        """Get all logged submissions for a specific model name."""
        try:
            submissions = self.load_submission_log()
            return [
                sub for sub in submissions
                if sub.get("model_name") == model_name
            ]
        except Exception as e:
            print(f"Error getting model history: {e}")
            return []

    def get_ip_submissions(self, ip_address: str, limit: int = 10) -> List[Dict]:
        """Get the most recent logged submissions from a specific IP."""
        try:
            submissions = self.load_submission_log()
            ip_submissions = [
                sub for sub in submissions
                if sub.get("submission_ip") == ip_address
            ]

            # Sort by ISO-8601 date string (lexicographic == chronological) and limit.
            ip_submissions.sort(key=lambda x: x.get("submission_date", ""), reverse=True)
            return ip_submissions[:limit]

        except Exception as e:
            print(f"Error getting IP submissions: {e}")
            return []

    def check_rate_limit(self, ip_address: str, max_submissions: int = 5, hours: int = 24) -> tuple[bool, str]:
        """Check whether an IP is under the submission rate limit.

        Returns (allowed, message). On any internal error the check
        fails open and the submission is allowed.
        """
        try:
            # NOTE(review): only the latest max_submissions*2 entries are
            # inspected — confirm this window cannot undercount in practice.
            submissions = self.get_ip_submissions(ip_address, max_submissions * 2)

            # Count submissions within the time window
            cutoff_time = datetime.now(timezone.utc) - timedelta(hours=hours)
            # NOTE(review): fromisoformat("") raises ValueError for entries
            # without a date; that is caught below and allows the submission.
            recent_submissions = [
                sub for sub in submissions
                if datetime.fromisoformat(sub.get("submission_date", "")).replace(tzinfo=timezone.utc) > cutoff_time
            ]

            if len(recent_submissions) >= max_submissions:
                return False, f"Rate limit exceeded: {len(recent_submissions)}/{max_submissions} submissions in {hours} hours"

            return True, f"Rate limit OK: {len(recent_submissions)}/{max_submissions} submissions in {hours} hours"

        except Exception as e:
            print(f"Error checking rate limit: {e}")
            return True, "Rate limit check failed, allowing submission"

    def get_leaderboard_stats(self) -> Dict[str, Any]:
        """Get comprehensive leaderboard statistics; {} on failure."""
        try:
            data = self.load_leaderboard_data()
            submissions = self.load_submission_log()

            basic_stats = get_statistics_summary(data)

            # Count submissions logged in the past 7 days.
            recent_submissions = len([
                sub for sub in submissions
                if datetime.fromisoformat(sub.get("submission_date", "")).replace(tzinfo=timezone.utc) >
                datetime.now(timezone.utc) - timedelta(days=7)
            ])

            return {
                **basic_stats,
                "recent_submissions_7d": recent_submissions,
                "total_logged_submissions": len(submissions),
                "last_updated": datetime.now(timezone.utc).isoformat()
            }

        except Exception as e:
            print(f"Error getting leaderboard stats: {e}")
            return {}

    def backup_data(self) -> bool:
        """Copy both data files into backups/ with a timestamp suffix."""
        try:
            # Local (naive) time is used only to build the backup filename.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_dir = Path("backups")
            backup_dir.mkdir(exist_ok=True)

            # Backup leaderboard
            if self.leaderboard_path.exists():
                backup_path = backup_dir / f"leaderboard_{timestamp}.json"
                with open(self.leaderboard_path, 'r') as src, open(backup_path, 'w') as dst:
                    dst.write(src.read())

            # Backup submissions
            if self.submissions_path.exists():
                backup_path = backup_dir / f"submissions_{timestamp}.json"
                with open(self.submissions_path, 'r') as src, open(backup_path, 'w') as dst:
                    dst.write(src.read())

            return True

        except Exception as e:
            print(f"Error creating backup: {e}")
            return False

    def export_data(self, format_type: str = "json") -> str:
        """Export leaderboard data via src.display.utils.export_leaderboard_data."""
        try:
            # Imported lazily to avoid a circular import at module load time.
            from src.display.utils import export_leaderboard_data

            data = self.load_leaderboard_data()
            return export_leaderboard_data(data, format_type)

        except Exception as e:
            print(f"Error exporting data: {e}")
            return f"Export failed: {str(e)}"

    def validate_data_integrity(self) -> Dict[str, Any]:
        """Validate stored data and return a report dict.

        Checks for duplicate model names, missing required fields and
        out-of-range scores; never raises.
        """
        try:
            data = self.load_leaderboard_data()
            submissions = self.load_submission_log()

            issues = []

            # Check for duplicate models
            model_names = [entry.get("model_name") for entry in data]
            duplicates = [name for name in model_names if model_names.count(name) > 1]
            if duplicates:
                issues.append(f"Duplicate models found: {set(duplicates)}")

            # Check for missing required fields
            required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
            for i, entry in enumerate(data):
                missing = [field for field in required_fields if not entry.get(field)]
                if missing:
                    issues.append(f"Entry {i}: Missing fields {missing}")

            # Check score ranges (all scores are fractions in [0, 1])
            for i, entry in enumerate(data):
                scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
                for score in scores:
                    value = entry.get(score)
                    if value is not None and (value < 0 or value > 1):
                        issues.append(f"Entry {i}: {score} out of range: {value}")

            return {
                "is_valid": len(issues) == 0,
                "issues": issues,
                "total_entries": len(data),
                "total_submissions": len(submissions),
                "check_date": datetime.now(timezone.utc).isoformat()
            }

        except Exception as e:
            return {
                "is_valid": False,
                "issues": [f"Validation failed: {str(e)}"],
                "total_entries": 0,
                "total_submissions": 0,
                "check_date": datetime.now(timezone.utc).isoformat()
            }
|
src/submission/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
# Submission handling module
|
src/submission/submit.py
ADDED
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Submission system for CodeReview Leaderboard
|
3 |
+
"""
|
4 |
+
|
5 |
+
import gradio as gr
|
6 |
+
import re
|
7 |
+
from typing import Dict, Any, List, Tuple
|
8 |
+
from datetime import datetime, timezone
|
9 |
+
from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
|
10 |
+
from src.leaderboard.processor import LeaderboardProcessor
|
11 |
+
from src.display.utils import get_main_leaderboard_data, get_quality_metrics_data
|
12 |
+
|
13 |
+
class SubmissionHandler:
|
14 |
+
"""Handles model submissions with validation and rate limiting"""
|
15 |
+
|
16 |
+
def __init__(self):
|
17 |
+
self.processor = LeaderboardProcessor()
|
18 |
+
|
19 |
+
def get_client_ip(self, request: gr.Request) -> str:
    """Best-effort extraction of the caller's IP address.

    Prefers proxy headers (X-Forwarded-For, then X-Real-IP) over the
    direct client host, and falls back to 127.0.0.1 on any failure.
    """
    try:
        # Behind a proxy, the original client is the first hop listed
        # in X-Forwarded-For.
        forwarded_for = request.headers.get('X-Forwarded-For')
        if forwarded_for:
            return forwarded_for.split(',')[0].strip()

        real_ip = request.headers.get('X-Real-IP')
        if real_ip:
            return real_ip.strip()

        # Direct connection: use the transport-level peer address.
        if hasattr(request, 'client') and hasattr(request.client, 'host'):
            return request.client.host
    except Exception as e:
        print(f"Error getting client IP: {e}")

    # Shared fallback for both "nothing found" and error paths.
    return "127.0.0.1"
|
44 |
+
|
45 |
+
def validate_model_name(self, model_name: str) -> Tuple[bool, str]:
    """Check that a model name is non-empty, at most 100 characters,
    uses only allowed characters, and — when slash-qualified — follows
    the 'organization/model' format. Returns (ok, message)."""
    if not model_name or not model_name.strip():
        return False, "Model name cannot be empty"

    candidate = model_name.strip()

    if len(candidate) > 100:
        return False, "Model name too long (max 100 characters)"

    # Whitelist: letters, digits, dot, underscore, slash, hyphen.
    if not re.match(r'^[a-zA-Z0-9._/-]+$', candidate):
        return False, "Model name contains invalid characters (only letters, numbers, dots, hyphens, underscores, and slashes allowed)"

    if "/" in candidate:
        org, _, model = candidate.partition("/")
        # A second slash would mean more than two segments.
        if "/" in model:
            return False, "Model name should be in format 'organization/model'"
        if not org or not model:
            return False, "Both organization and model name must be specified"

    return True, "Valid model name"
|
69 |
+
|
70 |
+
def validate_scores(self, scores: Dict[str, float]) -> Tuple[bool, str]:
    """Validate the four benchmark scores.

    Each must be present, numeric, and within [0, 1]; Pass@k values must
    be non-decreasing in k. Returns (ok, message).
    """
    for score_name in ("bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"):
        value = scores.get(score_name)

        if value is None:
            return False, f"Missing score: {score_name}"

        if not isinstance(value, (int, float)):
            return False, f"Invalid score format for {score_name}: must be a number"

        if value < 0 or value > 1:
            return False, f"Score {score_name} out of range: {value} (must be between 0 and 1)"

    # Pass@k is monotone non-decreasing in k by definition, so enforce ordering.
    if scores["llm_pass_1"] > scores["llm_pass_5"]:
        return False, "Pass@1 score cannot be higher than Pass@5"

    if scores["llm_pass_5"] > scores["llm_pass_10"]:
        return False, "Pass@5 score cannot be higher than Pass@10"

    return True, "Valid scores"
|
94 |
+
|
95 |
+
def validate_metrics(self, metrics: Dict[str, int]) -> Tuple[bool, str]:
    """Validate the quality metrics dict.

    Every metric listed in QUALITY_METRICS must be present, numeric, and
    within [0, 10]. Returns (ok, message).
    """
    for metric_name in QUALITY_METRICS:
        value = metrics.get(metric_name)

        if value is None:
            return False, f"Missing metric: {metric_name}"

        if not isinstance(value, (int, float)):
            return False, f"Invalid metric format for {metric_name}: must be a number"

        # Quality metrics are graded on a 0-10 scale.
        if value < 0 or value > 10:
            return False, f"Metric {metric_name} out of range: {value} (must be between 0 and 10)"

    return True, "Valid metrics"
|
110 |
+
|
111 |
+
def submit_model(
    self,
    request: gr.Request,
    current_data: List[Dict],
    model_name: str,
    programming_language: str,
    comment_language: str,
    taxonomy_category: str,
    bleu: float,
    llm_pass_1: float,
    llm_pass_5: float,
    llm_pass_10: float,
    readability: int,
    relevance: int,
    explanation_clarity: int,
    problem_identification: int,
    actionability: int,
    completeness: int,
    specificity: int,
    contextual_adequacy: int,
    consistency: int,
    brevity: int,
) -> Tuple[List[Dict], List[List[str]], List[List[str]], str]:
    """Handle a model submission end-to-end.

    Pipeline: resolve client IP -> rate-limit check -> validate model
    name, scores and metrics -> persist via LeaderboardProcessor ->
    rebuild the display tables.

    Returns:
        (leaderboard_data, main_table_rows, quality_table_rows, status_message).
        On any failure the original ``current_data`` is returned with
        empty tables and an error message.
        NOTE(review): failure paths return empty tables — confirm the UI
        keeps the previous table contents in that case.
    """

    try:
        # Get client IP (proxy-aware; used for rate limiting and audit logging)
        client_ip = self.get_client_ip(request)

        # Check rate limiting before doing any validation work
        rate_ok, rate_msg = self.processor.check_rate_limit(client_ip)
        if not rate_ok:
            return current_data, [], [], f"❌ {rate_msg}"

        # Validate model name
        name_valid, name_msg = self.validate_model_name(model_name)
        if not name_valid:
            return current_data, [], [], f"❌ {name_msg}"

        # Validate scores (each in [0, 1], Pass@k monotone)
        scores = {
            "bleu": bleu,
            "llm_pass_1": llm_pass_1,
            "llm_pass_5": llm_pass_5,
            "llm_pass_10": llm_pass_10
        }
        scores_valid, scores_msg = self.validate_scores(scores)
        if not scores_valid:
            return current_data, [], [], f"❌ {scores_msg}"

        # Validate quality metrics (each in [0, 10])
        metrics = {
            "readability": readability,
            "relevance": relevance,
            "explanation_clarity": explanation_clarity,
            "problem_identification": problem_identification,
            "actionability": actionability,
            "completeness": completeness,
            "specificity": specificity,
            "contextual_adequacy": contextual_adequacy,
            "consistency": consistency,
            "brevity": brevity,
        }
        metrics_valid, metrics_msg = self.validate_metrics(metrics)
        if not metrics_valid:
            return current_data, [], [], f"❌ {metrics_msg}"

        # Create submission data (IP/timestamp metadata is added by the processor)
        submission_data = {
            "model_name": model_name.strip(),
            "programming_language": programming_language,
            "comment_language": comment_language,
            "taxonomy_category": taxonomy_category,
            "bleu": bleu,
            "llm_pass_1": llm_pass_1,
            "llm_pass_5": llm_pass_5,
            "llm_pass_10": llm_pass_10,
            "metrics": metrics
        }

        # Submit to processor (validates again, replaces same-name entries, persists)
        success, message = self.processor.add_submission(submission_data, client_ip)

        if success:
            # Load updated data so the UI reflects the just-saved state
            updated_data = self.processor.load_leaderboard_data()

            # Format tables for the two leaderboard views
            main_table = get_main_leaderboard_data(updated_data)
            quality_table = get_quality_metrics_data(updated_data)

            return updated_data, main_table, quality_table, message
        else:
            return current_data, [], [], message

    except Exception as e:
        print(f"Error in submission: {e}")
        return current_data, [], [], f"❌ Submission failed: {str(e)}"
|
209 |
+
|
210 |
+
def get_submission_form_components(self):
    """Create gradio components for submission form.

    Builds the whole "Submit New Model Results" accordion (identity fields,
    performance score inputs, quality-metric sliders, submit button, status
    area) and returns every component in a dict so the caller can wire up
    event handlers.
    """

    def _score_number(label, info):
        # All four performance scores share the same 0.0-1.0 numeric config.
        return gr.Number(
            label=label,
            value=0.0,
            minimum=0.0,
            maximum=1.0,
            step=0.001,
            info=info,
        )

    def _quality_slider(label, info):
        # All ten quality metrics share the same 0-10 integer slider config.
        return gr.Slider(
            minimum=0, maximum=10, value=5, step=1,
            label=label, info=info,
        )

    with gr.Accordion("📝 Submit New Model Results", open=False):
        gr.Markdown("""
        ### Submission Guidelines
        - Provide accurate scores based on proper evaluation
        - Model name should follow 'organization/model' format
        - All metrics are required
        - Submissions are rate-limited per IP address
        """)

        # Identity / categorization fields.
        with gr.Row():
            model_name = gr.Textbox(
                label="Model Name",
                placeholder="e.g., microsoft/CodeT5-base",
                info="Use organization/model format",
            )
            programming_language = gr.Dropdown(
                choices=PROGRAMMING_LANGUAGES,
                value="All",
                label="Programming Language",
                info="Primary programming language evaluated",
            )
            comment_language = gr.Dropdown(
                choices=COMMENT_LANGUAGES,
                value="English",
                label="Comment Language",
                info="Natural language of code comments",
            )
            taxonomy_category = gr.Dropdown(
                choices=TAXONOMY_CATEGORIES,
                value="All",
                label="Taxonomy Category",
                info="Primary review category focus",
            )

        gr.Markdown("### 📊 Performance Scores (0.0 - 1.0)")
        with gr.Row():
            bleu = _score_number("BLEU Score", "BLEU similarity score")
            pass1 = _score_number("Pass@1", "Success rate in 1 attempt")
            pass5 = _score_number("Pass@5", "Success rate in 5 attempts")
            pass10 = _score_number("Pass@10", "Success rate in 10 attempts")

        gr.Markdown("### 📋 Quality Metrics (0 - 10)")
        with gr.Row():
            readability = _quality_slider(
                "Readability", "How readable are the generated reviews?")
            relevance = _quality_slider(
                "Relevance", "How relevant to the code changes?")
            explanation_clarity = _quality_slider(
                "Explanation Clarity", "How clear are the explanations?")
            problem_identification = _quality_slider(
                "Problem Identification", "How well does it identify issues?")
            actionability = _quality_slider(
                "Actionability", "How actionable are the suggestions?")

        with gr.Row():
            completeness = _quality_slider(
                "Completeness", "How complete are the reviews?")
            specificity = _quality_slider(
                "Specificity", "How specific are the comments?")
            contextual_adequacy = _quality_slider(
                "Contextual Adequacy", "How well does it understand context?")
            consistency = _quality_slider(
                "Consistency", "How consistent across reviews?")
            brevity = _quality_slider(
                "Brevity", "How concise are the reviews?")

        submit_btn = gr.Button("🚀 Submit Model", variant="primary")
        status_msg = gr.Markdown("")

    # Return all components for use in the main app.
    return {
        "model_name": model_name,
        "programming_language": programming_language,
        "comment_language": comment_language,
        "taxonomy_category": taxonomy_category,
        "bleu": bleu,
        "pass1": pass1,
        "pass5": pass5,
        "pass10": pass10,
        "readability": readability,
        "relevance": relevance,
        "explanation_clarity": explanation_clarity,
        "problem_identification": problem_identification,
        "actionability": actionability,
        "completeness": completeness,
        "specificity": specificity,
        "contextual_adequacy": contextual_adequacy,
        "consistency": consistency,
        "brevity": brevity,
        "submit_btn": submit_btn,
        "status_msg": status_msg,
    }
364 |
+
def get_submission_history(self, ip_address: str) -> List[List[str]]:
    """Build display rows for every submission made from *ip_address*.

    Each row is: [model name, programming language, comment language,
    taxonomy category, Pass@1 (3 decimals), submission date (YYYY-MM-DD),
    status]. Missing fields render as empty strings; any error during
    retrieval or formatting yields an empty list.
    """

    def _row(entry: dict) -> List[str]:
        # Date arrives as an ISO timestamp; keep only the date part.
        stamp = entry.get("submission_date") or ""
        return [
            entry.get("model_name", ""),
            entry.get("programming_language", ""),
            entry.get("comment_language", ""),
            entry.get("taxonomy_category", ""),
            f"{entry.get('scores', {}).get('llm_pass_1', 0):.3f}",
            stamp.split("T")[0] if stamp else "",
            entry.get("status", ""),
        ]

    try:
        entries = self.processor.get_ip_submissions(ip_address)
        return [_row(entry) for entry in entries]
    except Exception as exc:
        print(f"Error getting submission history: {exc}")
        return []