kenkaneki committed
Commit 94789e6 · 2 Parent(s): b31be61 346c3c5

force merge remake into main

.env.template ADDED
@@ -0,0 +1,6 @@
+ HF_TOKEN="your_huggingface_write_token"
+ OWNER="your_huggingface_username_or_org"
+ RESULTS_DATASET_ID="your_username/guardbench-results"
+ SUBMITTER_TOKEN="your_secret_submission_token"
+ ADMIN_USERNAME="admin"
+ ADMIN_PASSWORD="password" # Change this!
.gitignore CHANGED
@@ -1,13 +1,52 @@
- auto_evals/
- venv/
  __pycache__/
  .env
- .ipynb_checkpoints
- *ipynb
  .vscode/

  eval-queue/
  eval-results/
  eval-queue-bk/
  eval-results-bk/
- logs/

+ # Python
  __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ .venv/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ .gradio/
+
+ # Environment variables
  .env
+
+ # Virtual Environment
+ venv/
+ ENV/
+
+ # IDE
+ .idea/
  .vscode/
+ *.swp
+ *.swo
+
+ # OS
+ .DS_Store
+ Thumbs.db

+ # Hugging Face cache
  eval-queue/
  eval-results/
  eval-queue-bk/
  eval-results-bk/
+
+ # Data files
+ data/
+
+ # Versioned leaderboard files
+ data/leaderboard_v*.json
.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "guard-bench-submodule"]
+ path = guard-bench-submodule
+ url = https://github.com/whitecircle-ai/circle-guard-bench.git
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
README.md CHANGED
@@ -1,222 +1,165 @@
1
  ---
2
- title: CodeReview Leaderboard
3
- emoji: 🥇
4
- colorFrom: green
5
  colorTo: indigo
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: true
9
- license: mit
10
- short_description: CodeReview Leaderboard for evaluating code review models
11
- sdk_version: 5.19.0
12
- storage: persistent
13
- ---
14
-
15
- # 🏆 CodeReview Leaderboard
16
-
17
- A comprehensive benchmark and leaderboard for code review generation models, inspired by [circle-guard-bench](https://huggingface.co/spaces/whitecircle-ai/circle-guard-bench).
18
-
19
- ## ✨ Features
20
-
21
- ### 🎯 Core Functionality
22
-
23
- - **Multi-dimensional Evaluation**: Track models across BLEU scores, Pass@1/5/10 metrics, and 10 quality dimensions
24
- - **Advanced Filtering**: Filter results by programming language, comment language, and taxonomy category
25
- - **Real-time Updates**: Dynamic leaderboard updates with instant filtering
26
- - **Dark Theme**: Modern, eye-friendly interface with GitHub-inspired dark theme
27
-
28
- ### 🔍 Advanced Analytics
29
-
30
- - **Language Performance**: Compare model performance across programming languages
31
- - **Category Analysis**: Analyze performance by review type (bug detection, security, etc.)
32
- - **Submission History**: Track all submissions with IP-based logging
33
- - **Statistical Insights**: Comprehensive statistics and trend analysis
34
 
35
- ### 🛡️ Security & Quality
36
-
37
- - **IP-based Rate Limiting**: Prevent spam submissions (5 per 24 hours per IP)
38
- - **Comprehensive Validation**: Multi-layer validation for all submissions
39
- - **Audit Trail**: Complete submission logging for transparency
40
- - **Data Integrity**: Automatic data validation and backup systems
41
-
42
- ### 🌐 Multi-Language Support
43
-
44
- - **Programming Languages**: Python, JavaScript, Java, C++, Go, Rust, and more
45
- - **Comment Languages**: English, Chinese, Spanish, French, German, Japanese, and more
46
- - **Taxonomy Categories**: Bug Detection, Security, Performance, Style, and more
47
-
48
- ## 🚀 Quick Start
49
 
50
- ### Installation
51
 
52
  ```bash
53
  pip install -r requirements.txt
54
  ```
55
 
56
- ### Run Locally
57
 
58
  ```bash
59
  python app.py
60
  ```
61
 
62
- ### Access the Interface
63
-
64
- Open your browser to `http://localhost:7860`
65
-
66
- ## 📊 Usage Guide
67
-
68
- ### 1. Viewing the Leaderboard
69
-
70
- - Navigate to the **🏆 Leaderboard** tab
71
- - Use the filter dropdowns to narrow results:
72
- - **Programming Language**: Filter by specific programming languages
73
- - **Comment Language**: Filter by natural language of comments
74
- - **Taxonomy Category**: Filter by review category type
75
- - Click **🔄 Refresh** to update data
76
-
77
- ### 2. Submitting Models
78
-
79
- - Go to the **📝 Submit Model** tab
80
- - Fill in the submission form:
81
- - **Model Name**: Use `organization/model` format
82
- - **Languages & Category**: Select appropriate filters
83
- - **Performance Scores**: Provide BLEU and Pass@k scores (0.0-1.0)
84
- - **Quality Metrics**: Rate across 10 dimensions (0-10)
85
- - Click **🚀 Submit Model** to add your results
86
-
87
- ### 3. Analytics & Insights
88
-
89
- - Visit the **📈 Analytics** tab to see:
90
- - Recent submission history
91
- - Language performance comparisons
92
- - Category performance analysis
93
- - Trends and patterns
94
-
95
- ### 4. Data Export
96
-
97
- - Use the **ℹ️ About** tab to export data in JSON or CSV format
98
- - Full leaderboard data available for research and analysis
99
-
100
- ## 🏗️ Architecture
101
-
102
- ### Directory Structure
103
-
104
  ```
105
- ├── src/
106
- │ ├── about.py # About page content
107
- │ ├── envs.py # Environment configuration
108
- │ ├── display/ # Display utilities
109
- │ │ ├── css_html_js.py # Styling and themes
110
- │ │ ├── formatting.py # Data formatting
111
- │ │ └── utils.py # Display utilities
112
- │ ├── leaderboard/ # Leaderboard processing
113
- │ │ └── processor.py # Data operations
114
- │ └── submission/ # Submission handling
115
- │ └── submit.py # Submission validation
116
- ├── data/ # Data storage
117
- │ ├── leaderboard_data.json # Main leaderboard
118
- │ └── submissions.json # Submission log
119
- ├── app.py # Main application
120
- └── requirements.txt # Dependencies
121
- ```
122
-
123
- ### Key Components
124
-
125
- - **LeaderboardProcessor**: Handles all data operations, validation, and persistence
126
- - **SubmissionHandler**: Manages model submissions with IP tracking and validation
127
- - **Display Utils**: Provides filtering, formatting, and table generation
128
- - **Dark Theme**: Custom CSS for modern, accessible interface
129
 
130
- ## 🎨 Features Inspired by circle-guard-bench
131
 
132
- ### Implemented Features
133
 
134
- - **Multi-tab Interface**: Organized navigation with dedicated sections
135
- - **Advanced Filtering**: Real-time filtering by multiple criteria
136
- - **Dark Theme**: Modern, GitHub-inspired dark interface
137
- - **IP-based Submissions**: Secure submission tracking
138
- - **Comprehensive Analytics**: Detailed performance insights
139
- - **Data Export**: Multiple export formats
140
- - **Rate Limiting**: Anti-spam protection
141
-
142
- ### 🔧 Technical Improvements
143
-
144
- - **Modular Architecture**: Clean separation of concerns
145
- - **Type Safety**: Full type annotations throughout
146
- - **Error Handling**: Comprehensive error handling and logging
147
- - **Data Validation**: Multi-layer validation with Pydantic
148
- - **Performance**: Optimized data processing and display
149
-
150
- ## 📈 Metrics & Evaluation
151
-
152
- ### Performance Metrics
153
-
154
- - **BLEU**: Text similarity score (0.0-1.0)
155
- - **Pass@1**: Success rate in single attempt (0.0-1.0)
156
- - **Pass@5**: Success rate in 5 attempts (0.0-1.0)
157
- - **Pass@10**: Success rate in 10 attempts (0.0-1.0)
158
-
159
- ### Quality Dimensions
160
-
161
- 1. **Readability**: How clear and readable are the reviews?
162
- 2. **Relevance**: How relevant to the code changes?
163
- 3. **Explanation Clarity**: How well does it explain issues?
164
- 4. **Problem Identification**: How effectively does it identify problems?
165
- 5. **Actionability**: How actionable are the suggestions?
166
- 6. **Completeness**: How thorough are the reviews?
167
- 7. **Specificity**: How specific are the comments?
168
- 8. **Contextual Adequacy**: How well does it understand context?
169
- 9. **Consistency**: How consistent across different reviews?
170
- 10. **Brevity**: How concise without losing important information?
171
-
172
- ## 🔒 Security Features
173
-
174
- ### Rate Limiting
175
-
176
- - **5 submissions per IP per 24 hours**
177
- - **Automatic IP tracking and logging**
178
- - **Graceful error handling for rate limits**
179
-
180
- ### Data Validation
181
-
182
- - **Model name format validation**
183
- - **Score range validation (0.0-1.0 for performance, 0-10 for quality)**
184
- - **Logical consistency checks (Pass@1 ≤ Pass@5 ≤ Pass@10)**
185
- - **Required field validation**
186
-
187
- ### Audit Trail
188
-
189
- - **Complete submission logging**
190
- - **IP address tracking (partially masked for privacy)**
191
- - **Timestamp recording**
192
- - **Data integrity checks**
193
-
194
- ## 🤝 Contributing
195
-
196
- 1. Fork the repository
197
- 2. Create a feature branch
198
- 3. Make your changes
199
- 4. Add tests if applicable
200
- 5. Submit a pull request
201
-
202
- ## 📄 License
203
-
204
- This project is licensed under the MIT License - see the LICENSE file for details.
205
-
206
- ## 🙏 Acknowledgments
207
-
208
- - Inspired by [circle-guard-bench](https://huggingface.co/spaces/whitecircle-ai/circle-guard-bench)
209
- - Built with [Gradio](https://gradio.app/) for the web interface
210
- - Thanks to the open-source community for tools and inspiration
211
-
212
- ## 📞 Support
213
-
214
- For questions, issues, or contributions:
215
-
216
- - Open an issue on GitHub
217
- - Check the documentation
218
- - Contact the maintainers
219
-
220
- ---
221
 
222
- **Built with ❤️ for the code review research community**
1
  ---
2
+ title: CircleGuardBench
3
+ emoji:
4
+ colorFrom: gray
5
  colorTo: indigo
6
  sdk: gradio
7
+ sdk_version: 4.44.1
8
  app_file: app.py
9
  pinned: true
10
+ short_description: First benchmark testing LLM guards on safety and accuracy.
11
+ models:
12
+ - AtlaAI/Selene-1-Mini-Llama-3.1-8B
13
+ - google/gemma-3-12b-it
14
+ - google/gemma-3-4b-it
15
+ - meta-llama/Llama-3.1-8B-Instruct
16
+ - meta-llama/Llama-3.2-3B-Instruct
17
+ - meta-llama/Llama-4-Maverick-17B-128E-Instruct
18
+ - meta-llama/Llama-4-Scout-17B-16E-Instruct
19
+ - meta-llama/Llama-Guard-3-1B
20
+ - meta-llama/Llama-Guard-3-8B
21
+ - meta-llama/Llama-Guard-4-12B
22
+ - mistralai/Ministral-8B-Instruct-2410
23
+ - mistralai/Mistral-Small-3.1-24B-Instruct-2503
24
+ - Qwen/Qwen2.5-7B-Instruct
25
+ - Qwen/Qwen3-0.6B
26
+ - Qwen/Qwen3-1.7B
27
+ - Qwen/Qwen3-4B
28
+ - Qwen/Qwen3-8B
29
 
30
+ ---
31
 
32
+ # CodeReview Bench Leaderboard
33
+
34
+ A comprehensive leaderboard for evaluating automated code review systems across programming languages and review quality dimensions.
35
+
36
+ ## Features
37
+
38
+ - **Multi-Language Support**: Evaluates models across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more
39
+ - **Dual Language Comments**: Supports both Russian and English comment languages
40
+ - **Comprehensive Metrics**:
41
+ - LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity)
42
+ - Exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
43
+ - **Interactive Visualization**: Compare model performance across categories with radar plots
44
+ - **Easy Submission**: Submit your model results via web interface
45
+
46
+ ## Metrics
47
+
48
+ ### LLM-based Multimetric
49
+
50
+ - **Readability**: How easy the review is to understand
51
+ - **Relevance**: How relevant the review is to the code
52
+ - **Explanation Clarity**: How clear the explanations are
53
+ - **Problem Identification**: How well problems are identified
54
+ - **Actionability**: How actionable the suggestions are
55
+ - **Completeness**: How complete the review is
56
+ - **Specificity**: How specific the feedback is
57
+ - **Contextual Adequacy**: How well the review fits the context
58
+ - **Consistency**: How consistent the review style is
59
+ - **Brevity**: How concise the review is
60
+
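Each of these dimensions appears as a 0-10 numeric field in the submission format shown later in this README (e.g. `"readability": 8.5`). As an illustration only, and not necessarily the aggregation the leaderboard itself applies, the ten dimensions of one record can be averaged into a single LLM-based score:

```python
# Illustrative sketch: unweighted mean of the ten LLM-judged dimensions
# for one submission record. The leaderboard's own aggregation may differ.
LLM_DIMENSIONS = [
    "readability", "relevance", "explanation_clarity", "problem_identification",
    "actionability", "completeness", "specificity", "contextual_adequacy",
    "consistency", "brevity",
]

def mean_llm_score(record: dict) -> float:
    """Average the ten 0-10 dimension scores of a single record."""
    return sum(record[dim] for dim in LLM_DIMENSIONS) / len(LLM_DIMENSIONS)
```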
61
+ ### Exact-Match Metrics
62
+
63
+ - **Pass@1**: Percentage of correct reviews on first attempt
64
+ - **Pass@5**: Percentage of correct reviews in top 5 attempts
65
+ - **Pass@10**: Percentage of correct reviews in top 10 attempts
66
+ - **BLEU@10**: BLEU score for top 10 review candidates
67
+
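Pass@k values like these are commonly estimated per sample from n generated candidate reviews, of which c are judged correct, and then averaged across samples. A minimal sketch of the standard unbiased estimator (an illustration of the usual formula, not necessarily the exact scoring code used by this benchmark):

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k: chance that at least one of k candidates drawn
    (without replacement) from n generations with c correct ones is correct."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Example: 10 candidate reviews per sample, 3 judged correct.
print(round(pass_at_k(10, 3, 1), 3))   # 0.3
print(round(pass_at_k(10, 3, 5), 3))   # 0.917
print(round(pass_at_k(10, 3, 10), 3))  # 1.0
```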
68
+ ## Programming Languages Supported
69
+
70
+ - Python
71
+ - JavaScript
72
+ - Java
73
+ - C++
74
+ - C#
75
+ - TypeScript
76
+ - Go
77
+ - Rust
78
+ - Swift
79
+ - Kotlin
80
+ - Ruby
81
+ - PHP
82
+ - C
83
+ - Scala
84
+ - R
85
+ - Dart
86
+ - Other
87
+
88
+ ## Comment Languages
89
+
90
+ - Russian (ru)
91
+ - English (en)
92
+
93
+ ## Example Categories
94
+
95
+ - Bug Fix
96
+ - Code Style
97
+ - Performance
98
+ - Security
99
+ - Refactoring
100
+ - Documentation
101
+ - Testing
102
+ - Architecture
103
+ - Other
104
+
105
+ ## Installation
106
 
107
  ```bash
108
  pip install -r requirements.txt
109
  ```
110
 
111
+ ## Usage
112
 
113
  ```bash
114
  python app.py
115
  ```
116
 
117
+ ## Submission Format
118
+
119
+ Submit your results as a JSONL file where each line contains:
120
+
121
+ ```json
122
+ {
123
+ "model_name": "your-model-name",
124
+ "programming_language": "python",
125
+ "comment_language": "en",
126
+ "readability": 8.5,
127
+ "relevance": 9.0,
128
+ "explanation_clarity": 7.8,
129
+ "problem_identification": 8.2,
130
+ "actionability": 8.7,
131
+ "completeness": 8.0,
132
+ "specificity": 7.5,
133
+ "contextual_adequacy": 8.3,
134
+ "consistency": 8.8,
135
+ "brevity": 7.2,
136
+ "pass_at_1": 0.75,
137
+ "pass_at_5": 0.88,
138
+ "pass_at_10": 0.92,
139
+ "bleu_at_10": 0.65,
140
+ "total_evaluations": 100
141
+ }
142
  ```
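A small helper can produce such a file by writing one JSON object per line. The record below reuses a subset of the fields from the example above; the helper itself is only a sketch and not part of the benchmark's tooling:

```python
import json

def write_jsonl(records: list[dict], path: str) -> None:
    """Write one JSON object per line, matching the submission format."""
    with open(path, "w", encoding="utf-8") as f:
        for record in records:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")

# Only some of the fields from the example above are shown here.
records = [{
    "model_name": "your-model-name",
    "programming_language": "python",
    "comment_language": "en",
    "pass_at_1": 0.75,
    "pass_at_5": 0.88,
    "pass_at_10": 0.92,
    "bleu_at_10": 0.65,
    "total_evaluations": 100,
}]
write_jsonl(records, "submission.jsonl")
```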
143
 
144
+ ## Environment Variables
145
 
146
+ Set the following environment variables:
147
 
148
+ ```bash
149
+ HF_TOKEN=your_huggingface_token
150
+ OWNER=your-organization
151
+ RESULTS_DATASET_ID=your-org/codereview-bench-results
152
+ ```
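The application reads these values through `src/envs.py` (see the imports in `app.py`). Below is a minimal sketch of how such a module typically surfaces them with `os.getenv`; the actual contents and defaults of `src/envs.py` may differ:

```python
# Sketch only: how settings like these are commonly exposed to the app.
# The repository's real src/envs.py may name or default them differently.
import os

TOKEN = os.getenv("HF_TOKEN")                    # write token for the results dataset
OWNER = os.getenv("OWNER", "your-organization")  # user or org owning the dataset
RESULTS_DATASET_ID = os.getenv(
    "RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results"
)

if TOKEN is None:
    raise RuntimeError("HF_TOKEN must be set before launching the app")
```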
153
 
154
+ ## Citation
155
+
156
+ ```bibtex
157
+ @misc{codereviewbench2025,
158
+ author = {CodeReview Bench Team},
159
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
160
+ year = {2025},
161
+ publisher = {GitHub},
162
+ journal = {GitHub repository},
163
+ howpublished = {\url{https://github.com/your-org/codereview-bench}}
164
+ }
165
+ ```
app.py CHANGED
@@ -1,365 +1,1254 @@
1
  """
2
- CodeReview Leaderboard - Inspired by circle-guard-bench
3
- A comprehensive leaderboard for code review generation models
4
  """
5
 
 
 
 
 
6
  import gradio as gr
7
- from typing import List, Dict, Any
8
- from datetime import datetime, timezone
 
 
 
 
 
9
 
10
- # Import our modules
11
- from src.envs import (
12
- PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES,
13
- MAIN_HEADERS, QUALITY_HEADERS
 
 
 
14
  )
15
- from src.about import TITLE, INTRODUCTION_TEXT
16
- from src.display.css_html_js import DARK_THEME_CSS, CUSTOM_JS, HEADER_HTML, FOOTER_HTML
17
  from src.display.utils import (
18
- get_main_leaderboard_data, get_quality_metrics_data,
19
- get_submission_history_data, get_statistics_summary
20
  )
21
- from src.leaderboard.processor import LeaderboardProcessor
22
- from src.submission.submit import SubmissionHandler
23
-
24
- # Initialize processors
25
- processor = LeaderboardProcessor()
26
- submission_handler = SubmissionHandler()
27
-
28
- # Global state
29
- current_filters = {
30
- "programming_language": "All",
31
- "comment_language": "All",
32
- "taxonomy_category": "All"
33
- }
34
 
35
- def update_leaderboard_tables(
36
- programming_language: str = "All",
37
- comment_language: str = "All",
38
- taxonomy_category: str = "All"
39
- ):
40
- """Update leaderboard tables with filters"""
41
- global current_filters
42
- current_filters = {
43
- "programming_language": programming_language,
44
- "comment_language": comment_language,
45
- "taxonomy_category": taxonomy_category
46
  }
47
-
48
- # Load current data
49
- data = processor.load_leaderboard_data()
50
-
51
- # Get filtered tables
52
- main_table = get_main_leaderboard_data(
53
- data, programming_language, comment_language, taxonomy_category
 
 
 
 
54
  )
55
-
56
- quality_table = get_quality_metrics_data(
57
- data, programming_language, comment_language, taxonomy_category
 
 
 
 
 
 
58
  )
59
-
60
- # Get statistics
61
- stats = get_statistics_summary(data)
62
-
63
- # Format statistics display
64
- stats_text = f"""
65
- ## 📊 Current Statistics
66
- - **Total Models**: {stats['total_models']}
67
- - **Total Submissions**: {stats['total_submissions']}
68
- - **Average Pass@1**: {stats['avg_pass_1']:.3f}
69
- - **Best Model**: {stats['best_model']}
70
- - **Languages Covered**: {stats['languages_covered']}
71
- - **Categories Covered**: {stats['categories_covered']}
72
  """
73
-
74
- return main_table, quality_table, stats_text
75
-
76
- def refresh_data():
77
- """Refresh all data from storage"""
78
- return update_leaderboard_tables(
79
- current_filters["programming_language"],
80
- current_filters["comment_language"],
81
- current_filters["taxonomy_category"]
82
- )
83
 
84
- def handle_submission(
85
- request: gr.Request,
86
- *args
87
  ):
88
- """Handle model submission"""
89
- # Get current data
90
- current_data = processor.load_leaderboard_data()
91
-
92
- # Call submission handler
93
- result = submission_handler.submit_model(request, current_data, *args)
94
-
95
- # If submission was successful, refresh tables
96
- if result[0] != current_data: # Data was updated
97
- main_table, quality_table, stats_text = update_leaderboard_tables(
98
- current_filters["programming_language"],
99
- current_filters["comment_language"],
100
- current_filters["taxonomy_category"]
101
  )
102
- return result[0], main_table, quality_table, result[3], stats_text
103
  else:
104
- return result[0], result[1], result[2], result[3], None
105
-
106
- # Create the Gradio interface
107
- with gr.Blocks(
108
- theme=gr.themes.Base(),
109
- css=DARK_THEME_CSS,
110
- js=CUSTOM_JS,
111
- title=TITLE,
112
- head="<meta name='viewport' content='width=device-width, initial-scale=1'>"
113
- ) as demo:
114
-
115
- # Header
116
- gr.HTML(HEADER_HTML)
117
-
118
- # State to store leaderboard data
119
- leaderboard_state = gr.State(value=processor.load_leaderboard_data())
120
-
121
- # Main content tabs
122
- with gr.Tabs():
123
-
124
- # Leaderboard Tab
125
- with gr.Tab("🏆 Leaderboard"):
126
-
127
- # Filters
128
- with gr.Row():
129
- prog_lang_filter = gr.Dropdown(
130
- choices=PROGRAMMING_LANGUAGES,
131
- value="All",
132
- label="🔍 Programming Language",
133
- info="Filter by programming language"
134
- )
135
- comment_lang_filter = gr.Dropdown(
136
- choices=COMMENT_LANGUAGES,
137
- value="All",
138
- label="🌍 Comment Language",
139
- info="Filter by comment language"
140
- )
141
- taxonomy_filter = gr.Dropdown(
142
- choices=TAXONOMY_CATEGORIES,
143
- value="All",
144
- label="🏷️ Taxonomy Category",
145
- info="Filter by review category"
146
  )
147
- refresh_btn = gr.Button("🔄 Refresh", variant="secondary")
148
-
149
- # Statistics
150
- stats_display = gr.Markdown("")
151
-
152
- # Main leaderboard table
153
- with gr.Row():
154
- main_leaderboard = gr.Dataframe(
155
- headers=MAIN_HEADERS,
156
- label="🏅 Main Leaderboard",
157
- interactive=False,
158
- wrap=True,
159
- max_height=600
160
  )
161
-
162
- # Quality metrics table
163
- with gr.Row():
164
- quality_metrics = gr.Dataframe(
165
- headers=QUALITY_HEADERS,
166
- label="📊 Quality Metrics",
167
- interactive=False,
168
- wrap=True,
169
- max_height=600
 
 
 
 
 
170
  )
171
-
172
- # Submission Tab
173
- with gr.Tab("📝 Submit Model"):
174
-
175
- # Create submission form
176
- form_components = submission_handler.get_submission_form_components()
177
-
178
- # Connect submission handler
179
- form_components["submit_btn"].click(
180
- fn=handle_submission,
181
- inputs=[
182
- leaderboard_state,
183
- form_components["model_name"],
184
- form_components["programming_language"],
185
- form_components["comment_language"],
186
- form_components["taxonomy_category"],
187
- form_components["bleu"],
188
- form_components["pass1"],
189
- form_components["pass5"],
190
- form_components["pass10"],
191
- form_components["readability"],
192
- form_components["relevance"],
193
- form_components["explanation_clarity"],
194
- form_components["problem_identification"],
195
- form_components["actionability"],
196
- form_components["completeness"],
197
- form_components["specificity"],
198
- form_components["contextual_adequacy"],
199
- form_components["consistency"],
200
- form_components["brevity"],
201
- ],
202
- outputs=[
203
- leaderboard_state,
204
- main_leaderboard,
205
- quality_metrics,
206
- form_components["status_msg"],
207
- stats_display
208
- ]
209
- )
210
-
211
- # Analytics Tab
212
- with gr.Tab("📈 Analytics"):
213
-
214
- with gr.Row():
215
- analytics_prog_lang = gr.Dropdown(
216
- choices=PROGRAMMING_LANGUAGES,
217
- value="All",
218
- label="Programming Language"
219
  )
220
- analytics_comment_lang = gr.Dropdown(
221
- choices=COMMENT_LANGUAGES,
222
- value="All",
223
- label="Comment Language"
224
  )
225
- analytics_taxonomy = gr.Dropdown(
226
- choices=TAXONOMY_CATEGORIES,
227
- value="All",
228
- label="Taxonomy Category"
229
  )
230
-
231
- # Submission history
232
- submission_history = gr.Dataframe(
233
- headers=["Model", "Programming Language", "Comment Language", "Taxonomy", "Pass@1", "Date", "IP"],
234
- label="📋 Recent Submissions",
235
- interactive=False,
236
- max_height=400
237
- )
238
-
239
- # Language performance analysis
240
- with gr.Row():
241
- with gr.Column():
242
- gr.Markdown("### 🗣️ Language Performance Analysis")
243
- language_analysis = gr.Dataframe(
244
- headers=["Language", "Avg Pass@1", "Model Count", "Best Model"],
245
- label="Programming Language Performance",
246
- interactive=False
247
  )
248
-
249
- with gr.Column():
250
- gr.Markdown("### 🏷️ Category Performance Analysis")
251
- category_analysis = gr.Dataframe(
252
- headers=["Category", "Avg Pass@1", "Model Count", "Best Model"],
253
- label="Taxonomy Category Performance",
254
- interactive=False
 
 
 
 
 
255
  )
256
-
257
- # About Tab
258
- with gr.Tab("ℹ️ About"):
259
- gr.Markdown(INTRODUCTION_TEXT)
260
-
261
- # Export functionality
262
- with gr.Row():
263
- export_format = gr.Dropdown(
264
- choices=["JSON", "CSV"],
265
- value="JSON",
266
- label="Export Format"
267
  )
268
- export_btn = gr.Button("📥 Export Data")
269
-
270
- export_output = gr.Textbox(
271
- label="Export Output",
272
- lines=10,
273
- max_lines=20,
274
- show_copy_button=True
275
- )
276
-
277
- # Footer
278
- gr.HTML(FOOTER_HTML)
279
-
280
- # Initialize with data
281
- initial_main, initial_quality, initial_stats = update_leaderboard_tables()
282
-
283
- # Update tables when filters change
284
- filter_inputs = [prog_lang_filter, comment_lang_filter, taxonomy_filter]
285
- filter_outputs = [main_leaderboard, quality_metrics, stats_display]
286
-
287
- for filter_input in filter_inputs:
288
- filter_input.change(
289
- fn=update_leaderboard_tables,
290
- inputs=filter_inputs,
291
- outputs=filter_outputs
292
- )
293
-
294
- # Refresh button
295
- refresh_btn.click(
296
- fn=refresh_data,
297
- outputs=filter_outputs
298
- )
299
-
300
- # Analytics updates
301
- analytics_inputs = [analytics_prog_lang, analytics_comment_lang, analytics_taxonomy]
302
-
303
- def update_analytics(prog_lang, comment_lang, taxonomy):
304
- """Update analytics tables"""
305
- data = processor.load_leaderboard_data()
306
-
307
- # Get submission history
308
- history = get_submission_history_data(data, prog_lang, comment_lang, taxonomy)
309
-
310
- # Get language performance
311
- lang_perf = []
312
- for lang in PROGRAMMING_LANGUAGES[1:]:
313
- lang_data = [d for d in data if d.get("programming_language") == lang]
314
- if lang_data:
315
- avg_score = sum(d.get("llm_pass_1", 0) for d in lang_data) / len(lang_data)
316
- best_model = max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
317
- lang_perf.append([lang, f"{avg_score:.3f}", len(lang_data), best_model])
318
-
319
- # Get category performance
320
- cat_perf = []
321
- for cat in TAXONOMY_CATEGORIES[1:]:
322
- cat_data = [d for d in data if d.get("taxonomy_category") == cat]
323
- if cat_data:
324
- avg_score = sum(d.get("llm_pass_1", 0) for d in cat_data) / len(cat_data)
325
- best_model = max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
326
- cat_perf.append([cat, f"{avg_score:.3f}", len(cat_data), best_model])
327
-
328
- return history, lang_perf, cat_perf
329
-
330
- for analytics_input in analytics_inputs:
331
- analytics_input.change(
332
- fn=update_analytics,
333
- inputs=analytics_inputs,
334
- outputs=[submission_history, language_analysis, category_analysis]
335
- )
336
-
337
- # Export functionality
338
- def export_data(format_type):
339
- """Export leaderboard data"""
340
- return processor.export_data(format_type.lower())
341
-
342
- export_btn.click(
343
- fn=export_data,
344
- inputs=[export_format],
345
- outputs=[export_output]
346
- )
347
-
348
- # Set initial values
349
- demo.load(
350
- fn=lambda: (initial_main, initial_quality, initial_stats),
351
- outputs=[main_leaderboard, quality_metrics, stats_display]
352
- )
353
 
354
- # Launch configuration
355
- if __name__ == "__main__":
356
- demo.queue(max_size=20).launch(
357
- server_name="0.0.0.0",
358
- server_port=7860,
359
- share=False,
360
- show_error=True,
361
- debug=True
362
  )
363
 
364
- # For deployment (HuggingFace Spaces, etc.)
365
- app = demo
1
  """
2
+ CodeReview Bench Leaderboard Application
 
3
  """
4
 
5
+ import os
6
+ import json
7
+ import tempfile
8
+ import logging
9
  import gradio as gr
10
+ import pandas as pd
11
+ import plotly.express as px
12
+ import plotly.graph_objects as go
13
+ from apscheduler.schedulers.background import BackgroundScheduler
14
+ import numpy as np
15
+ from gradio.themes.utils import fonts, colors
16
+ from dataclasses import fields, dataclass
17
 
18
+ from src.about import (
19
+ CITATION_BUTTON_LABEL,
20
+ CITATION_BUTTON_TEXT,
21
+ EVALUATION_QUEUE_TEXT,
22
+ INTRODUCTION_TEXT,
23
+ LLM_BENCHMARKS_TEXT,
24
+ TITLE,
25
  )
26
+ from src.display.css_html_js import custom_css
 
27
  from src.display.utils import (
28
+ CODEREVIEW_COLUMN,
29
+ DISPLAY_COLS,
30
+ METRIC_COLS,
31
+ HIDDEN_COLS,
32
+ NEVER_HIDDEN_COLS,
33
+ CATEGORIES,
34
+ COMMENT_LANGUAGES,
35
+ EXAMPLE_CATEGORIES,
36
+ ModelType,
37
+ Mode,
38
+ Precision,
39
+ WeightType,
40
+ ReviewModelType,
41
+ get_all_column_choices,
42
+ get_default_visible_columns,
43
  )
44
+ from src.display.formatting import styled_message, styled_error, styled_warning
45
+ from src.envs import (
46
+ ADMIN_USERNAME,
47
+ ADMIN_PASSWORD,
48
+ RESULTS_DATASET_ID,
49
+ SUBMITTER_TOKEN,
50
+ TOKEN,
51
+ DATA_PATH,
52
+ )
53
+ from src.populate import get_leaderboard_df, get_category_leaderboard_df
54
+ from src.submission.submit import process_submission
 
 
55
 
56
+ # Configure logging
57
+ logging.basicConfig(
58
+ level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
59
+ )
60
+ logger = logging.getLogger(__name__)
61
+
62
+ # Ensure data directory exists
63
+ os.makedirs(DATA_PATH, exist_ok=True)
64
+
65
+ # Available benchmark versions
66
+ BENCHMARK_VERSIONS = ["v0"]
67
+ CURRENT_VERSION = "v0"
68
+
69
+ # Initialize leaderboard data
70
+ try:
71
+ logger.info("Initializing leaderboard data...")
72
+ LEADERBOARD_DF = get_leaderboard_df(version=CURRENT_VERSION)
73
+ logger.info(f"Loaded leaderboard with {len(LEADERBOARD_DF)} entries")
74
+ except Exception as e:
75
+ logger.error(f"Error loading leaderboard data: {e}")
76
+ LEADERBOARD_DF = pd.DataFrame()
77
+
78
+ custom_theme = gr.themes.Default(
79
+ primary_hue=colors.slate,
80
+ secondary_hue=colors.slate,
81
+ neutral_hue=colors.neutral,
82
+ font=(fonts.GoogleFont("Inter"), "sans-serif"),
83
+ ).set(
84
+ # font_size="16px",
85
+ body_background_fill="#0f0f10",
86
+ body_background_fill_dark="#0f0f10",
87
+ body_text_color="#f4f4f5",
88
+ body_text_color_subdued="#a1a1aa",
89
+ block_background_fill="#1e1e1e", # Cooler Grey
90
+ block_border_color="#333333", # Cooler Grey
91
+ block_shadow="none",
92
+ # Swapped primary and secondary button styles
93
+ button_primary_background_fill="#121212", # Changed to specific color for Refresh button
94
+ button_primary_text_color="#f4f4f5",
95
+ button_primary_border_color="#333333", # Keep border grey or change to #121212?
96
+ button_secondary_background_fill="#f4f4f5",
97
+ button_secondary_text_color="#0f0f10",
98
+ button_secondary_border_color="#f4f4f5",
99
+ input_background_fill="#1e1e1e", # Cooler Grey
100
+ input_border_color="#333333", # Cooler Grey
101
+ input_placeholder_color="#71717a",
102
+ table_border_color="#333333", # Cooler Grey
103
+ table_even_background_fill="#2d2d2d", # Cooler Grey (Slightly lighter)
104
+ table_odd_background_fill="#1e1e1e", # Cooler Grey
105
+ table_text_color="#f4f4f5",
106
+ link_text_color="#ffffff",
107
+ border_color_primary="#333333", # Cooler Grey
108
+ background_fill_secondary="#333333", # Cooler Grey
109
+ color_accent="#f4f4f5",
110
+ border_color_accent="#333333", # Cooler Grey
111
+ button_primary_background_fill_hover="#424242", # Cooler Grey
112
+ block_title_text_color="#f4f4f5",
113
+ accordion_text_color="#f4f4f5",
114
+ panel_background_fill="#1e1e1e", # Cooler Grey
115
+ panel_border_color="#333333", # Cooler Grey
116
+ # Explicitly setting primary/secondary/accent colors/borders
117
+ background_fill_primary="#0f0f10",
118
+ background_fill_primary_dark="#0f0f10",
119
+ background_fill_secondary_dark="#333333", # Cooler Grey
120
+ border_color_primary_dark="#333333", # Cooler Grey
121
+ border_color_accent_dark="#333333", # Cooler Grey
122
+ border_color_accent_subdued="#424242", # Cooler Grey
123
+ border_color_accent_subdued_dark="#424242", # Cooler Grey
124
+ color_accent_soft="#a1a1aa",
125
+ color_accent_soft_dark="#a1a1aa",
126
+ # Explicitly setting input hover/focus states
127
+ input_background_fill_dark="#1e1e1e", # Cooler Grey
128
+ input_background_fill_focus="#424242", # Cooler Grey
129
+ input_background_fill_focus_dark="#424242", # Cooler Grey
130
+ input_background_fill_hover="#2d2d2d", # Cooler Grey
131
+ input_background_fill_hover_dark="#2d2d2d", # Cooler Grey
132
+ input_border_color_dark="#333333", # Cooler Grey
133
+ input_border_color_focus="#f4f4f5",
134
+ input_border_color_focus_dark="#f4f4f5",
135
+ input_border_color_hover="#424242", # Cooler Grey
136
+ input_border_color_hover_dark="#424242", # Cooler Grey
137
+ input_placeholder_color_dark="#71717a",
138
+ # Explicitly set dark variants for table backgrounds
139
+ table_even_background_fill_dark="#2d2d2d", # Cooler Grey
140
+ table_odd_background_fill_dark="#1e1e1e", # Cooler Grey
141
+ # Explicitly set dark text variants
142
+ body_text_color_dark="#f4f4f5",
143
+ body_text_color_subdued_dark="#a1a1aa",
144
+ block_title_text_color_dark="#f4f4f5",
145
+ accordion_text_color_dark="#f4f4f5",
146
+ table_text_color_dark="#f4f4f5",
147
+ # Explicitly set dark panel/block variants
148
+ panel_background_fill_dark="#1e1e1e", # Cooler Grey
149
+ panel_border_color_dark="#333333", # Cooler Grey
150
+ block_background_fill_dark="#1e1e1e", # Cooler Grey
151
+ block_border_color_dark="#333333", # Cooler Grey
152
+ )
153
+
154
+
155
+ @dataclass
156
+ class ColumnInfo:
157
+ """Information about a column in the leaderboard."""
158
+
159
+ name: str
160
+ display_name: str
161
+ type: str = "text"
162
+ hidden: bool = False
163
+ never_hidden: bool = False
164
+ displayed_by_default: bool = True
165
+
166
+
167
+ def update_column_choices(df):
168
+ """Update column choices based on what's actually in the dataframe"""
169
+ if df is None or df.empty:
170
+ return get_all_column_choices()
171
+
172
+ # Get columns that actually exist in the dataframe
173
+ existing_columns = list(df.columns)
174
+
175
+ # Get all possible columns with their display names
176
+ all_columns = get_all_column_choices()
177
+
178
+ # Filter to only include columns that exist in the dataframe
179
+ valid_columns = [
180
+ (col_name, display_name)
181
+ for col_name, display_name in all_columns
182
+ if col_name in existing_columns
183
+ ]
184
+
185
+ # Return default if there are no valid columns
186
+ if not valid_columns:
187
+ return get_all_column_choices()
188
+
189
+ return valid_columns
190
+
191
+
192
+ # Update the column_selector initialization
193
+ def get_initial_columns():
194
+ """Get initial columns to show in the dropdown"""
195
+ try:
196
+ # Get available columns in the main dataframe
197
+ available_cols = list(LEADERBOARD_DF.columns)
198
+ logger.info(f"Available columns in LEADERBOARD_DF: {available_cols}")
199
+
200
+ # If dataframe is empty, use default visible columns
201
+ if not available_cols:
202
+ return get_default_visible_columns()
203
+
204
+ # Get default visible columns that actually exist in the dataframe
205
+ valid_defaults = [
206
+ col for col in get_default_visible_columns() if col in available_cols
207
+ ]
208
+
209
+ # If none of the defaults exist, return all available columns
210
+ if not valid_defaults:
211
+ return available_cols
212
+
213
+ return valid_defaults
214
+ except Exception as e:
215
+ logger.error(f"Error getting initial columns: {e}")
216
+ return get_default_visible_columns()
217
+
218
+
219
+ def init_leaderboard(dataframe, visible_columns=None):
220
+ """
221
+ Initialize a standard Gradio Dataframe component for the leaderboard.
222
+ """
223
+ if dataframe is None or dataframe.empty:
224
+ # Create an empty dataframe with the right columns
225
+ columns = [getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS]
226
+ dataframe = pd.DataFrame(columns=columns)
227
+ logger.warning("Initializing empty leaderboard")
228
+
229
+ # Lowercase model_name for display
230
+ if "model_name" in dataframe.columns:
231
+ dataframe = dataframe.copy()
232
+ dataframe["model_name"] = dataframe["model_name"].str.lower()
233
+
234
+ if "model_type" in dataframe.columns:
235
+ dataframe = dataframe.copy()
236
+ dataframe["model_type"] = dataframe["model_type"].str.replace(" : ", "-")
237
+
238
+ if "review_model_type" in dataframe.columns:
239
+ dataframe = dataframe.copy()
240
+ dataframe["review_model_type"] = dataframe["review_model_type"].str.replace("custom", "custom")
241
+
242
+ # print("\n\n", "dataframe", dataframe, "--------------------------------\n\n")
243
+
244
+ # Determine which columns to display
245
+ display_column_names = [
246
+ getattr(CODEREVIEW_COLUMN, col).name for col in DISPLAY_COLS
247
+ ]
248
+ hidden_column_names = [getattr(CODEREVIEW_COLUMN, col).name for col in HIDDEN_COLS]
249
+
250
+ # Columns that should always be shown
251
+ always_visible = [getattr(CODEREVIEW_COLUMN, col).name for col in NEVER_HIDDEN_COLS]
252
+
253
+ # Use provided visible columns if specified, otherwise use default
254
+ if visible_columns is None:
255
+ # Determine which columns to show initially
256
+ visible_columns = [
257
+ col for col in display_column_names if col not in hidden_column_names
258
+ ]
259
+
260
+ # Always include the never-hidden columns
261
+ for col in always_visible:
262
+ if col not in visible_columns and col in dataframe.columns:
263
+ visible_columns.append(col)
264
+
265
+ # Make sure we only include columns that actually exist in the dataframe
266
+ visible_columns = [col for col in visible_columns if col in dataframe.columns]
267
+
268
+ # Map CodeReview column types to Gradio's expected datatype strings
269
+ # Valid Gradio datatypes are: 'str', 'number', 'bool', 'date', 'markdown', 'html', 'image'
270
+ type_mapping = {
271
+ "text": "str",
272
+ "number": "number",
273
+ "bool": "bool",
274
+ "date": "date",
275
+ "markdown": "markdown",
276
+ "html": "html",
277
+ "image": "image",
278
+ }
279
+
280
+ # Create a list of datatypes in the format Gradio expects
281
+ datatypes = []
282
+ for col in visible_columns:
283
+ # Find the corresponding CODEREVIEW_COLUMN entry
284
+ col_type = None
285
+ for display_col in DISPLAY_COLS:
286
+ if getattr(CODEREVIEW_COLUMN, display_col).name == col:
287
+ orig_type = getattr(CODEREVIEW_COLUMN, display_col).type
288
+ # Map to Gradio's expected types
289
+ col_type = type_mapping.get(orig_type, "str")
290
+ break
291
+
292
+ # Default to 'str' if type not found or not mappable
293
+ if col_type is None:
294
+ col_type = "str"
295
+
296
+ datatypes.append(col_type)
297
+
298
+ # Create a dummy column for search functionality if it doesn't exist
299
+ if "search_dummy" not in dataframe.columns:
300
+ dataframe["search_dummy"] = dataframe.apply(
301
+ lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
302
+ axis=1,
303
+ )
304
+
305
+ # Select only the visible columns for display
306
+ visible_columns.remove("model_name")
307
+
308
+ visible_columns = ["model_name"] + visible_columns
309
+ display_df = dataframe[visible_columns].copy()
310
+
311
+ # print(f"--- DataFrame inside init_leaderboard (before rounding) ---")
312
+ # print(display_df[['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']].head() if all(c in display_df.columns for c in ['model_name', 'macro_accuracy', 'macro_recall', 'total_evals_count']) else "Relevant columns not present")
313
+ # print(f"-------------------------------------------------------------")
314
+
315
+ # Round numeric columns to 3 decimal places for display
316
+ numeric_cols = display_df.select_dtypes(include=np.number).columns
317
+ for col in numeric_cols:
318
+ # Avoid rounding integer columns like counts
319
+ if not pd.api.types.is_integer_dtype(display_df[col]):
320
+ # Format floats to exactly 3 decimal places, preserving trailing zeros
321
+ display_df[col] = display_df[col].apply(
322
+ lambda x: f"{x:.3f}" if pd.notna(x) else None
323
+ )
324
+
325
+ column_info_map = {
326
+ f.name: getattr(CODEREVIEW_COLUMN, f.name) for f in fields(CODEREVIEW_COLUMN)
327
  }
328
+ column_mapping = {
329
+ col: column_info_map.get(col, ColumnInfo(col, col)).display_name
330
+ for col in visible_columns
331
+ }
332
+
333
+ # Rename columns in the DataFrame
334
+ display_df.rename(columns=column_mapping, inplace=True)
335
+
336
+ # Apply styling - note: styling might need adjustment if it relies on column names
337
+ styler = display_df.style.set_properties(**{"text-align": "right"}).set_properties(
338
+ subset=["Model"], **{"width": "200px"}
339
  )
340
+
341
+ return gr.Dataframe(
342
+ value=styler,
343
+ datatype=datatypes,
344
+ interactive=False,
345
+ wrap=True,
346
+ height=2500,
347
+ elem_id="leaderboard-table",
348
+ row_count=len(display_df),
349
  )
350
+
351
+
352
+ def search_filter_leaderboard(
353
+ df, search_query="", model_types=None, version=CURRENT_VERSION
354
+ ):
 
 
 
 
 
 
 
 
355
  """
356
+ Filter the leaderboard based on search query and model types.
357
+ """
358
+ if df is None or df.empty:
359
+ return df
 
 
 
 
 
 
360
 
361
+ filtered_df = df.copy()
362
+
363
+ # Add search dummy column if it doesn't exist
364
+ if "search_dummy" not in filtered_df.columns:
365
+ filtered_df["search_dummy"] = filtered_df.apply(
366
+ lambda row: " ".join(str(val) for val in row.values if pd.notna(val)),
367
+ axis=1,
368
+ )
369
+
370
+ # Apply model type filter
371
+ if model_types and len(model_types) > 0:
372
+ filtered_df = filtered_df[
373
+ filtered_df[CODEREVIEW_COLUMN.model_type.name].isin(model_types)
374
+ ]
375
+
376
+ # Apply search query
377
+ if search_query:
378
+ search_terms = [
379
+ term.strip() for term in search_query.split(";") if term.strip()
380
+ ]
381
+ if search_terms:
382
+ combined_mask = None
383
+ for term in search_terms:
384
+ mask = filtered_df["search_dummy"].str.contains(
385
+ term, case=False, na=False
386
+ )
387
+ if combined_mask is None:
388
+ combined_mask = mask
389
+ else:
390
+ combined_mask = combined_mask | mask
391
+
392
+ if combined_mask is not None:
393
+ filtered_df = filtered_df[combined_mask]
394
+
395
+ # Drop the search dummy column before returning
396
+ visible_columns = [col for col in filtered_df.columns if col != "search_dummy"]
397
+ return filtered_df[visible_columns]
398
+
399
+
400
+ def refresh_data_with_filters(
401
+ version=CURRENT_VERSION, search_query="", model_types=None, selected_columns=None
402
  ):
403
+ """
404
+ Refresh the leaderboard data and update all components with filtering.
405
+ Ensures we handle cases where dataframes might have limited columns.
406
+ """
407
+ global LEADERBOARD_DF
408
+ try:
409
+ logger.info(f"Performing refresh of leaderboard data with filters...")
410
+ # Get new data
411
+ main_df = get_leaderboard_df(version=version)
412
+ LEADERBOARD_DF = main_df
413
+ category_dfs = [
414
+ get_category_leaderboard_df(category, version=version)
415
+ for category in CATEGORIES
416
+ ]
417
+ selected_columns = [
418
+ x.lower()
419
+ .replace(" ", "_")
420
+ .replace("(", "")
421
+ .replace(")", "")
422
+ .replace("_recall", "_recall_binary")
423
+ .replace("_precision", "_precision_binary")
424
+ for x in selected_columns
425
+ ]
426
+
427
+ # Log the actual columns we have
428
+ logger.info(f"Main dataframe columns: {list(main_df.columns)}")
429
+
430
+ # Apply filters to each dataframe
431
+ filtered_main_df = search_filter_leaderboard(
432
+ main_df, search_query, model_types, version
433
+ )
434
+ filtered_category_dfs = [
435
+ search_filter_leaderboard(df, search_query, model_types, version)
436
+ for df in category_dfs
437
+ ]
438
+
439
+ # Get available columns from the dataframe
440
+ available_columns = list(filtered_main_df.columns)
441
+
442
+ # Filter selected columns to only those available in the data
443
+ if selected_columns:
444
+ # Convert display names to internal names first
445
+ internal_selected_columns = [
446
+ x.lower()
447
+ .replace(" ", "_")
448
+ .replace("(", "")
449
+ .replace(")", "")
450
+ .replace("_recall", "_recall_binary")
451
+ .replace("_precision", "_precision_binary")
452
+ for x in selected_columns
453
+ ]
454
+ valid_selected_columns = [
455
+ col for col in internal_selected_columns if col in available_columns
456
+ ]
457
+ if not valid_selected_columns and "model_name" in available_columns:
458
+ # Fallback if conversion/filtering leads to empty selection
459
+ valid_selected_columns = ["model_name"] + [
460
+ col
461
+ for col in get_default_visible_columns()
462
+ if col in available_columns
463
+ ]
464
+ else:
465
+ # If no columns were selected in the dropdown, use default visible columns that exist
466
+ valid_selected_columns = [
467
+ col for col in get_default_visible_columns() if col in available_columns
468
+ ]
469
+
470
+ # Initialize dataframes for display with valid selected columns
471
+ main_dataframe = init_leaderboard(filtered_main_df, valid_selected_columns)
472
+
473
+ # For category dataframes, get columns that actually exist in each one
474
+ category_dataframes = []
475
+ for df in filtered_category_dfs:
476
+ df_columns = list(df.columns)
477
+ df_valid_columns = [
478
+ col for col in valid_selected_columns if col in df_columns
479
+ ]
480
+ if not df_valid_columns and "model_name" in df_columns:
481
+ df_valid_columns = ["model_name"] + get_default_visible_columns()
482
+ category_dataframes.append(init_leaderboard(df, df_valid_columns))
483
+
484
+ return main_dataframe, *category_dataframes
485
+
486
+ except Exception as e:
487
+ logger.error(f"Error in refresh with filters: {e}")
488
+ # Return the current leaderboards on error
489
+ return leaderboard, *[
490
+ tab.children[0] for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
491
+ ]
492
+
493
+
494
+ def submit_results(
495
+ model_name: str,
496
+ base_model: str,
497
+ revision: str,
498
+ precision: str,
499
+ weight_type: str,
500
+ model_type: str,
501
+ mode: str,
502
+ submission_file: tempfile._TemporaryFileWrapper,
503
+ version: str,
504
+ review_model_type: ReviewModelType,
505
+ ):
506
+ """
507
+ Handle submission of results with model metadata.
508
+ """
509
+ if submission_file is None:
510
+ return styled_error("No submission file provided")
511
+
512
+ if not model_name:
513
+ return styled_error("Model name is required")
514
+
515
+ if not model_type:
516
+ return styled_error("Please select a model type")
517
+
518
+ if not mode:
519
+ return styled_error("Please select an inference mode")
520
+
521
+ file_path = submission_file.name
522
+ logger.info(f"Received submission for model {model_name}: {file_path}")
523
+
524
+ # Add metadata to the submission
525
+ metadata = {
526
+ "model_name": model_name,
527
+ "base_model": base_model,
528
+ "revision": revision if revision else "main",
529
+ "precision": precision,
530
+ "weight_type": weight_type,
531
+ "model_type": model_type,
532
+ "mode": mode,
533
+ "version": version,
534
+ "review_model_type": review_model_type,
535
+ }
536
+
537
+ # Process the submission
538
+ result = process_submission(file_path, metadata, version=version)
539
+
540
+ # Refresh the leaderboard data
541
+ global LEADERBOARD_DF
542
+ try:
543
+ logger.info(
544
+ f"Refreshing leaderboard data after submission for version {version}..."
545
  )
546
+ LEADERBOARD_DF = get_leaderboard_df(version=version)
547
+ logger.info("Refreshed leaderboard data after submission")
548
+ except Exception as e:
549
+ logger.error(f"Error refreshing leaderboard data: {e}")
550
+
551
+ return result
552
+
553
+
554
+ def refresh_data(version=CURRENT_VERSION):
555
+ """
556
+ Refresh the leaderboard data and update all components.
557
+ """
558
+ try:
559
+ logger.info(f"Performing scheduled refresh of leaderboard data...")
560
+ # Get new data
561
+ main_df = get_leaderboard_df(version=version)
562
+ category_dfs = [
563
+ get_category_leaderboard_df(category, version=version)
564
+ for category in CATEGORIES
565
+ ]
566
+
567
+ # For gr.Dataframe, we return the actual dataframes
568
+ return main_df, *category_dfs
569
+
570
+ except Exception as e:
571
+ logger.error(f"Error in scheduled refresh: {e}")
572
+ return None, *[None for _ in CATEGORIES]
573
+
574
+
575
+ def update_leaderboards(version):
576
+ """
577
+ Update all leaderboard components with data for the selected version.
578
+ """
579
+ try:
580
+ new_df = get_leaderboard_df(version=version)
581
+ category_dfs = [
582
+ get_category_leaderboard_df(category, version=version)
583
+ for category in CATEGORIES
584
+ ]
585
+ return new_df, *category_dfs
586
+ except Exception as e:
587
+ logger.error(f"Error updating leaderboards for version {version}: {e}")
588
+ return None, *[None for _ in CATEGORIES]
589
+
590
+
591
+ def create_performance_plot(
592
+ selected_models, category, metric="f1_binary", version=CURRENT_VERSION
593
+ ):
594
+ """
595
+ Create a radar plot comparing model performance for selected models.
596
+ """
597
+ if category == "All Results":
598
+ df = get_leaderboard_df(version=version)
599
  else:
600
+ df = get_category_leaderboard_df(category, version=version)
601
+
602
+ if df.empty:
603
+ return go.Figure()
604
+
605
+ # Lowercase model_name in df and selected_models
606
+ df = df.copy()
607
+ df["model_name"] = df["model_name"].str.lower()
608
+ selected_models = [m.lower() for m in selected_models]
609
+ df = df[df["model_name"].isin(selected_models)]
610
+ metric_cols = [col for col in df.columns if metric in col]
611
+ fig = go.Figure()
612
+ colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
613
+ for idx, model in enumerate(selected_models):
614
+ model_data = df[df["model_name"] == model]
615
+ if not model_data.empty:
616
+ values = model_data[metric_cols].values[0].tolist()
617
+ values = values + [values[0]]
618
+ categories = [col.replace(f"_{metric}", "") for col in metric_cols]
619
+ # Replace 'jailbreaked' with 'jailbroken' in categories
620
+ categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
621
+ categories = categories + [categories[0]]
622
+ fig.add_trace(
623
+ go.Scatterpolar(
624
+ r=values,
625
+ theta=categories,
626
+ name=model,
627
+ line_color=colors[idx % len(colors)],
628
+ fill="toself",
629
  )
630
+ )
631
+ fig.update_layout(
632
+ paper_bgcolor="#000000",
633
+ plot_bgcolor="#000000",
634
+ font={"color": "#ffffff"},
635
+ title={
636
+ "text": f"{category} - {metric.upper()} Score Comparison",
637
+ "font": {"color": "#ffffff", "size": 24},
638
+ },
639
+ polar=dict(
640
+ bgcolor="#000000",
641
+ radialaxis=dict(
642
+ visible=True,
643
+ range=[0, 1],
644
+ gridcolor="#333333",
645
+ linecolor="#333333",
646
+ tickfont={"color": "#ffffff"},
647
+ ),
648
+ angularaxis=dict(
649
+ gridcolor="#333333",
650
+ linecolor="#333333",
651
+ tickfont={"color": "#ffffff"},
652
+ ),
653
+ ),
654
+ height=600,
655
+ showlegend=True,
656
+ legend=dict(
657
+ yanchor="top",
658
+ y=0.99,
659
+ xanchor="right",
660
+ x=0.99,
661
+ bgcolor="rgba(0,0,0,0.5)",
662
+ font={"color": "#ffffff"},
663
+ ),
664
+ )
665
+ return fig
666
+
667
+
668
+ def update_model_choices(version):
669
+ """
670
+ Update the list of available models for the given version.
671
+ """
672
+ df = get_leaderboard_df(version=version)
673
+ if df.empty:
674
+ return []
675
+ return sorted(df["model_name"].str.lower().unique().tolist())
676
+
677
+
678
+ def update_visualization(selected_models, selected_category, selected_metric, version):
679
+ """
680
+ Update the visualization based on user selections.
681
+ """
682
+ if not selected_models:
683
+ return go.Figure()
684
+ return create_performance_plot(
685
+ selected_models, selected_category, selected_metric, version
686
+ )
687
+
688
+
689
+ # Create Gradio app
690
+ demo = gr.Blocks(css=custom_css, theme=custom_theme)
691
+
692
+ CATEGORY_DISPLAY_MAP = {
693
+ "Python": "Python",
694
+ "JavaScript": "JavaScript",
695
+ "Java": "Java",
696
+ "C++": "C++",
697
+ "C#": "C#",
698
+ "TypeScript": "TypeScript",
699
+ "Go": "Go",
700
+ "Rust": "Rust",
701
+ "Swift": "Swift",
702
+ "Kotlin": "Kotlin",
703
+ "Ruby": "Ruby",
704
+ "PHP": "PHP",
705
+ "C": "C",
706
+ "Scala": "Scala",
707
+ "R": "R",
708
+ "Dart": "Dart",
709
+ "Other": "Other"
710
+ }
711
+ # Create reverse mapping for lookups
712
+ CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
713
+
714
+ with demo:
715
+ gr.HTML(TITLE)
716
+ # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
717
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
718
+
719
+ with gr.Row():
720
+ tabs = gr.Tabs(elem_classes="tab-buttons")
721
+
722
+ with tabs:
723
+ with gr.TabItem("Leaderboard", elem_id="codereview-leaderboard-tab", id=0):
724
+ with gr.Row():
725
+ version_selector = gr.Dropdown(
726
+ choices=BENCHMARK_VERSIONS,
727
+ label="Benchmark Version",
728
+ value=CURRENT_VERSION,
729
+ interactive=True,
730
+ elem_classes="version-selector",
731
+ scale=1,
732
+ visible=False,
733
+ )
734
+
735
+ with gr.Row():
736
+ search_input = gr.Textbox(
737
+ placeholder="Search by models (use ; to split)",
738
+ label="Search",
739
+ elem_id="search-bar",
740
+ scale=2,
741
+ )
742
+ model_type_filter = gr.Dropdown(
743
+ choices=[
744
+ t.to_str("-") for t in ModelType if t != ModelType.Unknown and t != ModelType.ClosedSource
745
+ ],
746
+ label="Access Type",
747
+ multiselect=True,
748
+ value=[],
749
+ interactive=True,
750
+ scale=1,
751
+ )
752
+ column_selector = gr.Dropdown(
753
+ choices=get_all_column_choices(),
754
+ label="Columns",
755
+ multiselect=True,
756
+ value=get_initial_columns(),
757
+ interactive=True,
758
+ visible=False,
759
+ scale=1,
760
+ )
761
+ with gr.Row():
762
+ refresh_button = gr.Button(
763
+ "Refresh", scale=0, elem_id="refresh-button"
764
+ )
765
+
766
+ # Create tabs for each category
767
+ with gr.Tabs(elem_classes="category-tabs") as category_tabs:
768
+ # First tab for average metrics across all categories
769
+ with gr.TabItem("All Results", elem_id="overall-tab"):
770
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
771
+
772
+ # Create a tab for each category using display names
773
+ for category in CATEGORIES:
774
+ display_name = CATEGORY_DISPLAY_MAP.get(category, category)
775
+ elem_id = f"category-{display_name.lower().replace(' ', '-').replace('&', 'and')}-tab"
776
+ with gr.TabItem(display_name, elem_id=elem_id):
777
+ category_df = get_category_leaderboard_df(
778
+ category, version=CURRENT_VERSION
779
+ )
780
+ category_leaderboard = init_leaderboard(category_df)
781
+
782
+ # Connect search and filter inputs to update function
783
+ def update_with_search_filters(
784
+ version=CURRENT_VERSION,
785
+ search_query="",
786
+ model_types=None,
787
+ selected_columns=None,
788
+ ):
789
+ """
790
+ Update the leaderboards with search and filter settings.
791
+ """
792
+ return refresh_data_with_filters(
793
+ version, search_query, model_types, selected_columns
794
+ )
795
+
796
+ # Refresh button functionality
797
+ def refresh_and_update(
798
+ version, search_query, model_types, selected_columns
799
+ ):
800
+ """
801
+ Refresh data, update LEADERBOARD_DF, and return updated components.
802
+ """
803
+ global LEADERBOARD_DF
804
+ main_df = get_leaderboard_df(version=version)
805
+ LEADERBOARD_DF = main_df # Update the global DataFrame
806
+ return refresh_data_with_filters(
807
+ version, search_query, model_types, selected_columns
808
+ )
809
+
810
+ refresh_button.click(
811
+ fn=refresh_and_update,
812
+ inputs=[
813
+ version_selector,
814
+ search_input,
815
+ model_type_filter,
816
+ column_selector,
817
+ ],
818
+ outputs=[leaderboard]
819
+ + [
820
+ category_tabs.children[i].children[0]
821
+ for i in range(1, len(CATEGORIES) + 1)
822
+ ],
823
  )
824
+ # Search input functionality
825
+ search_input.change(
826
+ fn=refresh_data_with_filters,
827
+ inputs=[
828
+ version_selector,
829
+ search_input,
830
+ model_type_filter,
831
+ column_selector,
832
+ ],
833
+ outputs=[leaderboard]
834
+ + [
835
+ category_tabs.children[i].children[0]
836
+ for i in range(1, len(CATEGORIES) + 1)
837
+ ],
838
  )
839
+
840
+ # Model type filter functionality
841
+ model_type_filter.change(
842
+ fn=refresh_data_with_filters,
843
+ inputs=[
844
+ version_selector,
845
+ search_input,
846
+ model_type_filter,
847
+ column_selector,
848
+ ],
849
+ outputs=[leaderboard]
850
+ + [
851
+ category_tabs.children[i].children[0]
852
+ for i in range(1, len(CATEGORIES) + 1)
853
+ ],
 
 
 
 
854
  )
855
+
856
+ # Version selector functionality
857
+ version_selector.change(
858
+ fn=refresh_data_with_filters,
859
+ inputs=[
860
+ version_selector,
861
+ search_input,
862
+ model_type_filter,
863
+ column_selector,
864
+ ],
865
+ outputs=[leaderboard]
866
+ + [
867
+ category_tabs.children[i].children[0]
868
+ for i in range(1, len(CATEGORIES) + 1)
869
+ ],
870
  )
871
+
872
+ # Update the update_columns function to handle updating all tabs at once
873
+ def update_columns(selected_columns):
874
+ """
875
+ Update all leaderboards to show the selected columns.
876
+ Ensures all selected columns are preserved in the update.
877
+
878
+ """
879
+
880
+ try:
881
+ logger.info(f"Updating columns to show: {selected_columns}")
882
+
883
+ # If no columns are selected, use default visible columns
884
+ if not selected_columns or len(selected_columns) == 0:
885
+ selected_columns = get_default_visible_columns()
886
+ logger.info(
887
+ f"No columns selected, using defaults: {selected_columns}"
888
+ )
889
+
890
+ # Convert display names to internal names
891
+ internal_selected_columns = [
892
+ x.lower()
893
+ .replace(" ", "_")
894
+ .replace("(", "")
895
+ .replace(")", "")
896
+ .replace("_recall", "_recall_binary")
897
+ .replace("_precision", "_precision_binary")
898
+ for x in selected_columns
899
+ ]
900
+
901
+ # Get the current data with ALL columns preserved
902
+ main_df = get_leaderboard_df(version=version_selector.value)
903
+
904
+ # Get category dataframes with ALL columns preserved
905
+ category_dfs = [
906
+ get_category_leaderboard_df(
907
+ category, version=version_selector.value
908
+ )
909
+ for category in CATEGORIES
910
+ ]
911
+
912
+ # Log columns for debugging
913
+ logger.info(f"Main dataframe columns: {list(main_df.columns)}")
914
+ logger.info(
915
+ f"Selected columns (internal): {internal_selected_columns}"
916
+ )
917
+
918
+ # IMPORTANT: Make sure model_name is always included
919
+ if (
920
+ "model_name" in main_df.columns
921
+ and "model_name" not in internal_selected_columns
922
+ ):
923
+ internal_selected_columns = [
924
+ "model_name"
925
+ ] + internal_selected_columns
926
+
927
+ # Initialize the main leaderboard with the selected columns
928
+ # We're passing the internal_selected_columns directly to preserve the selection
929
+ main_leaderboard = init_leaderboard(
930
+ main_df, internal_selected_columns
931
+ )
932
+
933
+ # Initialize category dataframes with the same selected columns
934
+ # This ensures consistency across all tabs
935
+ category_leaderboards = []
936
+ for df in category_dfs:
937
+ # Use the same selected columns for each category
938
+ # init_leaderboard will automatically handle filtering to columns that exist
939
+ category_leaderboards.append(
940
+ init_leaderboard(df, internal_selected_columns)
941
+ )
942
+
943
+ return main_leaderboard, *category_leaderboards
944
+
945
+ except Exception as e:
946
+ logger.error(f"Error updating columns: {e}")
947
+ import traceback
948
+
949
+ logger.error(traceback.format_exc())
950
+ return leaderboard, *[
951
+ tab.children[0]
952
+ for tab in category_tabs.children[1 : len(CATEGORIES) + 1]
953
+ ]
954
+
955
+ # Connect column selector to update function
956
+ column_selector.change(
957
+ fn=update_columns,
958
+ inputs=[column_selector],
959
+ outputs=[leaderboard]
960
+ + [
961
+ category_tabs.children[i].children[0]
962
+ for i in range(1, len(CATEGORIES) + 1)
963
+ ],
964
  )
965
+
966
+ with gr.TabItem("Visualize", elem_id="codereview-viz-tab", id=1):
967
+ with gr.Row():
968
+ with gr.Column():
969
+ viz_version_selector = gr.Dropdown(
970
+ choices=BENCHMARK_VERSIONS,
971
+ label="Benchmark Version",
972
+ value=CURRENT_VERSION,
973
+ interactive=True,
974
+ visible=False,
975
+ )
976
+
977
+ # New: Mode selector
978
+ def get_model_mode_choices(version):
979
+ df = get_leaderboard_df(version=version)
980
+ if df.empty:
981
+ return []
982
+ return sorted([
983
+ f"{str(row['model_name']).lower()} [{row['mode']}]"
984
+ for _, row in df.drop_duplicates(subset=["model_name", "mode"]).iterrows()
985
+ ])
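+ # The "model [mode]" labels built above are split back apart with
+ # rsplit(" [", 1) in update_visualization_with_mode, so keep the two formats in sync.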
986
+
987
+ model_mode_selector = gr.Dropdown(
988
+ choices=get_model_mode_choices(CURRENT_VERSION),
989
+ label="Select Model(s) [Mode] to Compare",
990
+ multiselect=True,
991
+ interactive=True,
992
+ )
993
+ with gr.Column():
994
+ # Add Overall Performance to categories, use display names
995
+ viz_categories_display = ["All Results"] + [
996
+ CATEGORY_DISPLAY_MAP.get(cat, cat) for cat in CATEGORIES
997
+ ]
998
+ category_selector = gr.Dropdown(
999
+ choices=viz_categories_display,
1000
+ label="Select Category",
1001
+ value=viz_categories_display[0],
1002
+ interactive=True,
1003
+ )
1004
+ metric_selector = gr.Dropdown(
1005
+ choices=[
1006
+ "accuracy",
1007
+ "f1_binary",
1008
+ "precision_binary",
1009
+ "recall_binary",
1010
+ "error_ratio",
1011
+ ],
1012
+ label="Select Metric",
1013
+ value="accuracy",
1014
+ interactive=True,
1015
+ )
1016
+
1017
+ plot_output = gr.Plot()
1018
+
1019
+ # Update visualization when any selector changes
1020
+ def update_visualization_with_mode(
1021
+ selected_model_modes, selected_category, selected_metric, version
1022
+ ):
1023
+ if not selected_model_modes:
1024
+ return go.Figure()
1025
+ df = (
1026
+ get_leaderboard_df(version=version)
1027
+ if selected_category == "All Results"
1028
+ else get_category_leaderboard_df(selected_category, version=version)
1029
  )
1030
+ if df.empty:
1031
+ return go.Figure()
1032
+ df = df.copy()
1033
+ df["model_name"] = df["model_name"].str.lower()
1034
+ selected_pairs = [s.rsplit(" [", 1) for s in selected_model_modes]
1035
+ selected_pairs = [
1036
+ (name.strip().lower(), mode.strip("] "))
1037
+ for name, mode in selected_pairs
1038
+ ]
1039
+ mask = df.apply(
1040
+ lambda row: (row["model_name"], str(row["mode"])) in selected_pairs,
1041
+ axis=1,
1042
  )
1043
+ filtered_df = df[mask]
1044
+ metric_cols = [col for col in filtered_df.columns if selected_metric in col]
1045
+ fig = go.Figure()
1046
+ colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
1047
+ for idx, (model_name, mode) in enumerate(selected_pairs):
1048
+ model_data = filtered_df[
1049
+ (filtered_df["model_name"] == model_name)
1050
+ & (filtered_df["mode"] == mode)
1051
+ ]
1052
+ if not model_data.empty:
1053
+ values = model_data[metric_cols].values[0].tolist()
1054
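+ # Repeat the first value (and, below, the first category) so the radar trace closes its polygon.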
+ values = values + [values[0]]
1055
+ categories = [col.replace(f"_{selected_metric}", "") for col in metric_cols]
1056
+ # Replace 'jailbreaked' with 'jailbroken' in categories
1057
+ categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
1058
+ categories = categories + [categories[0]]
1059
+ fig.add_trace(
1060
+ go.Scatterpolar(
1061
+ r=values,
1062
+ theta=categories,
1063
+ name=f"{model_name} [{mode}]",
1064
+ line_color=colors[idx % len(colors)],
1065
+ fill="toself",
1066
+ )
1067
+ )
1068
+ fig.update_layout(
1069
+ paper_bgcolor="#000000",
1070
+ plot_bgcolor="#000000",
1071
+ font={"color": "#ffffff"},
1072
+ title={
1073
+ "text": f"{selected_category} - {selected_metric.upper()} Score Comparison",
1074
+ "font": {"color": "#ffffff", "size": 24},
1075
+ },
1076
+ polar=dict(
1077
+ bgcolor="#000000",
1078
+ radialaxis=dict(
1079
+ visible=True,
1080
+ range=[0, 1],
1081
+ gridcolor="#333333",
1082
+ linecolor="#333333",
1083
+ tickfont={"color": "#ffffff"},
1084
+ ),
1085
+ angularaxis=dict(
1086
+ gridcolor="#333333",
1087
+ linecolor="#333333",
1088
+ tickfont={"color": "#ffffff"},
1089
+ ),
1090
+ ),
1091
+ height=600,
1092
+ showlegend=True,
1093
+ legend=dict(
1094
+ yanchor="top",
1095
+ y=0.99,
1096
+ xanchor="right",
1097
+ x=0.99,
1098
+ bgcolor="rgba(0,0,0,0.5)",
1099
+ font={"color": "#ffffff"},
1100
+ ),
1101
+ )
1102
+ return fig
1103
+
1104
+ # Connect selectors to update function
1105
+ for control in [
1106
+ viz_version_selector,
1107
+ model_mode_selector,
1108
+ category_selector,
1109
+ metric_selector,
1110
+ ]:
1111
+ control.change(
1112
+ fn=lambda smm, sc, s_metric, v: update_visualization_with_mode(
1113
+ smm, CATEGORY_REVERSE_MAP.get(sc, sc), s_metric, v
1114
+ ),
1115
+ inputs=[
1116
+ model_mode_selector,
1117
+ category_selector,
1118
+ metric_selector,
1119
+ viz_version_selector,
1120
+ ],
1121
+ outputs=plot_output,
1122
+ )
1123
+
1124
+ # Update model_mode_selector choices when version changes
1125
+ viz_version_selector.change(
1126
+ fn=get_model_mode_choices,
1127
+ inputs=[viz_version_selector],
1128
+ outputs=[model_mode_selector],
1129
  )
 
 
 
 
1130
 
1131
+ # with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
1132
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
1133
+
1134
+ with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=3):
1135
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
1136
+
1137
+ with gr.Row():
1138
+ # with gr.Column(scale=3):
1139
+ # gr.Markdown("# ✉️✨ Submit your results here!", elem_classes="markdown-text")
1140
+ with gr.Column(scale=1):
1141
+ # Add version selector specifically for the submission tab
1142
+ submission_version_selector = gr.Dropdown(
1143
+ choices=BENCHMARK_VERSIONS,
1144
+ label="Benchmark Version",
1145
+ value=CURRENT_VERSION,
1146
+ interactive=True,
1147
+ elem_classes="version-selector",
1148
+ visible=False,
1149
+ )
1150
+
1151
+ with gr.Row():
1152
+ with gr.Column():
1153
+ model_name_textbox = gr.Textbox(label="Model name")
1154
+ mode_selector = gr.Dropdown(
1155
+ choices=[m.name for m in Mode],
1156
+ label="Mode",
1157
+ multiselect=False,
1158
+ value=None,
1159
+ interactive=True,
1160
+ )
1161
+ revision_name_textbox = gr.Textbox(
1162
+ label="Revision commit", placeholder="main"
1163
+ )
1164
+ model_type = gr.Dropdown(
1165
+ choices=[
1166
+ t.to_str("-")
1167
+ for t in ModelType
1168
+ if t != ModelType.Unknown and t != ModelType.ClosedSource
1169
+ ],
1170
+ label="Model type",
1171
+ multiselect=False,
1172
+ value=None,
1173
+ interactive=True,
1174
+ )
1175
+ review_model_type = gr.Dropdown(
1176
+ choices=[t.name for t in ReviewModelType],
1177
+ label="Review model type",
1178
+ multiselect=False,
1179
+ value=ReviewModelType.CUSTOM.name,
1180
+ interactive=True,
1181
+ )
1182
+
1183
+ with gr.Column():
1184
+ precision = gr.Dropdown(
1185
+ choices=[
1186
+ i.name for i in Precision if i != Precision.Unknown
1187
+ ],
1188
+ label="Precision",
1189
+ multiselect=False,
1190
+ value="float16",
1191
+ interactive=True,
1192
+ )
1193
+ weight_type = gr.Dropdown(
1194
+ choices=[i.name for i in WeightType],
1195
+ label="Weights type",
1196
+ multiselect=False,
1197
+ value="Original",
1198
+ interactive=True,
1199
+ )
1200
+ base_model_name_textbox = gr.Textbox(
1201
+ label="Base model (for delta or adapter weights)"
1202
+ )
1203
+
1204
+ with gr.Row():
1205
+ file_input = gr.File(
1206
+ label="Upload JSONL Results File", file_types=[".jsonl"]
1207
+ )
1208
+
1209
+ submit_button = gr.Button("Submit Results")
1210
+ result_output = gr.Markdown()
1211
+
1212
+ submit_button.click(
1213
+ fn=submit_results,
1214
+ inputs=[
1215
+ model_name_textbox,
1216
+ base_model_name_textbox,
1217
+ revision_name_textbox,
1218
+ precision,
1219
+ weight_type,
1220
+ model_type,
1221
+ mode_selector,
1222
+ file_input,
1223
+ submission_version_selector,
1224
+ review_model_type,
1225
+ ],
1226
+ outputs=result_output,
1227
+ )
1228
+
1229
+ # Version selector functionality
1230
+ version_selector.change(
1231
+ fn=update_leaderboards,
1232
+ inputs=[version_selector],
1233
+ outputs=[leaderboard]
1234
+ + [
1235
+ category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1236
+ ],
1237
+ ).then(
1238
+ lambda version: refresh_data_with_filters(version),
1239
+ inputs=[version_selector],
1240
+ outputs=[leaderboard]
1241
+ + [
1242
+ category_tabs.children[i].children[0] for i in range(1, len(CATEGORIES) + 1)
1243
+ ],
1244
  )
1245
 
1246
+
1247
+ # Set up the scheduler to refresh data periodically
1248
+ scheduler = BackgroundScheduler()
1249
+ scheduler.add_job(refresh_data, "interval", minutes=30)
1250
+ scheduler.start()
1251
+
1252
+ # Launch the app
1253
+ if __name__ == "__main__":
1254
+ demo.launch()
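+ # launch() starts the Gradio server; on a Hugging Face Space the app module is
+ # typically run as __main__, so this guard brings the UI up there as well.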
example_submission.jsonl ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {"model_name": "GPT-4-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
2
+ {"model_name": "GPT-4-CodeReview", "programming_language": "javascript", "comment_language": "en", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
3
+ {"model_name": "Claude-3-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 8.0, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
4
+ {"model_name": "Llama-CodeReview", "programming_language": "java", "comment_language": "en", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
requirements.txt CHANGED
@@ -1,19 +1,8 @@
1
- APScheduler
2
- black
3
- datasets
4
- gradio>=4.0.0
5
- gradio[oauth]
6
- gradio_leaderboard==0.0.13
7
- gradio_client
8
- huggingface-hub>=0.18.0
9
- matplotlib
10
- numpy
11
- pandas>=1.3.0
12
- python-dateutil
13
- tqdm
14
- transformers
15
- tokenizers>=0.15.0
16
- sentencepiece
17
- fastapi
18
- uvicorn
19
- pydantic>=2.0.0
 
1
+ gradio==4.44.1
2
+ pandas>=2.0.0
3
+ huggingface_hub>=0.20.0
4
+ datasets>=2.0.0
5
+ apscheduler>=3.10.0
6
+ python-dotenv>=1.0.0
7
+ plotly>=5.18.0
8
+ pydantic==2.10.6
 
 
 
src/about.py CHANGED
@@ -1,48 +1,60 @@
1
  """
2
- About page content for CodeReview Leaderboard
3
  """
4
 
5
- TITLE = "🏆 CodeReview Leaderboard"
 
 
 
 
6
 
7
  INTRODUCTION_TEXT = """
8
- # CodeReview Leaderboard
9
-
10
- A comprehensive benchmark for evaluating code review generation models across multiple programming languages and comment types.
11
 
12
- ## Overview
 
 
13
 
14
- This leaderboard tracks the performance of various models on code review tasks, providing insights into:
15
- - **Programming Language Performance**: How well models perform across different programming languages
16
- - **Comment Language Support**: Effectiveness in generating reviews in different natural languages
17
- - **Taxonomy Categories**: Performance across different types of code review feedback
18
 
19
- ## Metrics
 
20
 
21
- - **BLEU**: Measures similarity between generated and reference reviews
22
- - **Pass@1/5/10**: Percentage of reviews that pass quality checks in 1, 5, or 10 attempts
23
- - **Multi-dimensional Quality Scores**: Detailed evaluation across 10 quality dimensions
24
 
25
- ## Features
26
 
27
- **Filter by Programming Language**: View results for specific programming languages (Python, JavaScript, Java, etc.)
28
- ✨ **Comment Language Support**: Filter by the natural language of code comments
29
- ✨ **Taxonomy Categories**: Browse results by review type (bug detection, style, performance, etc.)
30
- ✨ **IP-based Submissions**: Secure submission system with IP tracking
31
- ✨ **Dark Theme**: Modern, eye-friendly interface
32
  """
33
 
34
- SUBMISSION_GUIDELINES = """
35
- ## Submission Guidelines
36
 
37
- 1. **Model Requirements**: Submit results for at least 100 test cases
38
- 2. **Format**: Provide scores in the specified format ranges
39
- 3. **Reproducibility**: Include model details and evaluation setup
40
- 4. **Quality Metrics**: Rate your model across all 10 quality dimensions
41
- 5. **Metadata**: Specify programming language, comment language, and taxonomy focus
42
- """
43
 
44
- CONTACT_INFO = """
45
- ## Contact & Support
 
46
 
47
- For questions, issues, or contributions, please reach out through our repository or contact the maintainers.
48
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Text content for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ TITLE = """
6
+ <div style="text-align: center; margin-bottom: 1rem">
7
+ <h1>CodeReview Bench Leaderboard</h1>
8
+ </div>
9
+ """
10
 
11
  INTRODUCTION_TEXT = """
12
+ ## Introduction
 
 
13
 
14
+ CodeReview Bench is a comprehensive benchmark for evaluating the quality and effectiveness of automated code review systems.
15
+ This leaderboard tracks model performance across various programming languages and review criteria,
16
+ including readability, relevance, explanation clarity, and actionability.
17
 
18
+ Models are evaluated on their ability to provide high-quality code reviews that are helpful,
19
+ accurate, and actionable across multiple programming languages and review categories.
20
+ """
 
21
 
22
+ LLM_BENCHMARKS_TEXT = """
23
+ CodeReview Bench is a comprehensive benchmark for evaluating automated code review systems across programming languages and review quality dimensions.
24
 
25
+ It evaluates models on their ability to provide high-quality code reviews using both LLM-based multimetric evaluation (readability, relevance, explanation clarity, problem identification, actionability, completeness, specificity, contextual adequacy, consistency, brevity) and exact-match metrics (pass@1, pass@5, pass@10, BLEU@10).
 
 
26
 
27
+ The benchmark supports both Russian and English comment languages across 17+ programming languages including Python, JavaScript, Java, C++, TypeScript, Go, Rust, and more.
28
 
29
+ See the Submit tab to add your own model's results to the leaderboard.
 
 
 
 
30
  """
31
 
32
+ EVALUATION_QUEUE_TEXT = """
33
+ ## Submit Your Model
34
 
35
+ To add your model to the CodeReview Bench leaderboard:
 
 
 
 
 
36
 
37
+ 1. Run your evaluation using the CodeReview Bench framework.
38
+ 2. Upload your results in .jsonl format using this form.
39
+ 3. Once validated, your model will appear on the leaderboard.
40
 
41
+ ### Requirements:
42
+ - Results must include all required metrics: the LLM-based multimetric scores and the exact-match metrics (pass@1, pass@5, pass@10, BLEU@10)
43
+ - Submissions should cover multiple programming languages where applicable
44
+ - Both Russian and English comment languages are supported
45
+
46
+ ### ✉️✨ Ready? Upload your results below!
47
+ """
48
+
49
+ CITATION_BUTTON_LABEL = "Cite CodeReview Bench"
50
+
51
+ CITATION_BUTTON_TEXT = """
52
+ @misc{codereviewbench2025,
53
+ author = {CodeReview Bench Team},
54
+ title = {CodeReview Bench: Comprehensive Benchmark for Automated Code Review Systems},
55
+ year = {2025},
56
+ publisher = {GitHub},
57
+ journal = {GitHub repository},
58
+ howpublished = {\\url{https://github.com/your-org/codereview-bench}}
59
+ }
60
+ """
src/display/css_html_js.py CHANGED
@@ -1,306 +1,97 @@
1
  """
2
- Custom CSS, HTML, and JavaScript for the CodeReview Leaderboard
3
  """
4
 
5
- # Dark theme CSS
6
- DARK_THEME_CSS = """
7
- /* Dark Theme Styling */
8
- :root {
9
- --bg-primary: #0d1117;
10
- --bg-secondary: #161b22;
11
- --bg-tertiary: #21262d;
12
- --text-primary: #e6edf3;
13
- --text-secondary: #7d8590;
14
- --border-color: #30363d;
15
- --accent-color: #ffffff;
16
- --accent-hover: #f0f0f0;
17
- --danger-color: #da3633;
18
- --warning-color: #d29922;
19
- --info-color: #1f6feb;
20
  }
21
 
22
- /* Global dark theme */
23
- .gradio-container {
24
- background: var(--bg-primary) !important;
25
- color: var(--text-primary) !important;
26
  }
27
 
28
- /* Headers and text */
29
- .gradio-container h1, .gradio-container h2, .gradio-container h3 {
30
- color: var(--text-primary) !important;
31
  }
32
 
33
- .gradio-container p, .gradio-container span {
34
- color: var(--text-secondary) !important;
35
  }
36
 
37
- /* Tabs */
38
- .gradio-container .tab-nav {
39
- background: var(--bg-secondary) !important;
40
- border-bottom: 1px solid var(--border-color) !important;
 
 
41
  }
42
 
43
- .gradio-container .tab-nav button {
44
- background: transparent !important;
45
- color: var(--text-secondary) !important;
46
- border: none !important;
47
- padding: 12px 24px !important;
48
- transition: all 0.2s ease !important;
49
  }
50
 
51
- .gradio-container .tab-nav button:hover {
52
- color: var(--text-primary) !important;
53
- background: var(--bg-tertiary) !important;
54
  }
55
 
56
- .gradio-container .tab-nav button.selected {
57
- color: var(--text-primary) !important;
58
- background: var(--bg-tertiary) !important;
59
- border-bottom: 2px solid var(--accent-color) !important;
60
  }
61
 
62
- /* Tables */
63
- .gradio-container .dataframe {
64
- background: var(--bg-secondary) !important;
65
- border: 1px solid var(--border-color) !important;
66
- border-radius: 8px !important;
67
- overflow: hidden !important;
68
  }
69
 
70
- .gradio-container .dataframe table {
71
- background: var(--bg-secondary) !important;
 
72
  }
73
 
74
- .gradio-container .dataframe th {
75
- background: var(--bg-tertiary) !important;
76
- color: var(--text-primary) !important;
77
- border-bottom: 2px solid var(--border-color) !important;
78
- padding: 12px !important;
79
- font-weight: 600 !important;
80
  }
81
 
82
- .gradio-container .dataframe td {
83
- background: var(--bg-secondary) !important;
84
- color: var(--text-primary) !important;
85
- border-bottom: 1px solid var(--border-color) !important;
86
- padding: 10px 12px !important;
87
  }
88
 
89
- .gradio-container .dataframe tr:hover td {
90
- background: var(--bg-tertiary) !important;
 
91
  }
92
 
93
- /* Form inputs */
94
- .gradio-container input, .gradio-container select, .gradio-container textarea {
95
- background: var(--bg-tertiary) !important;
96
- color: var(--text-primary) !important;
97
- border: 1px solid var(--border-color) !important;
98
- border-radius: 6px !important;
99
- padding: 8px 12px !important;
 
 
100
  }
101
 
102
- .gradio-container input:focus, .gradio-container select:focus, .gradio-container textarea:focus {
103
- border-color: var(--accent-color) !important;
104
- box-shadow: 0 0 0 2px rgba(255, 255, 255, 0.2) !important;
 
105
  }
106
 
107
- /* Buttons */
108
- .gradio-container button {
109
- background: var(--accent-color) !important;
110
- color: var(--bg-primary) !important;
111
- border: 1px solid var(--border-color) !important;
112
- border-radius: 6px !important;
113
- padding: 8px 16px !important;
114
- font-weight: 500 !important;
115
- transition: all 0.2s ease !important;
116
- }
117
-
118
- .gradio-container button:hover {
119
- background: var(--accent-hover) !important;
120
- transform: translateY(-1px) !important;
121
- color: var(--bg-primary) !important;
122
- }
123
-
124
- .gradio-container button:active {
125
- transform: translateY(0) !important;
126
- }
127
-
128
- /* Dropdowns */
129
- .gradio-container .dropdown {
130
- background: var(--bg-tertiary) !important;
131
- border: 1px solid var(--border-color) !important;
132
- border-radius: 6px !important;
133
- }
134
-
135
- .gradio-container .dropdown-menu {
136
- background: var(--bg-secondary) !important;
137
- border: 1px solid var(--border-color) !important;
138
- border-radius: 6px !important;
139
- box-shadow: 0 8px 32px rgba(0, 0, 0, 0.3) !important;
140
- }
141
-
142
- .gradio-container .dropdown-menu .dropdown-item {
143
- color: var(--text-primary) !important;
144
- padding: 8px 12px !important;
145
- }
146
-
147
- .gradio-container .dropdown-menu .dropdown-item:hover {
148
- background: var(--bg-tertiary) !important;
149
- }
150
-
151
- /* Sliders */
152
- .gradio-container .slider {
153
- background: var(--bg-tertiary) !important;
154
- }
155
-
156
- .gradio-container .slider input[type="range"] {
157
- background: var(--bg-tertiary) !important;
158
- }
159
-
160
- .gradio-container .slider input[type="range"]::-webkit-slider-thumb {
161
- background: var(--accent-color) !important;
162
- border: 2px solid var(--bg-primary) !important;
163
- border-radius: 50% !important;
164
- width: 18px !important;
165
- height: 18px !important;
166
- }
167
-
168
- .gradio-container .slider input[type="range"]::-webkit-slider-track {
169
- background: var(--border-color) !important;
170
- border-radius: 4px !important;
171
- height: 6px !important;
172
- }
173
-
174
- /* Accordions */
175
- .gradio-container .accordion {
176
- background: var(--bg-secondary) !important;
177
- border: 1px solid var(--border-color) !important;
178
- border-radius: 8px !important;
179
- margin: 16px 0 !important;
180
- }
181
-
182
- .gradio-container .accordion-header {
183
- background: var(--bg-tertiary) !important;
184
- color: var(--text-primary) !important;
185
- padding: 16px !important;
186
- border-bottom: 1px solid var(--border-color) !important;
187
- cursor: pointer !important;
188
- font-weight: 500 !important;
189
- }
190
-
191
- .gradio-container .accordion-header:hover {
192
- background: var(--bg-primary) !important;
193
- }
194
-
195
- /* Status messages */
196
- .gradio-container .success {
197
- background: rgba(255, 255, 255, 0.1) !important;
198
- color: var(--text-primary) !important;
199
- border: 1px solid var(--accent-color) !important;
200
- border-radius: 6px !important;
201
- padding: 12px 16px !important;
202
- margin: 8px 0 !important;
203
- }
204
-
205
- .gradio-container .error {
206
- background: rgba(218, 54, 51, 0.1) !important;
207
- color: var(--danger-color) !important;
208
- border: 1px solid var(--danger-color) !important;
209
- border-radius: 6px !important;
210
- padding: 12px 16px !important;
211
- margin: 8px 0 !important;
212
- }
213
-
214
- /* Responsive design */
215
- @media (max-width: 768px) {
216
- .gradio-container {
217
- padding: 16px !important;
218
- }
219
-
220
- .gradio-container .tab-nav button {
221
- padding: 8px 16px !important;
222
- font-size: 14px !important;
223
- }
224
-
225
- .gradio-container .dataframe {
226
- font-size: 14px !important;
227
- }
228
  }
229
  """
230
-
231
- # Custom JavaScript for enhanced functionality
232
- CUSTOM_JS = """
233
- // Enhanced table sorting and filtering
234
- function enhanceTable() {
235
- const tables = document.querySelectorAll('.dataframe table');
236
- tables.forEach(table => {
237
- // Add sorting functionality
238
- const headers = table.querySelectorAll('th');
239
- headers.forEach((header, index) => {
240
- header.style.cursor = 'pointer';
241
- header.addEventListener('click', () => sortTable(table, index));
242
- });
243
- });
244
- }
245
-
246
- function sortTable(table, columnIndex) {
247
- const tbody = table.querySelector('tbody');
248
- const rows = Array.from(tbody.querySelectorAll('tr'));
249
-
250
- rows.sort((a, b) => {
251
- const aText = a.cells[columnIndex].textContent.trim();
252
- const bText = b.cells[columnIndex].textContent.trim();
253
-
254
- // Try to parse as numbers first
255
- const aNum = parseFloat(aText);
256
- const bNum = parseFloat(bText);
257
-
258
- if (!isNaN(aNum) && !isNaN(bNum)) {
259
- return bNum - aNum; // Descending for numbers
260
- }
261
-
262
- return aText.localeCompare(bText); // Ascending for text
263
- });
264
-
265
- rows.forEach(row => tbody.appendChild(row));
266
- }
267
-
268
- // Auto-refresh functionality
269
- function autoRefresh() {
270
- setInterval(() => {
271
- const refreshBtn = document.querySelector('button[aria-label="Refresh"]');
272
- if (refreshBtn) {
273
- refreshBtn.click();
274
- }
275
- }, 30000); // Refresh every 30 seconds
276
- }
277
-
278
- // Initialize enhancements
279
- document.addEventListener('DOMContentLoaded', function() {
280
- enhanceTable();
281
- autoRefresh();
282
- });
283
- """
284
-
285
- # HTML components
286
- HEADER_HTML = """
287
- <div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-bottom: 20px;">
288
- <h1 style="color: var(--text-primary); margin: 0; font-size: 2.5em; font-weight: 700;">
289
- 🏆 CodeReview Leaderboard
290
- </h1>
291
- <p style="color: var(--text-secondary); margin: 10px 0 0 0; font-size: 1.2em;">
292
- Benchmarking code review generation models across languages and categories
293
- </p>
294
- </div>
295
- """
296
-
297
- FOOTER_HTML = """
298
- <div style="text-align: center; padding: 20px; background: var(--bg-secondary); border-radius: 12px; margin-top: 20px;">
299
- <p style="color: var(--text-secondary); margin: 0; font-size: 0.9em;">
300
- Built with ❤️ for the code review community |
301
- <a href="https://github.com/your-repo" style="color: var(--accent-color); text-decoration: none;">
302
- GitHub
303
- </a>
304
- </p>
305
- </div>
306
- """
 
1
  """
2
+ CSS and styling for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ custom_css = """
6
+ .markdown-text {
7
+ font-size: 16px !important;
8
+ text-align: justify !important;
9
+ line-height: 1.0 !important;
10
+ margin-top: 10px !important;
11
+ margin-bottom: 10px !important;
 
 
 
 
 
 
 
 
12
  }
13
 
14
+ .tab-buttons button.selected {
15
+ border-color: #f4f4f5 !important;
16
+ background: #3f3f46 !important;
17
+ color: #f4f4f5 !important;
18
  }
19
 
20
+ #citation-button textarea {
21
+ font-family: monospace !important;
 
22
  }
23
 
24
+ .leaderboard-container {
25
+ margin-top: 20px;
26
  }
27
 
28
+ .category-header {
29
+ font-weight: bold;
30
+ background-color: #f5f5f5;
31
+ padding: 10px;
32
+ margin-top: 15px;
33
+ border-radius: 5px;
34
  }
35
 
36
+ .metric-name {
37
+ font-weight: bold;
38
+ color: #a1a1aa !important;
 
 
 
39
  }
40
 
41
+ .model-name {
42
+ font-weight: bold;
 
43
  }
44
 
45
+ .model-link:hover {
46
+ text-decoration: underline;
47
+ color: #ffffff !important;
 
48
  }
49
 
50
+ .version-selector {
51
+ margin: 0 !important;
52
+ padding: 5px;
53
+ border-radius: 5px;
 
 
54
  }
55
 
56
+ .version-selector label {
57
+ font-weight: bold;
58
+ color: #f4f4f5 !important;
59
  }
60
 
61
+ .version-selector select {
62
+ border-color: #3f3f46 !important;
63
+ border-radius: 5px;
 
 
 
64
  }
65
 
66
+ /* Make sure the version selector is properly aligned with refresh button */
67
+ .version-selector > .block {
68
+ padding: 0 !important;
 
 
69
  }
70
 
71
+ .version-selector > .block > .wrap {
72
+ position: relative;
73
+ top: -5px;
74
  }
75
 
76
+ /* Force background/border for common layout containers */
77
+ .gradio-row > .block,
78
+ .gradio-column > .block,
79
+ .form,
80
+ .panel {
81
+ /* background: #18181b !important; */ /* Removed background override */
82
+ border-color: #27272a80 !important; /* Made border color semi-transparent */
83
+ border-width: 1px !important; /* Ensure border is visible */
84
+ border-style: solid !important;
85
  }
86
 
87
+ /* Target the specific file upload component area */
88
+ .gradio-file .wrap {
89
+ /* background: #18181b !important; */ /* Removed background override */
90
+ border-color: #27272a !important;
91
  }
92
 
93
+ #refresh-button {
94
+ margin-top: 5px !important;
95
+ margin-bottom: 5px !important;
 
 
 
 
 
96
  }
97
  """
 
 
 
 
src/display/formatting.py CHANGED
@@ -1,182 +1,71 @@
1
  """
2
- Formatting utilities for display components
3
  """
4
 
5
- import re
6
- from typing import List, Dict, Any, Optional
7
- from datetime import datetime, timezone
8
 
9
- def format_score(score: float, precision: int = 3) -> str:
10
- """Format a score with specified precision"""
11
- if isinstance(score, (int, float)):
12
- return f"{score:.{precision}f}"
13
- return str(score)
14
 
15
- def format_percentage(score: float, precision: int = 1) -> str:
16
- """Format a score as percentage"""
17
- if isinstance(score, (int, float)):
18
- return f"{score * 100:.{precision}f}%"
19
- return str(score)
20
 
21
- def format_model_name(name: str) -> str:
22
- """Format model name for display"""
23
- # Remove common prefixes and make more readable
24
- name = name.strip()
25
- if "/" in name:
26
- org, model = name.split("/", 1)
27
- return f"<span style='color: var(--text-secondary); font-size: 0.9em;'>{org}/</span><strong>{model}</strong>"
28
- return f"<strong>{name}</strong>"
29
 
30
- def format_timestamp(timestamp: str) -> str:
31
- """Format timestamp for display"""
32
- try:
33
- dt = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
34
- return dt.strftime("%Y-%m-%d %H:%M UTC")
35
- except:
36
- return timestamp
37
 
38
- def format_ip_address(ip: str) -> str:
39
- """Format IP address for display (partial masking)"""
40
- if not ip:
41
- return "Unknown"
42
-
43
- # Mask part of IP for privacy
44
- parts = ip.split(".")
45
- if len(parts) == 4:
46
- return f"{parts[0]}.{parts[1]}.{parts[2]}.xxx"
47
- return "xxx.xxx.xxx.xxx"
48
 
49
- def format_metric_score(score: int, metric_name: str) -> str:
50
- """Format metric score with color coding"""
51
- if not isinstance(score, (int, float)):
52
- return str(score)
53
-
54
- # Color coding based on score
55
- if score >= 8:
56
- color = "#ffffff" # White
57
- elif score >= 6:
58
- color = "#d0d0d0" # Light gray
59
- elif score >= 4:
60
- color = "#a0a0a0" # Gray
61
- else:
62
- color = "#707070" # Dark gray
63
-
64
- return f"<span style='color: {color}; font-weight: 600;'>{score}</span>"
65
 
66
- def format_language_badge(language: str) -> str:
67
- """Format programming language as a badge"""
68
- if not language or language == "All":
69
- return language
70
-
71
- # Language-specific colors
72
- colors = {
73
- "Python": "#3776ab",
74
- "JavaScript": "#f7df1e",
75
- "Java": "#ed8b00",
76
- "C++": "#00599c",
77
- "C#": "#239120",
78
- "Go": "#00add8",
79
- "Rust": "#ce422b",
80
- "TypeScript": "#3178c6",
81
- "PHP": "#777bb4",
82
- "Ruby": "#cc342d",
83
- "Swift": "#fa7343",
84
- "Kotlin": "#7f52ff",
85
- "Scala": "#dc322f",
86
- "R": "#276dc3",
87
- "MATLAB": "#e16737"
88
- }
89
-
90
- color = colors.get(language, "#6c757d")
91
- return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{language}</span>"
92
 
93
- def format_taxonomy_badge(category: str) -> str:
94
- """Format taxonomy category as a badge"""
95
- if not category or category == "All":
96
- return category
97
-
98
- # Category-specific colors
99
- colors = {
100
- "Bug Detection": "#dc3545",
101
- "Code Style": "#6f42c1",
102
- "Performance": "#fd7e14",
103
- "Security": "#e83e8c",
104
- "Maintainability": "#ffffff",
105
- "Documentation": "#17a2b8",
106
- "Testing": "#ffffff",
107
- "Architecture": "#6c757d",
108
- "Best Practices": "#007bff",
109
- "Refactoring": "#ffc107"
110
- }
111
-
112
- color = colors.get(category, "#6c757d")
113
- return f"<span style='background: {color}; color: white; padding: 2px 8px; border-radius: 12px; font-size: 0.8em; font-weight: 500;'>{category}</span>"
114
 
115
- def format_comment_language_flag(language: str) -> str:
116
- """Format comment language with flag emoji"""
117
- if not language or language == "All":
118
- return language
119
-
120
- # Language-specific flags
121
- flags = {
122
- "English": "🇺🇸",
123
- "Chinese": "🇨🇳",
124
- "Spanish": "🇪🇸",
125
- "French": "🇫🇷",
126
- "German": "🇩🇪",
127
- "Japanese": "🇯🇵",
128
- "Korean": "🇰🇷",
129
- "Russian": "🇷🇺",
130
- "Portuguese": "🇵🇹",
131
- "Italian": "🇮🇹",
132
- "Dutch": "🇳🇱"
133
- }
134
-
135
- flag = flags.get(language, "🌐")
136
- return f"{flag} {language}"
137
 
138
- def sanitize_html(text: str) -> str:
139
- """Sanitize HTML content to prevent XSS"""
140
- if not isinstance(text, str):
141
- return str(text)
142
-
143
- # Remove potentially dangerous HTML tags
144
- text = re.sub(r'<script[^>]*>.*?</script>', '', text, flags=re.DOTALL | re.IGNORECASE)
145
- text = re.sub(r'<iframe[^>]*>.*?</iframe>', '', text, flags=re.DOTALL | re.IGNORECASE)
146
- text = re.sub(r'on\w+="[^"]*"', '', text, flags=re.IGNORECASE)
147
- text = re.sub(r'on\w+=\'[^\']*\'', '', text, flags=re.IGNORECASE)
148
-
149
- return text
150
 
151
- def truncate_text(text: str, max_length: int = 50) -> str:
152
- """Truncate text with ellipsis"""
153
- if not isinstance(text, str):
154
- text = str(text)
155
-
156
- if len(text) <= max_length:
157
- return text
158
-
159
- return text[:max_length-3] + "..."
160
 
161
- def format_table_cell(value: Any, column_name: str) -> str:
162
- """Format table cell based on column type"""
163
- if value is None:
164
- return "N/A"
165
-
166
- # Handle different column types
167
- if column_name.lower() in ["bleu", "pass@1", "pass@5", "pass@10"]:
168
- return format_percentage(value)
169
- elif column_name.lower() == "model":
170
- return format_model_name(str(value))
171
- elif column_name.lower() == "programming language":
172
- return format_language_badge(str(value))
173
- elif column_name.lower() == "comment language":
174
- return format_comment_language_flag(str(value))
175
- elif column_name.lower() == "taxonomy":
176
- return format_taxonomy_badge(str(value))
177
- elif column_name.lower() in ["readability", "relevance", "explanation clarity",
178
- "problem identification", "actionability", "completeness",
179
- "specificity", "contextual adequacy", "consistency", "brevity"]:
180
- return format_metric_score(value, column_name.lower())
181
- else:
182
- return sanitize_html(str(value))
 
1
  """
2
+ Formatting utilities for the CodeReview Bench Leaderboard.
3
  """
4
 
5
+ import pandas as pd
6
+ import numpy as np
 
7
 
 
 
 
 
 
8
 
9
+ def make_clickable_model(model_name: str) -> str:
10
+ """
11
+ Create a clickable link for a model name.
12
+ """
13
+ return f'<a href="https://huggingface.co/{model_name}" target="_blank">{model_name}</a>'
14
 
 
 
 
 
 
 
 
 
15
 
16
+ def has_no_nan_values(df: pd.DataFrame, columns: list) -> pd.Series:
17
+ """
18
+ Check if a row has no NaN values in the specified columns.
19
+ """
20
+ return ~df[columns].isna().any(axis=1)
 
 
21
 
 
 
 
 
 
 
 
 
 
 
22
 
23
+ def format_percentage(value: float) -> str:
24
+ """
25
+ Format a value as a percentage.
26
+ """
27
+ if pd.isna(value):
28
+ return "N/A"
29
+ return f"{value * 100:.2f}%"
 
 
 
 
 
 
 
 
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ def format_number(value: float, precision: int = 2) -> str:
33
+ """
34
+ Format a number with specified precision.
35
+ """
36
+ if pd.isna(value):
37
+ return "N/A"
38
+ return f"{value:.{precision}f}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ def styled_message(message: str) -> str:
42
+ """
43
+ Format a success message with styling.
44
+ """
45
+ return f"""
46
+ <div style="padding: 10px; border-radius: 5px; background-color: #e6f7e6; color: #2e7d32; border: 1px solid #2e7d32;">
47
+ {message}
48
+ </div>
49
+ """
 
 
 
50
 
 
 
 
 
 
 
 
 
 
51
 
52
+ def styled_warning(message: str) -> str:
53
+ """
54
+ Format a warning message with styling.
55
+ """
56
+ return f"""
57
+ <div style="padding: 10px; border-radius: 5px; background-color: #fff8e1; color: #ff8f00; border: 1px solid #ff8f00;">
58
+ ⚠️ {message}
59
+ </div>
60
+ """
61
+
62
+
63
+ def styled_error(message: str) -> str:
64
+ """
65
+ Format an error message with styling.
66
+ """
67
+ return f"""
68
+ <div style="padding: 10px; border-radius: 5px; background-color: #ffebee; color: #c62828; border: 1px solid #c62828;">
69
+ {message}
70
+ </div>
71
+ """
 
 
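A short, illustrative usage sketch for the helpers above (assuming the repository root is on PYTHONPATH; the expected outputs follow directly from the function bodies):

from src.display.formatting import format_number, format_percentage, make_clickable_model

print(make_clickable_model("my-org/my-model"))   # <a href="https://huggingface.co/my-org/my-model" ...>my-org/my-model</a>
print(format_percentage(0.8532))                 # 85.32%
print(format_percentage(float("nan")))           # N/A
print(format_number(3.14159, precision=3))       # 3.142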
src/display/utils.py CHANGED
@@ -1,292 +1,415 @@
1
  """
2
- Display utilities for the CodeReview Leaderboard
3
  """
4
 
5
- from typing import List, Dict, Any, Optional, Tuple
6
- import json
7
- from datetime import datetime, timezone
8
- from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
9
- from src.display.formatting import format_table_cell, format_timestamp
10
-
11
- def filter_leaderboard_data(
12
- data: List[Dict],
13
- programming_language: str = "All",
14
- comment_language: str = "All",
15
- taxonomy_category: str = "All",
16
- sort_by: str = "llm_pass_1",
17
- sort_order: str = "desc"
18
- ) -> List[Dict]:
19
- """Filter and sort leaderboard data based on criteria"""
20
-
21
- if not data:
22
- return []
23
-
24
- # Apply filters
25
- filtered_data = data.copy()
26
-
27
- if programming_language != "All":
28
- filtered_data = [
29
- entry for entry in filtered_data
30
- if entry.get("programming_language", "").lower() == programming_language.lower()
31
- ]
32
-
33
- if comment_language != "All":
34
- filtered_data = [
35
- entry for entry in filtered_data
36
- if entry.get("comment_language", "").lower() == comment_language.lower()
37
- ]
38
-
39
- if taxonomy_category != "All":
40
- filtered_data = [
41
- entry for entry in filtered_data
42
- if entry.get("taxonomy_category", "").lower() == taxonomy_category.lower()
43
- ]
44
-
45
- # Sort data
46
- reverse = sort_order.lower() == "desc"
47
-
48
- try:
49
- if sort_by in ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]:
50
- filtered_data.sort(key=lambda x: x.get(sort_by, 0), reverse=reverse)
51
- elif sort_by in QUALITY_METRICS:
52
- filtered_data.sort(key=lambda x: x.get("metrics", {}).get(sort_by, 0), reverse=reverse)
53
- else:
54
- filtered_data.sort(key=lambda x: str(x.get(sort_by, "")), reverse=reverse)
55
- except Exception as e:
56
- print(f"Error sorting data: {e}")
57
- # Default sort by pass@1
58
- filtered_data.sort(key=lambda x: x.get("llm_pass_1", 0), reverse=True)
59
-
60
- return filtered_data
61
-
62
- def get_main_leaderboard_data(
63
- data: List[Dict],
64
- programming_language: str = "All",
65
- comment_language: str = "All",
66
- taxonomy_category: str = "All",
67
- sort_by: str = "llm_pass_1"
68
- ) -> List[List[str]]:
69
- """Get formatted main leaderboard table data"""
70
-
71
- filtered_data = filter_leaderboard_data(
72
- data, programming_language, comment_language, taxonomy_category, sort_by
73
- )
74
-
75
- table_rows = []
76
- for entry in filtered_data:
77
- row = [
78
- format_table_cell(entry.get("model_name", ""), "model"),
79
- format_table_cell(entry.get("programming_language", ""), "programming language"),
80
- format_table_cell(entry.get("comment_language", ""), "comment language"),
81
- format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
82
- format_table_cell(entry.get("bleu", 0), "bleu"),
83
- format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
84
- format_table_cell(entry.get("llm_pass_5", 0), "pass@5"),
85
- format_table_cell(entry.get("llm_pass_10", 0), "pass@10"),
86
- ]
87
- table_rows.append(row)
88
-
89
- return table_rows
90
-
91
- def get_quality_metrics_data(
92
- data: List[Dict],
93
- programming_language: str = "All",
94
- comment_language: str = "All",
95
- taxonomy_category: str = "All",
96
- sort_by: str = "llm_pass_1"
97
- ) -> List[List[str]]:
98
- """Get formatted quality metrics table data"""
99
-
100
- filtered_data = filter_leaderboard_data(
101
- data, programming_language, comment_language, taxonomy_category, sort_by
102
- )
103
-
104
- table_rows = []
105
- for entry in filtered_data:
106
- metrics = entry.get("metrics", {})
107
- row = [format_table_cell(entry.get("model_name", ""), "model")]
108
-
109
- for metric in QUALITY_METRICS:
110
- formatted_value = format_table_cell(metrics.get(metric, 0), metric.replace("_", " "))
111
- row.append(formatted_value)
112
-
113
- table_rows.append(row)
114
-
115
- return table_rows
116
-
117
- def get_submission_history_data(
118
- data: List[Dict],
119
- programming_language: str = "All",
120
- comment_language: str = "All",
121
- taxonomy_category: str = "All",
122
- limit: int = 50
123
- ) -> List[List[str]]:
124
- """Get formatted submission history data"""
125
-
126
- filtered_data = filter_leaderboard_data(
127
- data, programming_language, comment_language, taxonomy_category, "submission_date", "desc"
128
- )
129
-
130
- # Limit results
131
- filtered_data = filtered_data[:limit]
132
-
133
- table_rows = []
134
- for entry in filtered_data:
135
- row = [
136
- format_table_cell(entry.get("model_name", ""), "model"),
137
- format_table_cell(entry.get("programming_language", ""), "programming language"),
138
- format_table_cell(entry.get("comment_language", ""), "comment language"),
139
- format_table_cell(entry.get("taxonomy_category", ""), "taxonomy"),
140
- format_table_cell(entry.get("llm_pass_1", 0), "pass@1"),
141
- format_timestamp(entry.get("submission_date", "")),
142
- entry.get("submission_ip", "").split(".")[0] + ".xxx.xxx.xxx" if entry.get("submission_ip") else "Unknown"
143
- ]
144
- table_rows.append(row)
145
-
146
- return table_rows
147
 
148
- def get_statistics_summary(data: List[Dict]) -> Dict[str, Any]:
149
- """Get summary statistics for the leaderboard"""
150
-
151
- if not data:
152
- return {
153
- "total_models": 0,
154
- "total_submissions": 0,
155
- "avg_pass_1": 0,
156
- "best_model": "None",
157
- "languages_covered": 0,
158
- "categories_covered": 0
159
- }
160
-
161
- # Calculate statistics
162
- total_models = len(set(entry.get("model_name", "") for entry in data))
163
- total_submissions = len(data)
164
-
165
- pass_1_scores = [entry.get("llm_pass_1", 0) for entry in data if entry.get("llm_pass_1") is not None]
166
- avg_pass_1 = sum(pass_1_scores) / len(pass_1_scores) if pass_1_scores else 0
167
-
168
- best_entry = max(data, key=lambda x: x.get("llm_pass_1", 0)) if data else None
169
- best_model = best_entry.get("model_name", "None") if best_entry else "None"
170
-
171
- languages_covered = len(set(entry.get("programming_language", "") for entry in data if entry.get("programming_language")))
172
- categories_covered = len(set(entry.get("taxonomy_category", "") for entry in data if entry.get("taxonomy_category")))
173
-
174
- return {
175
- "total_models": total_models,
176
- "total_submissions": total_submissions,
177
- "avg_pass_1": avg_pass_1,
178
- "best_model": best_model,
179
- "languages_covered": languages_covered,
180
- "categories_covered": categories_covered
181
- }
182
-
183
- def validate_submission_data(data: Dict[str, Any]) -> Tuple[bool, str]:
184
- """Validate submission data"""
185
-
186
- required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
187
-
188
- # Check required fields
189
- for field in required_fields:
190
- if not data.get(field):
191
- return False, f"Missing required field: {field}"
192
-
193
- # Validate scores
194
- score_fields = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
195
- for field in score_fields:
196
- value = data.get(field)
197
- if value is None:
198
- return False, f"Missing score: {field}"
199
- if not isinstance(value, (int, float)):
200
- return False, f"Invalid score format: {field}"
201
- if not 0 <= value <= 1:
202
- return False, f"Score out of range (0-1): {field}"
203
-
204
- # Validate metrics
205
- metrics = data.get("metrics", {})
206
- for metric in QUALITY_METRICS:
207
- value = metrics.get(metric)
208
- if value is None:
209
- return False, f"Missing metric: {metric}"
210
- if not isinstance(value, (int, float)):
211
- return False, f"Invalid metric format: {metric}"
212
- if not 0 <= value <= 10:
213
- return False, f"Metric out of range (0-10): {metric}"
214
-
215
- # Validate language and category choices
216
- if data.get("programming_language") not in PROGRAMMING_LANGUAGES:
217
- return False, "Invalid programming language"
218
-
219
- if data.get("comment_language") not in COMMENT_LANGUAGES:
220
- return False, "Invalid comment language"
221
-
222
- if data.get("taxonomy_category") not in TAXONOMY_CATEGORIES:
223
- return False, "Invalid taxonomy category"
224
-
225
- return True, "Valid submission"
226
 
227
- def get_leaderboard_insights(data: List[Dict]) -> Dict[str, Any]:
228
- """Get insights and trends from leaderboard data"""
229
-
230
- if not data:
231
- return {}
232
-
233
- # Language performance analysis
234
- lang_performance = {}
235
- for lang in PROGRAMMING_LANGUAGES[1:]: # Skip "All"
236
- lang_data = [entry for entry in data if entry.get("programming_language") == lang]
237
- if lang_data:
238
- avg_score = sum(entry.get("llm_pass_1", 0) for entry in lang_data) / len(lang_data)
239
- lang_performance[lang] = {
240
- "avg_score": avg_score,
241
- "model_count": len(lang_data),
242
- "best_model": max(lang_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
243
- }
244
-
245
- # Category performance analysis
246
- category_performance = {}
247
- for category in TAXONOMY_CATEGORIES[1:]: # Skip "All"
248
- cat_data = [entry for entry in data if entry.get("taxonomy_category") == category]
249
- if cat_data:
250
- avg_score = sum(entry.get("llm_pass_1", 0) for entry in cat_data) / len(cat_data)
251
- category_performance[category] = {
252
- "avg_score": avg_score,
253
- "model_count": len(cat_data),
254
- "best_model": max(cat_data, key=lambda x: x.get("llm_pass_1", 0)).get("model_name", "")
255
- }
256
-
257
- return {
258
- "language_performance": lang_performance,
259
- "category_performance": category_performance,
260
- "top_performers": sorted(data, key=lambda x: x.get("llm_pass_1", 0), reverse=True)[:5]
261
- }
262
-
263
- def export_leaderboard_data(data: List[Dict], format_type: str = "json") -> str:
264
- """Export leaderboard data in specified format"""
265
-
266
- if format_type.lower() == "json":
267
- return json.dumps(data, indent=2, ensure_ascii=False)
268
- elif format_type.lower() == "csv":
269
- # Simple CSV export
270
- if not data:
271
- return ""
272
-
273
- # Get headers
274
- headers = ["model_name", "programming_language", "comment_language", "taxonomy_category",
275
- "bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
276
- headers.extend(QUALITY_METRICS)
277
-
278
- lines = [",".join(headers)]
279
-
280
- for entry in data:
281
- row = []
282
- for header in headers:
283
- if header in QUALITY_METRICS:
284
- value = entry.get("metrics", {}).get(header, "")
285
- else:
286
- value = entry.get(header, "")
287
- row.append(str(value))
288
- lines.append(",".join(row))
289
-
290
- return "\n".join(lines)
291
- else:
292
- return "Unsupported format"
 
 
 
 
1
  """
2
+ Utility classes and functions for the CodeReview Bench Leaderboard display.
3
  """
4
 
5
+ from dataclasses import dataclass, field, fields
6
+ from enum import Enum, auto
7
+ from typing import List, Optional
 
 
 
 
8
 
 
 
 
9
 
10
+ class Mode(Enum):
11
+ """Inference mode for the review model."""
12
+ CoT = auto() # Chain of Thought
13
+ Strict = auto()
14
+
15
+ def __str__(self):
16
+ """String representation of the mode."""
17
+ return self.name
18
+
19
+
20
+ class ModelType(Enum):
21
+ """Model types for the leaderboard."""
22
+ Unknown = auto()
23
+ OpenSource = auto()
24
+ ClosedSource = auto()
25
+ API = auto()
26
+
27
+ def to_str(self, separator: str = "-") -> str:
28
+ """Convert enum to string with separator."""
29
+ if self == ModelType.Unknown:
30
+ return "Unknown"
31
+ elif self == ModelType.OpenSource:
32
+ return f"Open{separator}Source"
33
+ elif self == ModelType.ClosedSource:
34
+ return f"Closed{separator}Source"
35
+ elif self == ModelType.API:
36
+ return "API"
37
+ return "Unknown"
38
+
39
+
40
+ class ReviewModelType(str, Enum):
41
+ """Review model types for the leaderboard."""
42
+ GPT_4 = "gpt-4"
43
+ GPT_3_5 = "gpt-3.5-turbo"
44
+ CLAUDE = "claude"
45
+ LLAMA = "llama"
46
+ GEMINI = "gemini"
47
+ CUSTOM = "custom"
48
+
49
+ def __str__(self):
50
+ """String representation of the review model type."""
51
+ return self.value
52
+
53
+
54
+ class Precision(Enum):
55
+ """Model precision types."""
56
+ Unknown = auto()
57
+ float16 = auto()
58
+ bfloat16 = auto()
59
+ float32 = auto()
60
+ int8 = auto()
61
+ int4 = auto()
62
+ NA = auto()
63
+
64
+ def __str__(self):
65
+ """String representation of the precision type."""
66
+ return self.name
67
+
68
+
69
+ class WeightType(Enum):
70
+ """Model weight types."""
71
+ Original = auto()
72
+ Delta = auto()
73
+ Adapter = auto()
74
+
75
+ def __str__(self):
76
+ """String representation of the weight type."""
77
+ return self.name
78
+
79
+
80
+ @dataclass
81
+ class ColumnInfo:
82
+ """Information about a column in the leaderboard."""
83
+ name: str
84
+ display_name: str
85
+ type: str = "text"
86
+ hidden: bool = False
87
+ never_hidden: bool = False
88
+ displayed_by_default: bool = True
89
+
90
+
91
+ @dataclass
92
+ class CodeReviewBenchColumn:
93
+ """Columns for the CodeReview Bench leaderboard."""
94
+ # Core metadata
95
+ model_name: ColumnInfo = field(default_factory=lambda: ColumnInfo(
96
+ name="model_name",
97
+ display_name="Model",
98
+ never_hidden=True,
99
+ displayed_by_default=True
100
+ ))
101
+ mode: ColumnInfo = field(default_factory=lambda: ColumnInfo(
102
+ name="mode",
103
+ display_name="Mode",
104
+ displayed_by_default=True
105
+ ))
106
+ model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
107
+ name="model_type",
108
+ display_name="Access_Type",
109
+ displayed_by_default=True
110
+ ))
111
+ submission_date: ColumnInfo = field(default_factory=lambda: ColumnInfo(
112
+ name="submission_date",
113
+ display_name="Submission_Date",
114
+ displayed_by_default=False
115
+ ))
116
+ version: ColumnInfo = field(default_factory=lambda: ColumnInfo(
117
+ name="version",
118
+ display_name="Version",
119
+ displayed_by_default=False
120
+ ))
121
+ review_model_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
122
+ name="review_model_type",
123
+ display_name="Type",
124
+ displayed_by_default=False
125
+ ))
126
+ base_model: ColumnInfo = field(default_factory=lambda: ColumnInfo(
127
+ name="base_model",
128
+ display_name="Base Model",
129
+ displayed_by_default=False
130
+ ))
131
+ revision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
132
+ name="revision",
133
+ display_name="Revision",
134
+ displayed_by_default=False
135
+ ))
136
+ precision: ColumnInfo = field(default_factory=lambda: ColumnInfo(
137
+ name="precision",
138
+ display_name="Precision",
139
+ displayed_by_default=False
140
+ ))
141
+ weight_type: ColumnInfo = field(default_factory=lambda: ColumnInfo(
142
+ name="weight_type",
143
+ display_name="Weight Type",
144
+ displayed_by_default=False
145
+ ))
146
+
147
+ # LLM-based multimetric scores
148
+ readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
149
+ name="readability",
150
+ display_name="Readability",
151
+ type="number",
152
+ displayed_by_default=True
153
+ ))
154
+ relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
155
+ name="relevance",
156
+ display_name="Relevance",
157
+ type="number",
158
+ displayed_by_default=True
159
+ ))
160
+ explanation_clarity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
161
+ name="explanation_clarity",
162
+ display_name="Explanation_Clarity",
163
+ type="number",
164
+ displayed_by_default=True
165
+ ))
166
+ problem_identification: ColumnInfo = field(default_factory=lambda: ColumnInfo(
167
+ name="problem_identification",
168
+ display_name="Problem_Identification",
169
+ type="number",
170
+ displayed_by_default=True
171
+ ))
172
+ actionability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
173
+ name="actionability",
174
+ display_name="Actionability",
175
+ type="number",
176
+ displayed_by_default=True
177
+ ))
178
+ completeness: ColumnInfo = field(default_factory=lambda: ColumnInfo(
179
+ name="completeness",
180
+ display_name="Completeness",
181
+ type="number",
182
+ displayed_by_default=True
183
+ ))
184
+ specificity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
185
+ name="specificity",
186
+ display_name="Specificity",
187
+ type="number",
188
+ displayed_by_default=True
189
+ ))
190
+ contextual_adequacy: ColumnInfo = field(default_factory=lambda: ColumnInfo(
191
+ name="contextual_adequacy",
192
+ display_name="Contextual_Adequacy",
193
+ type="number",
194
+ displayed_by_default=True
195
+ ))
196
+ consistency: ColumnInfo = field(default_factory=lambda: ColumnInfo(
197
+ name="consistency",
198
+ display_name="Consistency",
199
+ type="number",
200
+ displayed_by_default=True
201
+ ))
202
+ brevity: ColumnInfo = field(default_factory=lambda: ColumnInfo(
203
+ name="brevity",
204
+ display_name="Brevity",
205
+ type="number",
206
+ displayed_by_default=True
207
+ ))
208
+
209
+ # LLM-based exact-match metrics
210
+ pass_at_1: ColumnInfo = field(default_factory=lambda: ColumnInfo(
211
+ name="pass_at_1",
212
+ display_name="Pass@1",
213
+ type="number",
214
+ displayed_by_default=True
215
+ ))
216
+ pass_at_5: ColumnInfo = field(default_factory=lambda: ColumnInfo(
217
+ name="pass_at_5",
218
+ display_name="Pass@5",
219
+ type="number",
220
+ displayed_by_default=True
221
+ ))
222
+ pass_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
223
+ name="pass_at_10",
224
+ display_name="Pass@10",
225
+ type="number",
226
+ displayed_by_default=True
227
+ ))
228
+ bleu_at_10: ColumnInfo = field(default_factory=lambda: ColumnInfo(
229
+ name="bleu_at_10",
230
+ display_name="BLEU@10",
231
+ type="number",
232
+ displayed_by_default=True
233
+ ))
234
+
235
+ # Overall aggregated metrics
236
+ overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
237
+ name="overall_score",
238
+ display_name="Overall_Score",
239
+ type="number",
240
+ displayed_by_default=True
241
+ ))
242
+ multimetric_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
243
+ name="multimetric_average",
244
+ display_name="Multimetric_Average",
245
+ type="number",
246
+ displayed_by_default=True
247
+ ))
248
+ exact_match_average: ColumnInfo = field(default_factory=lambda: ColumnInfo(
249
+ name="exact_match_average",
250
+ display_name="Exact_Match_Average",
251
+ type="number",
252
+ displayed_by_default=True
253
+ ))
254
+ total_evaluations: ColumnInfo = field(default_factory=lambda: ColumnInfo(
255
+ name="total_evaluations",
256
+ display_name="Total_Evaluations",
257
+ type="number",
258
+ displayed_by_default=True
259
+ ))
260
+
261
+ # Language-specific metrics (Russian)
262
+ ru_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
263
+ name="ru_readability",
264
+ display_name="RU_Readability",
265
+ type="number",
266
+ displayed_by_default=False
267
+ ))
268
+ ru_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
269
+ name="ru_relevance",
270
+ display_name="RU_Relevance",
271
+ type="number",
272
+ displayed_by_default=False
273
+ ))
274
+ ru_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
275
+ name="ru_overall_score",
276
+ display_name="RU_Overall_Score",
277
+ type="number",
278
+ displayed_by_default=False
279
+ ))
280
+
281
+ # Language-specific metrics (English)
282
+ en_readability: ColumnInfo = field(default_factory=lambda: ColumnInfo(
283
+ name="en_readability",
284
+ display_name="EN_Readability",
285
+ type="number",
286
+ displayed_by_default=False
287
+ ))
288
+ en_relevance: ColumnInfo = field(default_factory=lambda: ColumnInfo(
289
+ name="en_relevance",
290
+ display_name="EN_Relevance",
291
+ type="number",
292
+ displayed_by_default=False
293
+ ))
294
+ en_overall_score: ColumnInfo = field(default_factory=lambda: ColumnInfo(
295
+ name="en_overall_score",
296
+ display_name="EN_Overall_Score",
297
+ type="number",
298
+ displayed_by_default=False
299
+ ))
300
+
301
+
302
+ # Create instances for easy access
303
+ CODEREVIEW_COLUMN = CodeReviewBenchColumn()
304
+
305
+ # Extract column lists for different views
306
+ COLS = [f.name for f in fields(CODEREVIEW_COLUMN)]
307
+ DISPLAY_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
308
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
309
+
310
+ # Manually reorder DISPLAY_COLS to put 'mode' after 'model_name'
311
+ def reorder_display_cols():
312
+ cols = DISPLAY_COLS
313
+ if 'model_name' in cols and 'mode' in cols:
314
+ cols.remove('mode')
315
+ model_name_index = cols.index('model_name')
316
+ cols.insert(model_name_index + 1, 'mode')
317
+ return cols
318
+ DISPLAY_COLS = reorder_display_cols()
319
+
320
+ METRIC_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
321
+ if getattr(CODEREVIEW_COLUMN, f.name).type == "number"]
322
+ HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
323
+ if getattr(CODEREVIEW_COLUMN, f.name).hidden]
324
+ NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
325
+ if getattr(CODEREVIEW_COLUMN, f.name).never_hidden]
326
+
327
+ # Categories for CodeReview Bench (Programming Languages)
328
+ CATEGORIES = [
329
+ 'Python',
330
+ 'JavaScript',
331
+ 'Java',
332
+ 'C++',
333
+ 'C#',
334
+ 'TypeScript',
335
+ 'Go',
336
+ 'Rust',
337
+ 'Swift',
338
+ 'Kotlin',
339
+ 'Ruby',
340
+ 'PHP',
341
+ 'C',
342
+ 'Scala',
343
+ 'R',
344
+ 'Dart',
345
+ 'Other'
346
+ ]
347
+
348
+ # Language taxonomies for CodeReview Bench
349
+ COMMENT_LANGUAGES = [
350
+ 'ru', # Russian
351
+ 'en' # English
352
+ ]
353
+
354
+ # Example categories
355
+ EXAMPLE_CATEGORIES = [
356
+ 'Bug_Fix',
357
+ 'Code_Style',
358
+ 'Performance',
359
+ 'Security',
360
+ 'Refactoring',
361
+ 'Documentation',
362
+ 'Testing',
363
+ 'Architecture',
364
+ 'Other'
365
+ ]
366
+
367
+ # Metrics for CodeReview Bench
368
+ MULTIMETRIC_METRICS = [
369
+ "readability",
370
+ "relevance",
371
+ "explanation_clarity",
372
+ "problem_identification",
373
+ "actionability",
374
+ "completeness",
375
+ "specificity",
376
+ "contextual_adequacy",
377
+ "consistency",
378
+ "brevity"
379
+ ]
380
+
381
+ EXACT_MATCH_METRICS = [
382
+ "pass_at_1",
383
+ "pass_at_5",
384
+ "pass_at_10",
385
+ "bleu_at_10"
386
+ ]
387
+
388
+ def get_all_column_choices():
389
+ """
390
+ Get all available column choices for the multiselect dropdown.
391
+
392
+ Returns:
393
+ List of tuples with (column_name, display_name) for all columns.
394
+ """
395
+ column_choices = []
396
+
397
+ default_visible_columns = get_default_visible_columns()
398
+
399
+ for f in fields(CODEREVIEW_COLUMN):
400
+ column_info = getattr(CODEREVIEW_COLUMN, f.name)
401
+ # Create a tuple with both the internal name and display name
402
+ if column_info.name not in default_visible_columns:
403
+ column_choices.append((column_info.name, column_info.display_name))
404
+
405
+ return column_choices
406
+
407
+ def get_default_visible_columns():
408
+ """
409
+ Get the list of column names that should be visible by default.
410
+
411
+ Returns:
412
+ List of column names that are displayed by default.
413
+ """
414
+ return [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(CODEREVIEW_COLUMN)
415
+ if getattr(CODEREVIEW_COLUMN, f.name).displayed_by_default]
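For reference, a minimal sketch (not part of this commit) of how the column helpers defined above might be consumed; every name used here comes from the code in this file.

# Illustrative only: exercising the column helpers from src/display/utils.py.
from src.display.utils import (
    DISPLAY_COLS, METRIC_COLS,
    get_all_column_choices, get_default_visible_columns,
)

print(DISPLAY_COLS)                      # default-visible columns, with "mode" right after "model_name"
extra = get_all_column_choices()         # [(internal_name, display_name), ...] for hidden-by-default columns
visible = get_default_visible_columns()  # column names shown by default
assert "overall_score" in METRIC_COLS    # numeric columns are the sorting/aggregation candidates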
src/envs.py CHANGED
@@ -1,106 +1,27 @@
1
- """
2
- Environment configuration and constants
3
- """
4
-
5
  import os
6
- from pathlib import Path
7
-
8
- # Data paths
9
- DATA_DIR = Path("data")
10
- LEADERBOARD_PATH = DATA_DIR / "leaderboard_data.json"
11
- SUBMISSIONS_PATH = DATA_DIR / "submissions.json"
12
-
13
- # Create data directory if it doesn't exist
14
- DATA_DIR.mkdir(exist_ok=True)
15
-
16
- # Programming languages supported
17
- PROGRAMMING_LANGUAGES = [
18
- "All",
19
- "Python",
20
- "JavaScript",
21
- "Java",
22
- "C++",
23
- "C#",
24
- "Go",
25
- "Rust",
26
- "TypeScript",
27
- "PHP",
28
- "Ruby",
29
- "Swift",
30
- "Kotlin",
31
- "Scala",
32
- "R",
33
- "MATLAB",
34
- "Other"
35
- ]
36
 
37
- # Comment languages supported
38
- COMMENT_LANGUAGES = [
39
- "All",
40
- "English",
41
- "Chinese",
42
- "Spanish",
43
- "French",
44
- "German",
45
- "Japanese",
46
- "Korean",
47
- "Russian",
48
- "Portuguese",
49
- "Italian",
50
- "Dutch",
51
- "Other"
52
- ]
53
 
54
- # Taxonomy categories
55
- TAXONOMY_CATEGORIES = [
56
- "All",
57
- "Bug Detection",
58
- "Code Style",
59
- "Performance",
60
- "Security",
61
- "Maintainability",
62
- "Documentation",
63
- "Testing",
64
- "Architecture",
65
- "Best Practices",
66
- "Refactoring",
67
- "Other"
68
- ]
69
 
70
- # Quality metrics
71
- QUALITY_METRICS = [
72
- "readability",
73
- "relevance",
74
- "explanation_clarity",
75
- "problem_identification",
76
- "actionability",
77
- "completeness",
78
- "specificity",
79
- "contextual_adequacy",
80
- "consistency",
81
- "brevity"
82
- ]
83
 
84
- # Table headers
85
- MAIN_HEADERS = ["Model", "Programming Language", "Comment Language", "Taxonomy", "BLEU", "Pass@1", "Pass@5", "Pass@10"]
 
86
 
87
- QUALITY_HEADERS = ["Model"] + [metric.replace("_", " ").title() for metric in QUALITY_METRICS]
 
88
 
89
- # Default data
90
- DEFAULT_DATA = [{
91
- "model_name": "example/model",
92
- "programming_language": "Python",
93
- "comment_language": "English",
94
- "taxonomy_category": "Bug Detection",
95
- "bleu": 0.5,
96
- "llm_pass_1": 0.5,
97
- "llm_pass_5": 0.5,
98
- "llm_pass_10": 0.5,
99
- "metrics": {
100
- "readability": 5, "relevance": 5, "explanation_clarity": 5,
101
- "problem_identification": 5, "actionability": 5, "completeness": 5,
102
- "specificity": 5, "contextual_adequacy": 5, "consistency": 5, "brevity": 5
103
- },
104
- "submission_ip": "127.0.0.1",
105
- "submission_date": "2024-01-01T00:00:00Z"
106
- }]
 
 
 
 
 
1
  import os
2
+ from huggingface_hub import HfApi
3
+ from dotenv import load_dotenv
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
+ # Load environment variables
6
+ load_dotenv()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Hugging Face configuration
9
+ TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
10
+ OWNER = os.environ.get("OWNER", "codereview-bench") # Change to your org
11
+ SUBMITTER_TOKEN = os.environ.get("SUBMITTER_TOKEN")
12
+ ADMIN_USERNAME = os.environ.get("ADMIN_USERNAME")
13
+ ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD")
 
 
 
 
 
 
 
 
 
14
 
15
+ # Repository IDs
16
+ REPO_ID = f"{OWNER}/codereview-bench"
17
+ RESULTS_DATASET_ID = os.environ.get("RESULTS_DATASET_ID", f"{OWNER}/codereview-bench-results")
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Cache paths
20
+ CACHE_PATH = os.getenv("HF_HOME", ".")
21
+ DATA_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
22
 
23
+ # Local data paths
24
+ LEADERBOARD_FILE = os.path.join(DATA_PATH, "leaderboard.json")
25
 
26
+ # HF API instance
27
+ API = HfApi(token=TOKEN)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/leaderboard/processor.py CHANGED
@@ -1,306 +1,271 @@
1
  """
2
- Leaderboard data processor for CodeReview Leaderboard
3
  """
4
 
5
  import json
6
- import traceback
7
- from typing import List, Dict, Any, Optional
8
- from datetime import datetime, timezone, timedelta
9
- from pathlib import Path
10
- from src.envs import LEADERBOARD_PATH, SUBMISSIONS_PATH, DEFAULT_DATA
11
- from src.display.utils import validate_submission_data, get_statistics_summary
12
-
13
- class LeaderboardProcessor:
14
- """Handles all leaderboard data operations"""
 
 
 
 
 
 
15
 
16
- def __init__(self):
17
- self.leaderboard_path = LEADERBOARD_PATH
18
- self.submissions_path = SUBMISSIONS_PATH
19
- self._ensure_data_files()
20
 
21
- def _ensure_data_files(self):
22
- """Ensure data files exist with default data"""
23
- if not self.leaderboard_path.exists():
24
- self.save_leaderboard_data(DEFAULT_DATA)
25
 
26
- if not self.submissions_path.exists():
27
- self.save_submission_log([])
 
 
 
 
 
 
28
 
29
- def load_leaderboard_data(self) -> List[Dict]:
30
- """Load leaderboard data from storage"""
31
- try:
32
- with open(self.leaderboard_path, 'r', encoding='utf-8') as f:
33
- data = json.load(f)
34
- return data.get("leaderboard", [])
35
- except Exception as e:
36
- print(f"Error loading leaderboard: {e}")
37
- return DEFAULT_DATA.copy()
38
 
39
- def save_leaderboard_data(self, data: List[Dict]) -> bool:
40
- """Save leaderboard data to storage"""
41
- try:
42
- to_store = {
43
- "leaderboard": data,
44
- "last_updated": datetime.now(timezone.utc).isoformat(),
45
- "total_entries": len(data)
46
- }
47
-
48
- with open(self.leaderboard_path, 'w', encoding='utf-8') as f:
49
- json.dump(to_store, f, indent=2, ensure_ascii=False)
50
-
51
- return True
52
- except Exception as e:
53
- print(f"Error saving leaderboard: {e}")
54
- return False
55
 
56
- def load_submission_log(self) -> List[Dict]:
57
- """Load submission log from storage"""
58
- try:
59
- with open(self.submissions_path, 'r', encoding='utf-8') as f:
60
- data = json.load(f)
61
- return data.get("submissions", [])
62
- except Exception as e:
63
- print(f"Error loading submission log: {e}")
64
- return []
65
 
66
- def save_submission_log(self, submissions: List[Dict]) -> bool:
67
- """Save submission log to storage"""
68
- try:
69
- to_store = {
70
- "submissions": submissions,
71
- "last_updated": datetime.now(timezone.utc).isoformat(),
72
- "total_submissions": len(submissions)
73
- }
74
-
75
- with open(self.submissions_path, 'w', encoding='utf-8') as f:
76
- json.dump(to_store, f, indent=2, ensure_ascii=False)
77
-
78
- return True
79
- except Exception as e:
80
- print(f"Error saving submission log: {e}")
81
- return False
82
 
83
- def add_submission(self, submission_data: Dict[str, Any], ip_address: str) -> tuple[bool, str]:
84
- """Add a new submission to the leaderboard"""
85
- try:
86
- # Validate submission data
87
- is_valid, message = validate_submission_data(submission_data)
88
- if not is_valid:
89
- return False, message
90
-
91
- # Add metadata
92
- submission_data["submission_ip"] = ip_address
93
- submission_data["submission_date"] = datetime.now(timezone.utc).isoformat()
94
-
95
- # Load current data
96
- current_data = self.load_leaderboard_data()
97
-
98
- # Check for existing model and replace if found
99
- model_name = submission_data.get("model_name", "")
100
- current_data = [entry for entry in current_data if entry.get("model_name") != model_name]
101
-
102
- # Add new submission
103
- current_data.append(submission_data)
104
-
105
- # Save updated data
106
- if self.save_leaderboard_data(current_data):
107
- # Log the submission
108
- self._log_submission(submission_data, ip_address)
109
- return True, "✅ Submission recorded successfully!"
  else:
111
- return False, "❌ Failed to save submission"
112
-
113
- except Exception as e:
114
- print(f"Error adding submission: {e}")
115
- traceback.print_exc()
116
- return False, f"❌ Submission failed: {str(e)}"
117
-
118
- def _log_submission(self, submission_data: Dict[str, Any], ip_address: str):
119
- """Log submission for audit trail"""
120
- try:
121
- submissions = self.load_submission_log()
122
-
123
- log_entry = {
124
- "model_name": submission_data.get("model_name"),
125
- "programming_language": submission_data.get("programming_language"),
126
- "comment_language": submission_data.get("comment_language"),
127
- "taxonomy_category": submission_data.get("taxonomy_category"),
128
- "scores": {
129
- "bleu": submission_data.get("bleu"),
130
- "llm_pass_1": submission_data.get("llm_pass_1"),
131
- "llm_pass_5": submission_data.get("llm_pass_5"),
132
- "llm_pass_10": submission_data.get("llm_pass_10")
133
- },
134
- "submission_ip": ip_address,
135
- "submission_date": submission_data.get("submission_date"),
136
- "status": "accepted"
137
- }
138
-
139
- submissions.append(log_entry)
140
-
141
- # Keep only last 1000 submissions
142
- submissions = submissions[-1000:]
143
-
144
- self.save_submission_log(submissions)
145
-
146
- except Exception as e:
147
- print(f"Error logging submission: {e}")
148
-
149
- def get_model_history(self, model_name: str) -> List[Dict]:
150
- """Get submission history for a specific model"""
151
- try:
152
- submissions = self.load_submission_log()
153
- return [
154
- sub for sub in submissions
155
- if sub.get("model_name") == model_name
156
- ]
157
- except Exception as e:
158
- print(f"Error getting model history: {e}")
159
- return []
160
-
161
- def get_ip_submissions(self, ip_address: str, limit: int = 10) -> List[Dict]:
162
- """Get recent submissions from a specific IP"""
163
- try:
164
- submissions = self.load_submission_log()
165
- ip_submissions = [
166
- sub for sub in submissions
167
- if sub.get("submission_ip") == ip_address
168
- ]
169
-
170
- # Sort by date and limit
171
- ip_submissions.sort(key=lambda x: x.get("submission_date", ""), reverse=True)
172
- return ip_submissions[:limit]
173
-
174
- except Exception as e:
175
- print(f"Error getting IP submissions: {e}")
176
- return []
177
-
178
- def check_rate_limit(self, ip_address: str, max_submissions: int = 5, hours: int = 24) -> tuple[bool, str]:
179
- """Check if IP has exceeded rate limit"""
180
- try:
181
- submissions = self.get_ip_submissions(ip_address, max_submissions * 2)
182
-
183
- # Count submissions within the time window
184
- cutoff_time = datetime.now(timezone.utc) - timedelta(hours=hours)
185
- recent_submissions = [
186
- sub for sub in submissions
187
- if datetime.fromisoformat(sub.get("submission_date", "")).replace(tzinfo=timezone.utc) > cutoff_time
188
- ]
189
-
190
- if len(recent_submissions) >= max_submissions:
191
- return False, f"Rate limit exceeded: {len(recent_submissions)}/{max_submissions} submissions in {hours} hours"
192
-
193
- return True, f"Rate limit OK: {len(recent_submissions)}/{max_submissions} submissions in {hours} hours"
194
-
195
- except Exception as e:
196
- print(f"Error checking rate limit: {e}")
197
- return True, "Rate limit check failed, allowing submission"
198
-
199
- def get_leaderboard_stats(self) -> Dict[str, Any]:
200
- """Get comprehensive leaderboard statistics"""
201
- try:
202
- data = self.load_leaderboard_data()
203
- submissions = self.load_submission_log()
204
-
205
- basic_stats = get_statistics_summary(data)
206
-
207
- # Additional stats
208
- recent_submissions = len([
209
- sub for sub in submissions
210
- if datetime.fromisoformat(sub.get("submission_date", "")).replace(tzinfo=timezone.utc) >
211
- datetime.now(timezone.utc) - timedelta(days=7)
212
- ])
213
-
214
- return {
215
- **basic_stats,
216
- "recent_submissions_7d": recent_submissions,
217
- "total_logged_submissions": len(submissions),
218
- "last_updated": datetime.now(timezone.utc).isoformat()
219
- }
220
-
221
- except Exception as e:
222
- print(f"Error getting leaderboard stats: {e}")
223
- return {}
224
-
225
- def backup_data(self) -> bool:
226
- """Create backup of current data"""
227
- try:
228
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
229
- backup_dir = Path("backups")
230
- backup_dir.mkdir(exist_ok=True)
231
-
232
- # Backup leaderboard
233
- if self.leaderboard_path.exists():
234
- backup_path = backup_dir / f"leaderboard_{timestamp}.json"
235
- with open(self.leaderboard_path, 'r') as src, open(backup_path, 'w') as dst:
236
- dst.write(src.read())
237
-
238
- # Backup submissions
239
- if self.submissions_path.exists():
240
- backup_path = backup_dir / f"submissions_{timestamp}.json"
241
- with open(self.submissions_path, 'r') as src, open(backup_path, 'w') as dst:
242
- dst.write(src.read())
243
-
244
- return True
245
-
246
- except Exception as e:
247
- print(f"Error creating backup: {e}")
248
- return False
249
-
250
- def export_data(self, format_type: str = "json") -> str:
251
- """Export leaderboard data in specified format"""
252
- try:
253
- from src.display.utils import export_leaderboard_data
254
-
255
- data = self.load_leaderboard_data()
256
- return export_leaderboard_data(data, format_type)
257
-
258
- except Exception as e:
259
- print(f"Error exporting data: {e}")
260
- return f"Export failed: {str(e)}"
261
-
262
- def validate_data_integrity(self) -> Dict[str, Any]:
263
- """Validate data integrity and return report"""
264
- try:
265
- data = self.load_leaderboard_data()
266
- submissions = self.load_submission_log()
267
-
268
- issues = []
269
-
270
- # Check for duplicate models
271
- model_names = [entry.get("model_name") for entry in data]
272
- duplicates = [name for name in model_names if model_names.count(name) > 1]
273
- if duplicates:
274
- issues.append(f"Duplicate models found: {set(duplicates)}")
275
-
276
- # Check for missing required fields
277
- required_fields = ["model_name", "programming_language", "comment_language", "taxonomy_category"]
278
- for i, entry in enumerate(data):
279
- missing = [field for field in required_fields if not entry.get(field)]
280
- if missing:
281
- issues.append(f"Entry {i}: Missing fields {missing}")
282
-
283
- # Check score ranges
284
- for i, entry in enumerate(data):
285
- scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
286
- for score in scores:
287
- value = entry.get(score)
288
- if value is not None and (value < 0 or value > 1):
289
- issues.append(f"Entry {i}: {score} out of range: {value}")
290
-
291
- return {
292
- "is_valid": len(issues) == 0,
293
- "issues": issues,
294
- "total_entries": len(data),
295
- "total_submissions": len(submissions),
296
- "check_date": datetime.now(timezone.utc).isoformat()
297
- }
298
-
299
- except Exception as e:
300
- return {
301
- "is_valid": False,
302
- "issues": [f"Validation failed: {str(e)}"],
303
- "total_entries": 0,
304
- "total_submissions": 0,
305
- "check_date": datetime.now(timezone.utc).isoformat()
306
- }
 
1
  """
2
+ Process CodeReview Bench leaderboard data and submissions.
3
  """
4
 
5
  import json
6
+ import os
7
+ import pandas as pd
8
+ from datetime import datetime
9
+ from typing import Dict, List, Tuple, Optional
10
+ import numpy as np
11
+
12
+ from src.display.utils import (
13
+ CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES, COMMENT_LANGUAGES, EXAMPLE_CATEGORIES,
14
+ MULTIMETRIC_METRICS, EXACT_MATCH_METRICS
15
+ )
16
+
17
+
18
+ def process_jsonl_submission(file_path: str) -> Tuple[List[Dict], str]:
19
+ """
20
+ Process a JSONL submission file for CodeReview Bench.
21
 
22
+ Args:
23
+ file_path: Path to the JSONL submission file
24
+
25
+ Returns:
26
+ Tuple of (entries_list, message)
27
+ """
28
+ try:
29
+ entries = []
30
+ with open(file_path, 'r', encoding='utf-8') as f:
31
+ for line_num, line in enumerate(f, 1):
32
+ line = line.strip()
33
+ if not line:
34
+ continue
35
+
36
+ try:
37
+ entry = json.loads(line)
38
+
39
+ # Validate required fields
40
+ required_fields = ['model_name', 'programming_language', 'comment_language']
41
+ missing_fields = [field for field in required_fields if field not in entry]
42
+ if missing_fields:
43
+ return [], f"Missing required fields {missing_fields} in line {line_num}"
44
+
45
+ # Validate metrics exist
46
+ has_multimetric = any(metric in entry for metric in MULTIMETRIC_METRICS)
47
+ has_exact_match = any(metric in entry for metric in EXACT_MATCH_METRICS)
48
+
49
+ if not has_multimetric and not has_exact_match:
50
+ return [], f"No valid metrics found in line {line_num}. Required: {MULTIMETRIC_METRICS + EXACT_MATCH_METRICS}"
51
+
52
+ entries.append(entry)
53
+
54
+ except json.JSONDecodeError as e:
55
+ return [], f"Invalid JSON in line {line_num}: {e}"
56
+
57
+ if not entries:
58
+ return [], "No valid entries found in submission file"
59
+
60
+ return entries, f"Successfully processed {len(entries)} entries"
61
+
62
+ except Exception as e:
63
+ return [], f"Error processing submission: {e}"
64
+
65
+
66
+ def calculate_overall_score(entry: Dict) -> float:
67
+ """
68
+ Calculate overall score for a CodeReview Bench entry.
69
 
70
+ Args:
71
+ entry: Dictionary containing model evaluation results
 
 
72
 
73
+ Returns:
74
+ Overall score as float
75
+ """
76
+ # Calculate multimetric average
77
+ multimetric_scores = []
78
+ for metric in MULTIMETRIC_METRICS:
79
+ if metric in entry and isinstance(entry[metric], (int, float)):
80
+ multimetric_scores.append(entry[metric])
81
 
82
+ multimetric_avg = np.mean(multimetric_scores) if multimetric_scores else 0
 
 
 
 
 
 
 
 
83
 
84
+ # Calculate exact match average
85
+ exact_match_scores = []
86
+ for metric in EXACT_MATCH_METRICS:
87
+ if metric in entry and isinstance(entry[metric], (int, float)):
88
+ exact_match_scores.append(entry[metric])
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ exact_match_avg = np.mean(exact_match_scores) if exact_match_scores else 0
 
 
 
 
 
 
 
 
91
 
92
+ # Weighted combination (can be adjusted based on requirements)
93
+ overall_score = (multimetric_avg * 0.7) + (exact_match_avg * 0.3)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
+ return overall_score
96
+
97
+
98
+ def load_leaderboard_data(file_path: str) -> Dict:
99
+ """
100
+ Load the leaderboard data from a JSON file.
101
+ """
102
+ if not os.path.exists(file_path):
103
+ version = "v0"
104
+ if "_v" in file_path:
105
+ version = file_path.split("_")[-1].split(".")[0]
106
+ return {"entries": [], "last_updated": datetime.now().isoformat(), "version": version}
107
+
108
+ with open(file_path, 'r') as f:
109
+ data = json.load(f)
110
+
111
+ # Ensure version field exists
112
+ if "version" not in data:
113
+ version = "v0"
114
+ if "_v" in file_path:
115
+ version = file_path.split("_")[-1].split(".")[0]
116
+ data["version"] = version
117
+
118
+ return data
119
+
120
+
121
+ def save_leaderboard_data(data: Dict, file_path: str) -> None:
122
+ """
123
+ Save the leaderboard data to a JSON file.
124
+ """
125
+ # Ensure the directory exists
126
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
127
+
128
+ # Update the last_updated timestamp
129
+ data["last_updated"] = datetime.now().isoformat()
130
+
131
+ # Ensure version is set
132
+ if "version" not in data:
133
+ version = "v0"
134
+ if "_v" in file_path:
135
+ version = file_path.split("_")[-1].split(".")[0]
136
+ data["version"] = version
137
+
138
+ with open(file_path, 'w') as f:
139
+ json.dump(data, f, indent=2)
140
+
141
+
142
+ def leaderboard_to_dataframe(leaderboard_data: Dict) -> pd.DataFrame:
143
+ """
144
+ Convert leaderboard data to a pandas DataFrame for display.
145
+ """
146
+ rows = []
147
+
148
+ for entry in leaderboard_data.get("entries", []):
149
+ model_name = entry.get("model_name", "Unknown Model")
150
+
151
+ # Extract basic metadata
152
+ row = {
153
+ "model_name": model_name,
154
+ "model_type": entry.get("model_type", "Unknown"),
155
+ "mode": entry.get("mode", "Strict"),
156
+ "submission_date": entry.get("submission_date", ""),
157
+ "version": entry.get("version", "v0"),
158
+ "review_model_type": entry.get("review_model_type", "custom").lower()
159
+ }
160
+
161
+ # Add additional metadata fields if present
162
+ for key in ["base_model", "revision", "precision", "weight_type"]:
163
+ if key in entry:
164
+ row[key] = entry[key]
165
+
166
+ # Add multimetric scores
167
+ for metric in MULTIMETRIC_METRICS:
168
+ if metric in entry:
169
+ row[metric] = entry[metric]
170
  else:
171
+ row[metric] = pd.NA
172
+
173
+ # Add exact match metrics
174
+ for metric in EXACT_MATCH_METRICS:
175
+ if metric in entry:
176
+ row[metric] = entry[metric]
177
+ else:
178
+ row[metric] = pd.NA
179
+
180
+ # Calculate aggregated metrics
181
+ multimetric_scores = [entry.get(metric, 0) for metric in MULTIMETRIC_METRICS if metric in entry and pd.notna(entry[metric])]
182
+ exact_match_scores = [entry.get(metric, 0) for metric in EXACT_MATCH_METRICS if metric in entry and pd.notna(entry[metric])]
183
+
184
+ if multimetric_scores:
185
+ row["multimetric_average"] = np.mean(multimetric_scores)
186
+ else:
187
+ row["multimetric_average"] = pd.NA
188
+
189
+ if exact_match_scores:
190
+ row["exact_match_average"] = np.mean(exact_match_scores)
191
+ else:
192
+ row["exact_match_average"] = pd.NA
193
+
194
+ # Calculate overall score
195
+ row["overall_score"] = calculate_overall_score(entry)
196
+
197
+ # Add language-specific metrics if available
198
+ for lang in COMMENT_LANGUAGES:
199
+ for metric in ["readability", "relevance", "overall_score"]:
200
+ lang_key = f"{lang}_{metric}"
201
+ if lang_key in entry:
202
+ row[lang_key] = entry[lang_key]
203
+ else:
204
+ row[lang_key] = pd.NA
205
+
206
+ # Add evaluation count
207
+ row["total_evaluations"] = entry.get("total_evaluations", entry.get("evaluation_count", pd.NA))
208
+
209
+ rows.append(row)
210
+
211
+ # Create DataFrame and sort by overall score
212
+ df = pd.DataFrame(rows)
213
+
214
+ # Ensure all expected columns exist
215
+ for metric in MULTIMETRIC_METRICS + EXACT_MATCH_METRICS:
216
+ if metric not in df.columns:
217
+ df[metric] = pd.NA
218
+
219
+ # Sort by overall score (descending)
220
+ if not df.empty:
221
+ df = df.sort_values(by="overall_score", ascending=False, na_position='last')
222
+
223
+ # Ensure summary columns exist
224
+ summary_cols = ["overall_score", "multimetric_average", "exact_match_average", "total_evaluations"]
225
+ for col in summary_cols:
226
+ if col not in df.columns:
227
+ df[col] = pd.NA
228
+
229
+ return df
230
+
231
+
232
+ def add_entries_to_leaderboard(leaderboard_data: Dict, new_entries: List[Dict]) -> Dict:
233
+ """
234
+ Add new entries to the leaderboard, replacing any with the same model name.
235
+ """
236
+ # Create a mapping of existing entries by model name and version
237
+ existing_entries = {
238
+ (entry["model_name"], entry.get("version", "v0")): i
239
+ for i, entry in enumerate(leaderboard_data.get("entries", []))
240
+ }
241
+
242
+ # Process each new entry
243
+ for new_entry in new_entries:
244
+ model_name = new_entry.get("model_name")
245
+ version = new_entry.get("version", "v0")
246
+
247
+ # Add calculated metrics
248
+ new_entry["overall_score"] = calculate_overall_score(new_entry)
249
+
250
+ # Calculate averages
251
+ multimetric_scores = [new_entry.get(metric) for metric in MULTIMETRIC_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
252
+ exact_match_scores = [new_entry.get(metric) for metric in EXACT_MATCH_METRICS if metric in new_entry and pd.notna(new_entry[metric])]
253
+
254
+ if multimetric_scores:
255
+ new_entry["multimetric_average"] = np.mean(multimetric_scores)
256
+ if exact_match_scores:
257
+ new_entry["exact_match_average"] = np.mean(exact_match_scores)
258
+
259
+ if (model_name, version) in existing_entries:
260
+ # Replace existing entry
261
+ leaderboard_data["entries"][existing_entries[(model_name, version)]] = new_entry
262
+ else:
263
+ # Add new entry
264
+ if "entries" not in leaderboard_data:
265
+ leaderboard_data["entries"] = []
266
+ leaderboard_data["entries"].append(new_entry)
267
+
268
+ # Update the last_updated timestamp
269
+ leaderboard_data["last_updated"] = datetime.now().isoformat()
270
+
271
+ return leaderboard_data
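For reference, a hedged local sketch (not part of this commit) of the processing pipeline defined above; the file paths are illustrative.

# Illustrative only: local round-trip through the processor functions above.
from src.leaderboard.processor import (
    process_jsonl_submission, add_entries_to_leaderboard,
    load_leaderboard_data, save_leaderboard_data, leaderboard_to_dataframe,
)

entries, msg = process_jsonl_submission("submission.jsonl")    # hypothetical local file
if entries:
    board = load_leaderboard_data("data/leaderboard_v0.json")  # returns an empty structure if missing
    board = add_entries_to_leaderboard(board, entries)
    save_leaderboard_data(board, "data/leaderboard_v0.json")
    print(leaderboard_to_dataframe(board)[["model_name", "overall_score"]].head())
else:
    print(msg)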
src/populate.py ADDED
@@ -0,0 +1,171 @@
1
+ """
2
+ Populate the CodeReview Bench leaderboard from HuggingFace datasets.
3
+ """
4
+
5
+ import json
6
+ import os
7
+ import pandas as pd
8
+ import tempfile
9
+ from typing import Dict, List, Optional
10
+ from datetime import datetime
11
+ import numpy as np
12
+
13
+ from huggingface_hub import hf_hub_download, HfApi
14
+ from datasets import load_dataset
15
+
16
+ from src.display.utils import CODEREVIEW_COLUMN, DISPLAY_COLS, CATEGORIES
17
+ from src.envs import RESULTS_DATASET_ID, TOKEN, CACHE_PATH
18
+ from src.leaderboard.processor import leaderboard_to_dataframe
19
+
20
+
21
+ def get_latest_leaderboard(version="v0") -> Optional[Dict]:
22
+ """
23
+ Get the latest leaderboard data from HuggingFace dataset.
24
+ """
25
+ try:
26
+ # Try to download the leaderboard file
27
+ leaderboard_path = hf_hub_download(
28
+ repo_id=RESULTS_DATASET_ID,
29
+ filename=f"leaderboards/leaderboard_{version}.json",
30
+ repo_type="dataset",
31
+ token=TOKEN
32
+ )
33
+
34
+ with open(leaderboard_path, 'r') as f:
35
+ return json.load(f)
36
+ except Exception as e:
37
+ print(f"Error downloading leaderboard: {e}")
38
+ return None
39
+
40
+
41
+ def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
42
+ """
43
+ Get a specific model's entry from the entries folder, uniquely identified by model_name, mode, and version.
44
+ """
45
+ try:
46
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
47
+ mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
48
+ entry_path = hf_hub_download(
49
+ repo_id=RESULTS_DATASET_ID,
50
+ filename=f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json",
51
+ repo_type="dataset",
52
+ token=TOKEN
53
+ )
54
+ with open(entry_path, 'r') as f:
55
+ return json.load(f)
56
+ except Exception as e:
57
+ print(f"Error downloading model entry: {e}")
58
+ return None
59
+
60
+
61
+ def get_all_entries(version="v0") -> List[Dict]:
62
+ """
63
+ Get all entries from the HuggingFace dataset.
64
+ """
65
+ try:
66
+ api = HfApi(token=TOKEN)
67
+ files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
68
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
69
+
70
+ all_entries = []
71
+ for entry_file in entry_files:
72
+ try:
73
+ entry_path = hf_hub_download(
74
+ repo_id=RESULTS_DATASET_ID,
75
+ filename=entry_file,
76
+ repo_type="dataset",
77
+ token=TOKEN
78
+ )
79
+ with open(entry_path, 'r') as f:
80
+ entry_data = json.load(f)
81
+ all_entries.append(entry_data)
82
+ except Exception as e:
83
+ print(f"Error loading entry {entry_file}: {e}")
84
+
85
+ return all_entries
86
+ except Exception as e:
87
+ print(f"Error getting all entries: {e}")
88
+ return []
89
+
90
+
91
+ def get_leaderboard_df(version="v0") -> pd.DataFrame:
92
+ """
93
+ Get the leaderboard data as a DataFrame.
94
+ """
95
+ # Get latest leaderboard data
96
+ leaderboard_data = get_latest_leaderboard(version)
97
+
98
+ if not leaderboard_data:
99
+ # If no leaderboard exists, try to build it from entries
100
+ entries = get_all_entries(version)
101
+ if entries:
102
+ leaderboard_data = {
103
+ "entries": entries,
104
+ "last_updated": datetime.now().isoformat(),
105
+ "version": version
106
+ }
107
+ else:
108
+ # Return empty DataFrame if no data available
109
+ return pd.DataFrame(columns=DISPLAY_COLS)
110
+
111
+ # Convert to DataFrame
112
+ return leaderboard_to_dataframe(leaderboard_data)
113
+
114
+
115
+ def get_category_leaderboard_df(category: str, version="v0") -> pd.DataFrame:
116
+ """
117
+ Get the leaderboard data filtered by a specific programming language category.
118
+ """
119
+ # Get latest leaderboard data
120
+ leaderboard_data = get_latest_leaderboard(version)
121
+
122
+ if not leaderboard_data:
123
+ # If no leaderboard exists, try to build it from entries
124
+ entries = get_all_entries(version)
125
+ if entries:
126
+ leaderboard_data = {
127
+ "entries": entries,
128
+ "last_updated": datetime.now().isoformat(),
129
+ "version": version
130
+ }
131
+ else:
132
+ # Return empty DataFrame if no data available
133
+ return pd.DataFrame(columns=DISPLAY_COLS)
134
+
135
+ # Filter entries to only include those with data for the specified programming language
136
+ filtered_entries = []
137
+ for entry in leaderboard_data.get("entries", []):
138
+ # Check if entry has data for this programming language
139
+ programming_language = entry.get("programming_language", "").lower()
140
+ if programming_language == category.lower() or category.lower() == "other":
141
+ # For "other" category, include entries that don't match any specific language
142
+ if category.lower() == "other":
143
+ if programming_language not in [cat.lower() for cat in CATEGORIES[:-1]]: # Exclude "Other" from check
144
+ filtered_entries.append(entry)
145
+ else:
146
+ filtered_entries.append(entry)
147
+
148
+ # Create a new leaderboard data structure with the filtered entries
149
+ filtered_leaderboard = {
150
+ "entries": filtered_entries,
151
+ "last_updated": leaderboard_data.get("last_updated", datetime.now().isoformat()),
152
+ "version": version
153
+ }
154
+
155
+ # Convert to DataFrame
156
+ return leaderboard_to_dataframe(filtered_leaderboard)
157
+
158
+
159
+ def get_detailed_model_data(model_name: str, mode: str, version="v0") -> Dict:
160
+ """
161
+ Get detailed data for a specific model and mode.
162
+ """
163
+ entry = get_model_entry(model_name, mode, version)
164
+ if entry:
165
+ return entry
166
+ leaderboard_data = get_latest_leaderboard(version)
167
+ if leaderboard_data:
168
+ for entry in leaderboard_data.get("entries", []):
169
+ if entry.get("model_name") == model_name and str(entry.get("mode")).lower() == str(mode).lower():
170
+ return entry
171
+ return {}
src/submission/submit.py CHANGED
@@ -1,386 +1,184 @@
1
  """
2
- Submission system for CodeReview Leaderboard
3
  """
4
 
5
- import gradio as gr
6
- import re
7
- from typing import Dict, Any, List, Tuple
8
- from datetime import datetime, timezone
9
- from src.envs import PROGRAMMING_LANGUAGES, COMMENT_LANGUAGES, TAXONOMY_CATEGORIES, QUALITY_METRICS
10
- from src.leaderboard.processor import LeaderboardProcessor
11
- from src.display.utils import get_main_leaderboard_data, get_quality_metrics_data
12
 
13
- class SubmissionHandler:
14
- """Handles model submissions with validation and rate limiting"""
15
-
16
- def __init__(self):
17
- self.processor = LeaderboardProcessor()
18
-
19
- def get_client_ip(self, request: gr.Request) -> str:
20
- """Extract client IP address from request"""
21
- try:
22
- # Check for forwarded headers first
23
- forwarded_for = request.headers.get('X-Forwarded-For')
24
- if forwarded_for:
25
- # Take the first IP if multiple
26
- ip = forwarded_for.split(',')[0].strip()
27
- return ip
28
-
29
- # Check for real IP header
30
- real_ip = request.headers.get('X-Real-IP')
31
- if real_ip:
32
- return real_ip.strip()
33
-
34
- # Fall back to client host
35
- if hasattr(request, 'client') and hasattr(request.client, 'host'):
36
- return request.client.host
37
-
38
- # Default fallback
39
- return "127.0.0.1"
40
-
41
- except Exception as e:
42
- print(f"Error getting client IP: {e}")
43
- return "127.0.0.1"
44
-
45
- def validate_model_name(self, model_name: str) -> Tuple[bool, str]:
46
- """Validate model name format"""
47
- if not model_name or not model_name.strip():
48
- return False, "Model name cannot be empty"
49
-
50
- model_name = model_name.strip()
51
-
52
- # Check length
53
- if len(model_name) > 100:
54
- return False, "Model name too long (max 100 characters)"
55
-
56
- # Check for valid characters
57
- if not re.match(r'^[a-zA-Z0-9._/-]+$', model_name):
58
- return False, "Model name contains invalid characters (only letters, numbers, dots, hyphens, underscores, and slashes allowed)"
59
-
60
- # Check for organization/model format
61
- if "/" in model_name:
62
- parts = model_name.split("/")
63
- if len(parts) != 2:
64
- return False, "Model name should be in format 'organization/model'"
65
- if not parts[0] or not parts[1]:
66
- return False, "Both organization and model name must be specified"
67
-
68
- return True, "Valid model name"
69
-
70
- def validate_scores(self, scores: Dict[str, float]) -> Tuple[bool, str]:
71
- """Validate score values"""
72
- required_scores = ["bleu", "llm_pass_1", "llm_pass_5", "llm_pass_10"]
73
-
74
- for score_name in required_scores:
75
- value = scores.get(score_name)
76
-
77
- if value is None:
78
- return False, f"Missing score: {score_name}"
79
-
80
- if not isinstance(value, (int, float)):
81
- return False, f"Invalid score format for {score_name}: must be a number"
82
-
83
- if not (0 <= value <= 1):
84
- return False, f"Score {score_name} out of range: {value} (must be between 0 and 1)"
85
-
86
- # Check logical consistency
87
- if scores["llm_pass_1"] > scores["llm_pass_5"]:
88
- return False, "Pass@1 score cannot be higher than Pass@5"
89
-
90
- if scores["llm_pass_5"] > scores["llm_pass_10"]:
91
- return False, "Pass@5 score cannot be higher than Pass@10"
92
-
93
- return True, "Valid scores"
94
-
95
- def validate_metrics(self, metrics: Dict[str, int]) -> Tuple[bool, str]:
96
- """Validate quality metrics"""
97
- for metric_name in QUALITY_METRICS:
98
- value = metrics.get(metric_name)
99
-
100
- if value is None:
101
- return False, f"Missing metric: {metric_name}"
102
-
103
- if not isinstance(value, (int, float)):
104
- return False, f"Invalid metric format for {metric_name}: must be a number"
105
-
106
- if not (0 <= value <= 10):
107
- return False, f"Metric {metric_name} out of range: {value} (must be between 0 and 10)"
108
-
109
- return True, "Valid metrics"
110
-
111
- def submit_model(
112
- self,
113
- request: gr.Request,
114
- current_data: List[Dict],
115
- model_name: str,
116
- programming_language: str,
117
- comment_language: str,
118
- taxonomy_category: str,
119
- bleu: float,
120
- llm_pass_1: float,
121
- llm_pass_5: float,
122
- llm_pass_10: float,
123
- readability: int,
124
- relevance: int,
125
- explanation_clarity: int,
126
- problem_identification: int,
127
- actionability: int,
128
- completeness: int,
129
- specificity: int,
130
- contextual_adequacy: int,
131
- consistency: int,
132
- brevity: int,
133
- ) -> Tuple[List[Dict], List[List[str]], List[List[str]], str]:
134
- """Handle model submission with full validation"""
135
-
136
- try:
137
- # Get client IP
138
- client_ip = self.get_client_ip(request)
139
-
140
- # Check rate limiting
141
- rate_ok, rate_msg = self.processor.check_rate_limit(client_ip)
142
- if not rate_ok:
143
- return current_data, [], [], f"❌ {rate_msg}"
144
-
145
- # Validate model name
146
- name_valid, name_msg = self.validate_model_name(model_name)
147
- if not name_valid:
148
- return current_data, [], [], f"❌ {name_msg}"
149
-
150
- # Validate scores
151
- scores = {
152
- "bleu": bleu,
153
- "llm_pass_1": llm_pass_1,
154
- "llm_pass_5": llm_pass_5,
155
- "llm_pass_10": llm_pass_10
156
- }
157
- scores_valid, scores_msg = self.validate_scores(scores)
158
- if not scores_valid:
159
- return current_data, [], [], f"❌ {scores_msg}"
160
-
161
- # Validate metrics
162
- metrics = {
163
- "readability": readability,
164
- "relevance": relevance,
165
- "explanation_clarity": explanation_clarity,
166
- "problem_identification": problem_identification,
167
- "actionability": actionability,
168
- "completeness": completeness,
169
- "specificity": specificity,
170
- "contextual_adequacy": contextual_adequacy,
171
- "consistency": consistency,
172
- "brevity": brevity,
173
- }
174
- metrics_valid, metrics_msg = self.validate_metrics(metrics)
175
- if not metrics_valid:
176
- return current_data, [], [], f"❌ {metrics_msg}"
177
-
178
- # Create submission data
179
- submission_data = {
180
- "model_name": model_name.strip(),
181
- "programming_language": programming_language,
182
- "comment_language": comment_language,
183
- "taxonomy_category": taxonomy_category,
184
- "bleu": bleu,
185
- "llm_pass_1": llm_pass_1,
186
- "llm_pass_5": llm_pass_5,
187
- "llm_pass_10": llm_pass_10,
188
- "metrics": metrics
189
- }
190
-
191
- # Submit to processor
192
- success, message = self.processor.add_submission(submission_data, client_ip)
193
-
194
- if success:
195
- # Load updated data
196
- updated_data = self.processor.load_leaderboard_data()
197
-
198
- # Format tables
199
- main_table = get_main_leaderboard_data(updated_data)
200
- quality_table = get_quality_metrics_data(updated_data)
201
-
202
- return updated_data, main_table, quality_table, message
203
- else:
204
- return current_data, [], [], message
205
-
206
- except Exception as e:
207
- print(f"Error in submission: {e}")
208
- return current_data, [], [], f"❌ Submission failed: {str(e)}"
209
-
210
- def get_submission_form_components(self):
211
- """Create gradio components for submission form"""
212
 
213
- with gr.Accordion("📝 Submit New Model Results", open=False):
214
- gr.Markdown("""
215
- ### Submission Guidelines
216
- - Provide accurate scores based on proper evaluation
217
- - Model name should follow 'organization/model' format
218
- - All metrics are required
219
- - Submissions are rate-limited per IP address
220
- """)
221
-
222
- with gr.Row():
223
- model_name = gr.Textbox(
224
- label="Model Name",
225
- placeholder="e.g., microsoft/CodeT5-base",
226
- info="Use organization/model format"
227
- )
228
- programming_language = gr.Dropdown(
229
- choices=PROGRAMMING_LANGUAGES,
230
- value="All",
231
- label="Programming Language",
232
- info="Primary programming language evaluated"
233
- )
234
- comment_language = gr.Dropdown(
235
- choices=COMMENT_LANGUAGES,
236
- value="English",
237
- label="Comment Language",
238
- info="Natural language of code comments"
239
- )
240
- taxonomy_category = gr.Dropdown(
241
- choices=TAXONOMY_CATEGORIES,
242
- value="All",
243
- label="Taxonomy Category",
244
- info="Primary review category focus"
245
- )
246
-
247
- gr.Markdown("### 📊 Performance Scores (0.0 - 1.0)")
248
- with gr.Row():
249
- bleu = gr.Number(
250
- label="BLEU Score",
251
- value=0.0,
252
- minimum=0.0,
253
- maximum=1.0,
254
- step=0.001,
255
- info="BLEU similarity score"
256
- )
257
- pass1 = gr.Number(
258
- label="Pass@1",
259
- value=0.0,
260
- minimum=0.0,
261
- maximum=1.0,
262
- step=0.001,
263
- info="Success rate in 1 attempt"
264
- )
265
- pass5 = gr.Number(
266
- label="Pass@5",
267
- value=0.0,
268
- minimum=0.0,
269
- maximum=1.0,
270
- step=0.001,
271
- info="Success rate in 5 attempts"
272
- )
273
- pass10 = gr.Number(
274
- label="Pass@10",
275
- value=0.0,
276
- minimum=0.0,
277
- maximum=1.0,
278
- step=0.001,
279
- info="Success rate in 10 attempts"
280
- )
281
-
282
- gr.Markdown("### 📋 Quality Metrics (0 - 10)")
283
- with gr.Row():
284
- readability = gr.Slider(
285
- minimum=0, maximum=10, value=5, step=1,
286
- label="Readability",
287
- info="How readable are the generated reviews?"
288
- )
289
- relevance = gr.Slider(
290
- minimum=0, maximum=10, value=5, step=1,
291
- label="Relevance",
292
- info="How relevant to the code changes?"
293
- )
294
- explanation_clarity = gr.Slider(
295
- minimum=0, maximum=10, value=5, step=1,
296
- label="Explanation Clarity",
297
- info="How clear are the explanations?"
298
- )
299
- problem_identification = gr.Slider(
300
- minimum=0, maximum=10, value=5, step=1,
301
- label="Problem Identification",
302
- info="How well does it identify issues?"
303
- )
304
- actionability = gr.Slider(
305
- minimum=0, maximum=10, value=5, step=1,
306
- label="Actionability",
307
- info="How actionable are the suggestions?"
308
- )
309
-
310
- with gr.Row():
311
- completeness = gr.Slider(
312
- minimum=0, maximum=10, value=5, step=1,
313
- label="Completeness",
314
- info="How complete are the reviews?"
315
- )
316
- specificity = gr.Slider(
317
- minimum=0, maximum=10, value=5, step=1,
318
- label="Specificity",
319
- info="How specific are the comments?"
320
- )
321
- contextual_adequacy = gr.Slider(
322
- minimum=0, maximum=10, value=5, step=1,
323
- label="Contextual Adequacy",
324
- info="How well does it understand context?"
325
- )
326
- consistency = gr.Slider(
327
- minimum=0, maximum=10, value=5, step=1,
328
- label="Consistency",
329
- info="How consistent across reviews?"
330
- )
331
- brevity = gr.Slider(
332
- minimum=0, maximum=10, value=5, step=1,
333
- label="Brevity",
334
- info="How concise are the reviews?"
335
- )
336
-
337
- submit_btn = gr.Button("🚀 Submit Model", variant="primary")
338
- status_msg = gr.Markdown("")
339
-
340
- # Return all components for use in the main app
341
- return {
342
- "model_name": model_name,
343
- "programming_language": programming_language,
344
- "comment_language": comment_language,
345
- "taxonomy_category": taxonomy_category,
346
- "bleu": bleu,
347
- "pass1": pass1,
348
- "pass5": pass5,
349
- "pass10": pass10,
350
- "readability": readability,
351
- "relevance": relevance,
352
- "explanation_clarity": explanation_clarity,
353
- "problem_identification": problem_identification,
354
- "actionability": actionability,
355
- "completeness": completeness,
356
- "specificity": specificity,
357
- "contextual_adequacy": contextual_adequacy,
358
- "consistency": consistency,
359
- "brevity": brevity,
360
- "submit_btn": submit_btn,
361
- "status_msg": status_msg,
362
- }
363
-
364
- def get_submission_history(self, ip_address: str) -> List[List[str]]:
365
- """Get submission history for display"""
366
  try:
367
- submissions = self.processor.get_ip_submissions(ip_address)
368
-
369
- table_data = []
370
- for sub in submissions:
371
- row = [
372
- sub.get("model_name", ""),
373
- sub.get("programming_language", ""),
374
- sub.get("comment_language", ""),
375
- sub.get("taxonomy_category", ""),
376
- f"{sub.get('scores', {}).get('llm_pass_1', 0):.3f}",
377
- sub.get("submission_date", "").split("T")[0] if sub.get("submission_date") else "",
378
- sub.get("status", "")
379
- ]
380
- table_data.append(row)
381
-
382
- return table_data
383
-
384
- except Exception as e:
385
- print(f"Error getting submission history: {e}")
386
- return []
 
1
  """
2
+ Handle submissions to the CodeReview Bench leaderboard.
3
  """
4
 
5
+ import json
6
+ import os
7
+ import tempfile
8
+ from datetime import datetime
9
+ from typing import Dict, List, Tuple
 
 
10
 
11
+ from huggingface_hub import HfApi
12
+ from datasets import load_dataset
13
+
14
+ from src.display.formatting import styled_error, styled_message
15
+ from src.envs import RESULTS_DATASET_ID, TOKEN, REPO_ID
16
+ from src.leaderboard.processor import process_jsonl_submission, add_entries_to_leaderboard
17
+
18
+
19
+ def validate_submission(file_path: str) -> Tuple[bool, str]:
20
+ """
21
+ Validate a submission file.
22
+ """
23
+ try:
24
+ entries, message = process_jsonl_submission(file_path)
25
+ if not entries:
26
+ return False, message
27
+ return True, "Submission is valid"
28
+ except Exception as e:
29
+ return False, f"Error validating submission: {e}"
30
+
31
+
32
+ def submit_entry_to_hub(entry: Dict, model_name: str, mode: str, version="v0") -> Tuple[bool, str]:
33
+ """
34
+ Submit a model's evaluation entry to the HuggingFace dataset. The entry is uniquely identified by model_name, mode, and version.
35
+ """
36
+ try:
37
+ # Create safe model name for file path
38
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
39
+ mode_safe = str(mode).replace("/", "_").replace(" ", "_").lower()
40
+
41
+ # Create entry path in entries folder
42
+ entry_path = f"entries/entry_{model_name_safe}_{mode_safe}_{version}.json"
43
+
44
+ # Save entry to temporary file
45
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
46
+ json.dump(entry, temp_file, indent=2)
47
+ temp_path = temp_file.name
48
+
49
+ # Upload file
50
+ api = HfApi(token=TOKEN)
51
+ api.upload_file(
52
+ path_or_fileobj=temp_path,
53
+ path_in_repo=entry_path,
54
+ repo_id=RESULTS_DATASET_ID,
55
+ repo_type="dataset",
56
+ commit_message=f"Add evaluation entry for {model_name} (mode {mode}, version {version})"
57
+ )
58
+
59
+ os.unlink(temp_path)
60
+ return True, f"Successfully uploaded evaluation entry for {model_name} (mode {mode})"
61
+ except Exception as e:
62
+ return False, f"Error submitting entry to dataset: {e}"
63
+
64
+
65
+ def submit_leaderboard_to_hub(entries: List[Dict], version="v0") -> Tuple[bool, str]:
66
+ """
67
+ Submit updated leaderboard to the HuggingFace dataset.
68
+ """
69
+ try:
70
+ # Create leaderboard data
71
+ leaderboard_data = {
72
+ "entries": entries,
73
+ "last_updated": datetime.now().isoformat(),
74
+ "version": version
75
+ }
76
+
77
+ # Save to temporary file
78
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as temp_file:
79
+ json.dump(leaderboard_data, temp_file, indent=2)
80
+ temp_path = temp_file.name
81
+
82
+ # Upload file
83
+ api = HfApi(token=TOKEN)
84
+ api.upload_file(
85
+ path_or_fileobj=temp_path,
86
+ path_in_repo=f"leaderboards/leaderboard_{version}.json",
87
+ repo_id=RESULTS_DATASET_ID,
88
+ repo_type="dataset",
89
+ commit_message=f"Update leaderboard for version {version}"
90
+ )
91
+
92
+ os.unlink(temp_path)
93
+ return True, "Leaderboard updated successfully"
94
+ except Exception as e:
95
+ return False, f"Error updating leaderboard: {e}"
96
+
97
+
98
+ def process_submission(file_path: str, metadata: Dict, version="v0") -> str:
99
+ """
100
+ Process a submission to the CodeReview Bench leaderboard.
101
+ """
102
+ try:
103
+ # Validate submission
104
+ is_valid, validation_message = validate_submission(file_path)
105
+ if not is_valid:
106
+ return styled_error(validation_message)
107
+
108
+ # Process the submission entries
109
+ entries, message = process_jsonl_submission(file_path)
110
+ if not entries:
111
+ return styled_error(f"Failed to process submission: {message}")
112
+
113
+ # Upload raw submission file
114
+ model_name = metadata.get("model_name", "unknown")
115
+ model_name_safe = model_name.replace("/", "_").replace(" ", "_")
116
 
117
+ api = HfApi(token=TOKEN)
118
+ submission_path = f"submissions_{version}/{model_name_safe}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl"
119
+ api.upload_file(
120
+ path_or_fileobj=file_path,
121
+ path_in_repo=submission_path,
122
+ repo_id=RESULTS_DATASET_ID,
123
+ repo_type="dataset",
124
+ commit_message=f"Add raw submission for {model_name}"
125
+ )
126
+
127
+ # Process entries and add metadata
128
+ processed_entries = []
129
+ for entry in entries:
130
+ # Add metadata to entry
131
+ entry.update({
132
+ "model_name": metadata.get("model_name"),
133
+ "model_type": metadata.get("model_type"),
134
+ "review_model_type": str(metadata.get("review_model_type", "custom")).lower(),
135
+ "mode": metadata.get("mode"),
136
+ "base_model": metadata.get("base_model"),
137
+ "revision": metadata.get("revision"),
138
+ "precision": metadata.get("precision"),
139
+ "weight_type": metadata.get("weight_type"),
140
+ "version": version,
141
+ "submission_date": datetime.now().isoformat()
142
+ })
143
+ processed_entries.append(entry)
144
+
145
+ # Submit entries to entries folder
146
+ for entry in processed_entries:
147
+ success, message = submit_entry_to_hub(entry, model_name, metadata.get("mode"), version)
148
+ if not success:
149
+ return styled_error(message)
150
+
151
+ # Get all entries from HF dataset and update leaderboard
152
+ files = api.list_repo_files(repo_id=RESULTS_DATASET_ID, repo_type="dataset")
153
+ entry_files = [f for f in files if f.startswith("entries/") and f.endswith(f"_{version}.json")]
154
+
155
+ all_entries = []
156
+ for entry_file in entry_files:
157
+ try:
158
+ entry_path = api.hf_hub_download(
159
+ repo_id=RESULTS_DATASET_ID,
160
+ filename=entry_file,
161
+ repo_type="dataset",
162
+ )
163
+ with open(entry_path, 'r') as f:
164
+ entry_data = json.load(f)
165
+ all_entries.append(entry_data)
166
+ except Exception as e:
167
+ print(f"Error loading entry {entry_file}: {e}")
168
+
169
+ # Update leaderboard with all entries
170
+ success, message = submit_leaderboard_to_hub(all_entries, version)
171
+ if not success:
172
+ return styled_error(message)
173
+
174
+ return styled_message("Submission successful! Model evaluated and leaderboard updated.")
175
+
176
+ except Exception as e:
177
+ return styled_error(f"Error processing submission: {e}")
178
+ finally:
179
+ # Clean up temporary files if they exist
180
  try:
181
+ if os.path.exists(file_path):
182
+ os.remove(file_path)
183
+ except Exception:
184
+ pass
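For reference, a hedged sketch (not part of this commit) of the submission flow above; the JSONL fields shown are the ones process_jsonl_submission checks for, all values are illustrative, and actually uploading requires valid HF credentials.

# Illustrative only: building a minimal JSONL submission and passing it through process_submission.
import json
from src.submission.submit import process_submission

sample = {
    "model_name": "org/model",             # required
    "programming_language": "Python",      # required
    "comment_language": "en",              # required
    "readability": 7, "relevance": 8,      # at least one multimetric or exact-match metric is required
    "pass_at_1": 0.42, "bleu_at_10": 0.31,
}
with open("submission.jsonl", "w") as f:
    f.write(json.dumps(sample) + "\n")

metadata = {"model_name": "org/model", "model_type": "OpenSource",
            "review_model_type": "custom", "mode": "Strict"}
print(process_submission("submission.jsonl", metadata, version="v0"))  # uploads entries and refreshes the leaderboard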