kenkaneki committed · Commit bf8f34b · 1 parent: 94789e6

Files changed (5):
  1. app.py +66 -205
  2. example_submission.jsonl +4 -4
  3. leaderboard_data.json +28 -19
  4. src/display/utils.py +12 -15
  5. src/populate.py +22 -5
app.py CHANGED

@@ -33,6 +33,7 @@ from src.display.utils import (
     CATEGORIES,
     COMMENT_LANGUAGES,
     EXAMPLE_CATEGORIES,
+    TOPICS,
     ModelType,
     Mode,
     Precision,
@@ -350,10 +351,10 @@ def init_leaderboard(dataframe, visible_columns=None):


 def search_filter_leaderboard(
-    df, search_query="", model_types=None, version=CURRENT_VERSION
+    df, search_query="", comment_languages=None, version=CURRENT_VERSION
 ):
     """
-    Filter the leaderboard based on search query and model types.
+    Filter the leaderboard based on search query and comment languages.
     """
     if df is None or df.empty:
         return df
@@ -367,11 +368,14 @@ def search_filter_leaderboard(
         axis=1,
     )

-    # Apply model type filter
-    if model_types and len(model_types) > 0:
-        filtered_df = filtered_df[
-            filtered_df[GUARDBENCH_COLUMN.model_type.name].isin(model_types)
-        ]
+    # Apply comment language filter (assuming there's a comment_language column in the data)
+    if comment_languages and len(comment_languages) > 0:
+        # Look for a comment language column in the dataframe
+        comment_lang_cols = [col for col in filtered_df.columns if 'comment_language' in col.lower()]
+        if comment_lang_cols:
+            filtered_df = filtered_df[
+                filtered_df[comment_lang_cols[0]].isin(comment_languages)
+            ]

     # Apply search query
     if search_query:
@@ -398,7 +402,7 @@ def search_filter_leaderboard(


 def refresh_data_with_filters(
-    version=CURRENT_VERSION, search_query="", model_types=None, selected_columns=None
+    version=CURRENT_VERSION, search_query="", comment_languages=None, selected_columns=None
 ):
     """
     Refresh the leaderboard data and update all components with filtering.
@@ -429,10 +433,10 @@ def refresh_data_with_filters(

     # Apply filters to each dataframe
     filtered_main_df = search_filter_leaderboard(
-        main_df, search_query, model_types, version
+        main_df, search_query, comment_languages, version
     )
     filtered_category_dfs = [
-        search_filter_leaderboard(df, search_query, model_types, version)
+        search_filter_leaderboard(df, search_query, comment_languages, version)
         for df in category_dfs
     ]

@@ -502,6 +506,8 @@ def submit_results(
     submission_file: tempfile._TemporaryFileWrapper,
     version: str,
     review_model_type: ReviewModelType,
+    programming_language: str,
+    comment_language: str,
 ):
     """
     Handle submission of results with model metadata.
@@ -532,6 +538,8 @@ def submit_results(
         "mode": mode,
         "version": version,
         "review_model_type": review_model_type,
+        "programming_language": programming_language,
+        "comment_language": comment_language,
     }

     # Process the submission
@@ -691,22 +699,9 @@ demo = gr.Blocks(css=custom_css, theme=custom_theme)

 CATEGORY_DISPLAY_MAP = {
     "Python": "Python",
-    "JavaScript": "JavaScript",
     "Java": "Java",
-    "C++": "C++",
-    "C#": "C#",
-    "TypeScript": "TypeScript",
-    "Go": "Go",
-    "Rust": "Rust",
-    "Swift": "Swift",
-    "Kotlin": "Kotlin",
-    "Ruby": "Ruby",
-    "PHP": "PHP",
-    "C": "C",
     "Scala": "Scala",
-    "R": "R",
-    "Dart": "Dart",
-    "Other": "Other"
+    "Go": "Go"
 }
 # Create reverse mapping for lookups
 CATEGORY_REVERSE_MAP = {v: k for k, v in CATEGORY_DISPLAY_MAP.items()}
@@ -739,16 +734,31 @@ with demo:
                 elem_id="search-bar",
                 scale=2,
             )
-            model_type_filter = gr.Dropdown(
-                choices=[
-                    t.to_str("-") for t in ModelType if t != ModelType.Unknown and t != ModelType.ClosedSource
-                ],
-                label="Access Type",
+            comment_language_filter = gr.Dropdown(
+                choices=["en", "ru"],
+                label="Comment Language",
+                multiselect=True,
+                value=[],
+                interactive=True,
+                scale=1,
+            )
+            programming_language_filter = gr.Dropdown(
+                choices=["Python", "Java", "Scala", "Go"],
+                label="Programming Language",
                 multiselect=True,
                 value=[],
                 interactive=True,
                 scale=1,
             )
+        with gr.Row():
+            topic_filter = gr.Dropdown(
+                choices=TOPICS,
+                label="Topic",
+                multiselect=True,
+                value=[],
+                interactive=True,
+                scale=2,
+            )
             column_selector = gr.Dropdown(
                 choices=get_all_column_choices(),
                 label="Columns",
@@ -783,19 +793,19 @@ with demo:
         def update_with_search_filters(
             version=CURRENT_VERSION,
             search_query="",
-            model_types=None,
+            comment_languages=None,
             selected_columns=None,
         ):
             """
             Update the leaderboards with search and filter settings.
             """
             return refresh_data_with_filters(
-                version, search_query, model_types, selected_columns
+                version, search_query, comment_languages, selected_columns
             )

         # Refresh button functionality
         def refresh_and_update(
-            version, search_query, model_types, selected_columns
+            version, search_query, comment_languages, selected_columns
         ):
             """
             Refresh data, update LEADERBOARD_DF, and return updated components.
@@ -804,7 +814,7 @@ with demo:
             main_df = get_leaderboard_df(version=version)
             LEADERBOARD_DF = main_df  # Update the global DataFrame
             return refresh_data_with_filters(
-                version, search_query, model_types, selected_columns
+                version, search_query, comment_languages, selected_columns
             )

         refresh_button.click(
@@ -812,7 +822,7 @@ with demo:
             inputs=[
                 version_selector,
                 search_input,
-                model_type_filter,
+                comment_language_filter,
                 column_selector,
             ],
             outputs=[leaderboard]
@@ -827,7 +837,7 @@ with demo:
             inputs=[
                 version_selector,
                 search_input,
-                model_type_filter,
+                comment_language_filter,
                 column_selector,
             ],
             outputs=[leaderboard]
@@ -837,13 +847,13 @@ with demo:
             ],
         )

-        # Model type filter functionality
-        model_type_filter.change(
+        # Comment language filter functionality
+        comment_language_filter.change(
             fn=refresh_data_with_filters,
             inputs=[
                 version_selector,
                 search_input,
-                model_type_filter,
+                comment_language_filter,
                 column_selector,
             ],
             outputs=[leaderboard]
@@ -859,7 +869,7 @@ with demo:
             inputs=[
                 version_selector,
                 search_input,
-                model_type_filter,
+                comment_language_filter,
                 column_selector,
             ],
             outputs=[leaderboard]
@@ -963,175 +973,10 @@ with demo:
             ],
         )

-        with gr.TabItem("Visualize", elem_id="codereview-viz-tab", id=1):
-            with gr.Row():
-                with gr.Column():
-                    viz_version_selector = gr.Dropdown(
-                        choices=BENCHMARK_VERSIONS,
-                        label="Benchmark Version",
-                        value=CURRENT_VERSION,
-                        interactive=True,
-                        visible=False,
-                    )
-
-                    # New: Mode selector
-                    def get_model_mode_choices(version):
-                        df = get_leaderboard_df(version=version)
-                        if df.empty:
-                            return []
-                        return sorted([
-                            f"{str(row['model_name']).lower()} [{row['mode']}]"
-                            for _, row in df.drop_duplicates(subset=["model_name", "mode"]).iterrows()
-                        ])
-
-                    model_mode_selector = gr.Dropdown(
-                        choices=get_model_mode_choices(CURRENT_VERSION),
-                        label="Select Model(s) [Mode] to Compare",
-                        multiselect=True,
-                        interactive=True,
-                    )
-                with gr.Column():
-                    # Add Overall Performance to categories, use display names
-                    viz_categories_display = ["All Results"] + [
-                        CATEGORY_DISPLAY_MAP.get(cat, cat) for cat in CATEGORIES
-                    ]
-                    category_selector = gr.Dropdown(
-                        choices=viz_categories_display,
-                        label="Select Category",
-                        value=viz_categories_display[0],
-                        interactive=True,
-                    )
-                    metric_selector = gr.Dropdown(
-                        choices=[
-                            "accuracy",
-                            "f1_binary",
-                            "precision_binary",
-                            "recall_binary",
-                            "error_ratio",
-                        ],
-                        label="Select Metric",
-                        value="accuracy",
-                        interactive=True,
-                    )
-
-            plot_output = gr.Plot()
-
-            # Update visualization when any selector changes
-            def update_visualization_with_mode(
-                selected_model_modes, selected_category, selected_metric, version
-            ):
-                if not selected_model_modes:
-                    return go.Figure()
-                df = (
-                    get_leaderboard_df(version=version)
-                    if selected_category == "All Results"
-                    else get_category_leaderboard_df(selected_category, version=version)
-                )
-                if df.empty:
-                    return go.Figure()
-                df = df.copy()
-                df["model_name"] = df["model_name"].str.lower()
-                selected_pairs = [s.rsplit(" [", 1) for s in selected_model_modes]
-                selected_pairs = [
-                    (name.strip().lower(), mode.strip("] "))
-                    for name, mode in selected_pairs
-                ]
-                mask = df.apply(
-                    lambda row: (row["model_name"], str(row["mode"])) in selected_pairs,
-                    axis=1,
-                )
-                filtered_df = df[mask]
-                metric_cols = [col for col in filtered_df.columns if selected_metric in col]
-                fig = go.Figure()
-                colors = ["#8FCCCC", "#C2A4B6", "#98B4A6", "#B68F7C"]
-                for idx, (model_name, mode) in enumerate(selected_pairs):
-                    model_data = filtered_df[
-                        (filtered_df["model_name"] == model_name)
-                        & (filtered_df["mode"] == mode)
-                    ]
-                    if not model_data.empty:
-                        values = model_data[metric_cols].values[0].tolist()
-                        values = values + [values[0]]
-                        categories = [col.replace(f"_{selected_metric}", "") for col in metric_cols]
-                        # Replace 'jailbreaked' with 'jailbroken' in categories
-                        categories = [cat.replace('jailbreaked', 'jailbroken') for cat in categories]
-                        categories = categories + [categories[0]]
-                        fig.add_trace(
-                            go.Scatterpolar(
-                                r=values,
-                                theta=categories,
-                                name=f"{model_name} [{mode}]",
-                                line_color=colors[idx % len(colors)],
-                                fill="toself",
-                            )
-                        )
-                fig.update_layout(
-                    paper_bgcolor="#000000",
-                    plot_bgcolor="#000000",
-                    font={"color": "#ffffff"},
-                    title={
-                        "text": f"{selected_category} - {selected_metric.upper()} Score Comparison",
-                        "font": {"color": "#ffffff", "size": 24},
-                    },
-                    polar=dict(
-                        bgcolor="#000000",
-                        radialaxis=dict(
-                            visible=True,
-                            range=[0, 1],
-                            gridcolor="#333333",
-                            linecolor="#333333",
-                            tickfont={"color": "#ffffff"},
-                        ),
-                        angularaxis=dict(
-                            gridcolor="#333333",
-                            linecolor="#333333",
-                            tickfont={"color": "#ffffff"},
-                        ),
-                    ),
-                    height=600,
-                    showlegend=True,
-                    legend=dict(
-                        yanchor="top",
-                        y=0.99,
-                        xanchor="right",
-                        x=0.99,
-                        bgcolor="rgba(0,0,0,0.5)",
-                        font={"color": "#ffffff"},
-                    ),
-                )
-                return fig
-
-            # Connect selectors to update function
-            for control in [
-                viz_version_selector,
-                model_mode_selector,
-                category_selector,
-                metric_selector,
-            ]:
-                control.change(
-                    fn=lambda smm, sc, s_metric, v: update_visualization_with_mode(
-                        smm, CATEGORY_REVERSE_MAP.get(sc, sc), s_metric, v
-                    ),
-                    inputs=[
-                        model_mode_selector,
-                        category_selector,
-                        metric_selector,
-                        viz_version_selector,
-                    ],
-                    outputs=plot_output,
-                )
-
-            # Update model_mode_selector choices when version changes
-            viz_version_selector.change(
-                fn=get_model_mode_choices,
-                inputs=[viz_version_selector],
-                outputs=[model_mode_selector],
-            )
-
         # with gr.TabItem("About", elem_id="codereview-about-tab", id=2):
         #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=3):
+        with gr.TabItem("Submit", elem_id="codereview-submit-tab", id=1):
             gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

             with gr.Row():
@@ -1179,6 +1024,20 @@ with demo:
                 value=ReviewModelType.CUSTOM.name,
                 interactive=True,
             )
+            programming_language_selector = gr.Dropdown(
+                choices=["Python", "Java", "Scala", "Go"],
+                label="Programming Language",
+                multiselect=False,
+                value=None,
+                interactive=True,
+            )
+            comment_language_selector = gr.Dropdown(
+                choices=["en", "ru"],
+                label="Comment Language",
+                multiselect=False,
+                value="en",
+                interactive=True,
+            )

             with gr.Column():
                 precision = gr.Dropdown(
@@ -1222,6 +1081,8 @@ with demo:
                 file_input,
                 submission_version_selector,
                 review_model_type,
+                programming_language_selector,
+                comment_language_selector,
             ],
             outputs=result_output,
         )
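
Reviewer note: the replacement filter in `search_filter_leaderboard` discovers its column by substring match rather than a fixed name. A minimal standalone sketch of that behavior, using made-up data rather than the real leaderboard schema:

```python
import pandas as pd

def filter_by_comment_language(df: pd.DataFrame, comment_languages):
    """Mirrors the committed logic: find the first column whose name
    contains 'comment_language' and keep only rows with selected values."""
    if not comment_languages:
        return df
    lang_cols = [c for c in df.columns if "comment_language" in c.lower()]
    if not lang_cols:
        return df  # no matching column: the filter silently becomes a no-op
    return df[df[lang_cols[0]].isin(comment_languages)]

# Illustrative rows only; not the real leaderboard columns.
df = pd.DataFrame({
    "model_name": ["m1", "m2", "m3"],
    "comment_language": ["en", "ru", "en"],
})
print(filter_by_comment_language(df, ["ru"]))  # keeps only m2
```

One consequence of the substring match: if several columns mention comment_language, only the first is consulted.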
example_submission.jsonl CHANGED

@@ -1,4 +1,4 @@
-{"model_name": "GPT-4-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
-{"model_name": "GPT-4-CodeReview", "programming_language": "javascript", "comment_language": "en", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
-{"model_name": "Claude-3-CodeReview", "programming_language": "python", "comment_language": "en", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 8.0, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
-{"model_name": "Llama-CodeReview", "programming_language": "java", "comment_language": "en", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
+{"model_name": "GPT-4-CodeReview", "programming_language": "Python", "comment_language": "en", "topic": "Code Reliability", "observation_id": "obs_001", "code_snippet": "def calculate_sum(a, b):\n    return a + b", "review_text": "This function is simple and correct, but consider adding type hints and docstring for better documentation.", "readability": 8.5, "relevance": 9.0, "explanation_clarity": 7.8, "problem_identification": 8.2, "actionability": 8.7, "completeness": 8.0, "specificity": 7.5, "contextual_adequacy": 8.3, "consistency": 8.8, "brevity": 7.2, "pass_at_1": 0.75, "pass_at_5": 0.88, "pass_at_10": 0.92, "bleu_at_10": 0.65, "total_evaluations": 100}
+{"model_name": "GPT-4-CodeReview", "programming_language": "Java", "comment_language": "en", "topic": "Coding Standards", "observation_id": "obs_002", "code_snippet": "public class Calculator {\n    public int add(int a, int b) {\n        return a + b;\n    }\n}", "review_text": "Consider following Java naming conventions and adding JavaDoc comments. The method is functionally correct.", "readability": 8.2, "relevance": 8.8, "explanation_clarity": 7.5, "problem_identification": 8.0, "actionability": 8.5, "completeness": 7.8, "specificity": 7.2, "contextual_adequacy": 8.1, "consistency": 8.6, "brevity": 7.0, "pass_at_1": 0.72, "pass_at_5": 0.85, "pass_at_10": 0.90, "bleu_at_10": 0.62, "total_evaluations": 100}
+{"model_name": "Claude-3-CodeReview", "programming_language": "Scala", "comment_language": "ru", "topic": "Performance Issues", "observation_id": "obs_003", "code_snippet": "def fibonacci(n: Int): Int = {\n    if (n <= 1) n\n    else fibonacci(n-1) + fibonacci(n-2)\n}", "review_text": "Эта реализация неэффективна из-за экспоненциальной сложности. Рекомендуется использовать мемоизацию или итеративный подход.", "readability": 8.8, "relevance": 8.5, "explanation_clarity": 8.2, "problem_identification": 9.2, "actionability": 8.3, "completeness": 8.5, "specificity": 8.0, "contextual_adequacy": 8.6, "consistency": 8.2, "brevity": 8.8, "pass_at_1": 0.78, "pass_at_5": 0.89, "pass_at_10": 0.93, "bleu_at_10": 0.68, "total_evaluations": 100}
+{"model_name": "Llama-CodeReview", "programming_language": "Go", "comment_language": "en", "topic": "Variables", "observation_id": "obs_004", "code_snippet": "package main\n\nimport \"fmt\"\n\nfunc main() {\n    var x int = 5\n    var y int = 10\n    fmt.Println(x + y)\n}", "review_text": "Consider using short variable declarations (:=) for local variables. Also, the variable names could be more descriptive.", "readability": 7.5, "relevance": 7.8, "explanation_clarity": 7.0, "problem_identification": 7.5, "actionability": 7.2, "completeness": 7.8, "specificity": 6.8, "contextual_adequacy": 7.3, "consistency": 7.6, "brevity": 6.5, "pass_at_1": 0.65, "pass_at_5": 0.78, "pass_at_10": 0.85, "bleu_at_10": 0.55, "total_evaluations": 100}
leaderboard_data.json CHANGED

@@ -1,23 +1,32 @@
 {
-  "leaderboard": [
+  "entries": [
     {
-      "model_name": "example/model",
-      "bleu": 0.5,
-      "llm_pass_1": 0.5,
-      "llm_pass_5": 0.5,
-      "llm_pass_10": 0.5,
-      "metrics": {
-        "readability": 5,
-        "relevance": 5,
-        "explanation_clarity": 5,
-        "problem_identification": 5,
-        "actionability": 5,
-        "completeness": 5,
-        "specificity": 5,
-        "contextual_adequacy": 5,
-        "consistency": 5,
-        "brevity": 5
-      }
+      "model_name": "GPT-4-CodeReview",
+      "model_type": "LLM",
+      "mode": "Strict",
+      "review_model_type": "gpt-4",
+      "programming_language": "Python",
+      "comment_language": "en",
+      "topic": "Code Reliability",
+      "submission_date": "2024-10-06T12:00:00Z",
+      "version": "v0",
+      "readability": 8.5,
+      "relevance": 9.0,
+      "explanation_clarity": 7.8,
+      "problem_identification": 8.2,
+      "actionability": 8.7,
+      "completeness": 8.0,
+      "specificity": 7.5,
+      "contextual_adequacy": 8.3,
+      "consistency": 8.8,
+      "brevity": 7.2,
+      "pass_at_1": 0.75,
+      "pass_at_5": 0.88,
+      "pass_at_10": 0.92,
+      "bleu_at_10": 0.65,
+      "total_evaluations": 100
     }
-  ]
+  ],
+  "last_updated": "2024-10-06T12:00:00Z",
+  "version": "v0"
 }
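
Reviewer note: the schema change flattens the old nested `metrics` object into per-entry scalar fields under a top-level `entries` array, so the file maps directly onto leaderboard rows. A small loading sketch assuming that layout:

```python
import json
import pandas as pd

with open("leaderboard_data.json", encoding="utf-8") as f:
    data = json.load(f)

# One flat row per entry; no nested "metrics" dict to unpack anymore.
df = pd.DataFrame(data["entries"])
print(f"version {data['version']}, updated {data['last_updated']}")
print(df[["model_name", "programming_language", "comment_language", "readability"]])
```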
src/display/utils.py CHANGED

@@ -327,22 +327,9 @@ NEVER_HIDDEN_COLS = [getattr(CODEREVIEW_COLUMN, f.name).name for f in fields(COD
 # Categories for CodeReview Bench (Programming Languages)
 CATEGORIES = [
     'Python',
-    'JavaScript',
-    'Java',
-    'C++',
-    'C#',
-    'TypeScript',
-    'Go',
-    'Rust',
-    'Swift',
-    'Kotlin',
-    'Ruby',
-    'PHP',
-    'C',
+    'Java',
     'Scala',
-    'R',
-    'Dart',
-    'Other'
+    'Go'
 ]

 # Language taxonomies for CodeReview Bench
@@ -351,6 +338,16 @@ COMMENT_LANGUAGES = [
     'en'  # English
 ]

+# Topics for CodeReview Bench
+TOPICS = [
+    'Code Reliability',
+    'Coding Standards',
+    'Code Organization',
+    'Performance Issues',
+    'Validation',
+    'Variables'
+]
+
 # Example categories
 EXAMPLE_CATEGORIES = [
     'Bug_Fix',
src/populate.py CHANGED

@@ -21,21 +21,38 @@ from src.leaderboard.processor import leaderboard_to_dataframe
 def get_latest_leaderboard(version="v0") -> Optional[Dict]:
     """
     Get the latest leaderboard data from HuggingFace dataset.
+    Fallback to local JSON file if HF download fails or is unavailable.
     """
+    # First try to fetch from HuggingFace Hub
     try:
-        # Try to download the leaderboard file
         leaderboard_path = hf_hub_download(
             repo_id=RESULTS_DATASET_ID,
             filename=f"leaderboards/leaderboard_{version}.json",
             repo_type="dataset",
             token=TOKEN
         )
-
         with open(leaderboard_path, 'r') as f:
             return json.load(f)
-    except Exception as e:
-        print(f"Error downloading leaderboard: {e}")
-        return None
+    except Exception as hf_err:
+        print(f"HF download failed or unavailable: {hf_err}. Trying local fallback...")
+
+    # Fallback: attempt to load a local leaderboard_data.json located at the project root
+    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    local_path_candidates = [
+        os.path.join(project_root, "leaderboard_data.json"),  # legacy path in root
+        os.path.join(project_root, "data", "leaderboard.json"),  # path defined in envs.py
+    ]
+
+    for local_path in local_path_candidates:
+        if os.path.exists(local_path):
+            try:
+                with open(local_path, 'r') as f:
+                    return json.load(f)
+            except Exception as local_err:
+                print(f"Error loading local leaderboard file {local_path}: {local_err}")
+
+    # If nothing found, return None
+    return None


 def get_model_entry(model_name: str, mode: str, version="v0") -> Optional[Dict]:
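
Reviewer note: `get_latest_leaderboard` now degrades gracefully, first `hf_hub_download`, then the local candidate paths, then `None`. The new branch calls `os.path` helpers, so the module needs `import os` at the top, which is not visible in this hunk. A usage sketch under those assumptions:

```python
from src.populate import get_latest_leaderboard

# Online: fetches leaderboards/leaderboard_v0.json from the results dataset.
# Offline: falls back to leaderboard_data.json or data/leaderboard.json, else None.
board = get_latest_leaderboard(version="v0")
if board is None:
    print("No leaderboard available: Hub unreachable and no local fallback file.")
else:
    print(f"{len(board.get('entries', []))} entries, version {board.get('version')}")
```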