Yuxuan-Zhang-Dexter committed on
Commit
6d4c755
·
1 Parent(s): 865dbef

update leaderboard with new agentic leaderboard layout

Browse files
app.py CHANGED
@@ -38,11 +38,11 @@ TIME_POINTS = {
38
  }
39
 
40
  # Load the initial JSON file with rank data
41
- with open(TIME_POINTS["03/25/2025"], "r") as f:
42
  rank_data = json.load(f)
43
 
44
  # Load the model leaderboard data
45
- with open("rank_single_model_03_25_2025.json", "r") as f:
46
  model_rank_data = json.load(f)
47
 
48
  # Add leaderboard state at the top level
@@ -72,17 +72,17 @@ leaderboard_state = {
72
 
73
 
74
  # Load video links and news data
75
- with open('assets/game_video_link.json', 'r') as f:
76
  VIDEO_LINKS = json.load(f)
77
 
78
- with open('assets/news.json', 'r') as f:
79
  NEWS_DATA = json.load(f)
80
 
81
  def load_rank_data(time_point):
82
  """Load rank data for a specific time point"""
83
  if time_point in TIME_POINTS:
84
  try:
85
- with open(TIME_POINTS[time_point], "r") as f:
86
  return json.load(f)
87
  except FileNotFoundError:
88
  return None
@@ -105,7 +105,7 @@ def prepare_dataframe_for_display(df, for_game=None):
105
 
106
  # Replace '_' with '-' for better display
107
  for col in display_df.columns:
108
- if col.endswith(' Score'):
109
  display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
110
 
111
  # If we're in detailed view, sort by score
@@ -120,36 +120,47 @@ def prepare_dataframe_for_display(df, for_game=None):
120
  # Filter out models that didn't participate
121
  display_df = display_df[~display_df[score_col].isna()]
122
  else:
123
- # For overall view, sort by average of game scores (implicitly used for ranking)
124
- # but we won't add an explicit 'Rank' or 'Average Rank' column to the final display_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- # Calculate an internal sorting key based on average scores, but don't add it to the display_df
127
- score_cols = [col for col in display_df.columns if col.endswith(' Score')]
128
- if score_cols:
129
- temp_sort_df = display_df.copy()
130
- for col in score_cols:
131
- temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
132
-
133
- # Calculate average of the game scores (use mean of ranks from utils for actual ranking logic if different)
134
- # For display sorting, let's use a simple average of available scores.
135
- # The actual ranking for 'Average Rank' in leaderboard_utils uses mean of ranks, which is more robust.
136
- # Here we just need a consistent sort order.
137
-
138
- # Create a temporary column for sorting
139
- temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
140
-
141
- # Sort by this temporary average score (higher is better for scores)
142
- # and then by Player name as a tie-breaker
143
- display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
144
 
145
  # Add line breaks to column headers
146
  new_columns = {}
147
  for col in display_df.columns:
148
- if col.endswith(' Score'):
149
  # Replace 'Game Name Score' with 'Game Name\nScore'
150
  game_name = col.replace(' Score', '')
151
  new_col = f"{game_name}\nScore"
152
  new_columns[col] = new_col
 
 
 
153
 
154
  # Rename columns with new line breaks
155
  if new_columns:
@@ -164,8 +175,14 @@ def update_df_with_height(df):
164
  col_widths = ["40px"] # Row number column width
165
  col_widths.append("230px") # Player column - reduced by 20px
166
  col_widths.append("120px") # Organization column
 
 
 
 
 
167
  # Add game score columns
168
- for _ in range(len(df.columns) - 2):
 
169
  col_widths.append("120px")
170
 
171
  return gr.update(value=df,
@@ -184,7 +201,7 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
184
  # tetris_overall, tetris_details, # Commented out
185
  tetris_plan_overall, tetris_plan_details,
186
  ace_attorney_overall, ace_attorney_details,
187
- top_n=10,
188
  data_source=None):
189
  global leaderboard_state
190
 
@@ -304,21 +321,22 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
304
 
305
  # Get the appropriate DataFrame and charts based on current state
306
  if leaderboard_state["current_game"]:
307
- # For detailed view
 
308
  # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
309
  # df = get_mario_leaderboard(data)
310
  if leaderboard_state["current_game"] == "Super Mario Bros":
311
- df = get_mario_planning_leaderboard(data)
312
  elif leaderboard_state["current_game"] == "Sokoban":
313
- df = get_sokoban_leaderboard(data)
314
  elif leaderboard_state["current_game"] == "2048":
315
- df = get_2048_leaderboard(data)
316
  elif leaderboard_state["current_game"] == "Candy Crush":
317
- df = get_candy_leaderboard(data)
318
  elif leaderboard_state["current_game"] == "Tetris":
319
- df = get_tetris_planning_leaderboard(data)
320
  elif leaderboard_state["current_game"] == "Ace Attorney":
321
- df = get_ace_attorney_leaderboard(data)
322
  else: # Should not happen if current_game is one of the known games
323
  df = pd.DataFrame() # Empty df
324
 
@@ -327,10 +345,12 @@ def update_leaderboard(# mario_overall, mario_details, # Commented out
327
  radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
328
  group_bar_chart = chart
329
  else:
330
- # For overall view
331
- df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n)
 
332
  display_df = prepare_dataframe_for_display(df)
333
- _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games)
 
334
  chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
335
 
336
  # Return values, including all four plot placeholders
@@ -405,7 +425,7 @@ def get_initial_state():
405
  }
406
  }
407
 
408
- def clear_filters(top_n=10, data_source=None):
409
  global leaderboard_state
410
 
411
  # Use provided data source or default to rank_data
@@ -420,9 +440,12 @@ def clear_filters(top_n=10, data_source=None):
420
  "Ace Attorney": True
421
  }
422
 
423
- df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n)
 
 
424
  display_df = prepare_dataframe_for_display(df)
425
- _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games)
 
426
 
427
  leaderboard_state = get_initial_state()
428
 
@@ -675,9 +698,18 @@ def build_app():
675
  max-width: 140px !important;
676
  }
677
 
678
- /* Game score columns */
679
- .table-container th:nth-child(n+4),
680
- .table-container td:nth-child(n+4) {
 
 
 
 
 
 
 
 
 
681
  width: 120px !important;
682
  min-width: 100px !important;
683
  max-width: 140px !important;
@@ -743,6 +775,27 @@ def build_app():
743
  width: 100% !important;
744
  margin-top: 40px !important;
745
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
746
  """) as demo:
747
  gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
748
 
@@ -875,6 +928,14 @@ def build_app():
875
  with gr.Tabs():
876
  with gr.Tab("🏆 Agent Leaderboard"):
877
  # Visualization section
 
 
 
 
 
 
 
 
878
  with gr.Row():
879
  gr.Markdown("### 📊 Data Visualization")
880
 
@@ -884,6 +945,19 @@ def build_app():
884
  visible=False,
885
  elem_classes="visualization-container"
886
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
887
 
888
  with gr.Column(visible=True) as overall_visualizations:
889
  with gr.Tabs():
@@ -894,45 +968,32 @@ def build_app():
894
  elem_classes="visualization-container"
895
  )
896
  gr.Markdown(
897
- "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
898
  elem_classes="radar-tip"
899
  )
900
- # Comment out the Group Bar Chart tab
901
  with gr.Tab("📊 Group Bar Chart"):
902
- with gr.Row():
903
- # Calculate dynamic maximum based on total models
904
- agent_max_models = get_total_model_count(rank_data)
905
- top_n_slider = gr.Slider(
906
- minimum=1,
907
- maximum=agent_max_models,
908
- step=1,
909
- value=min(10, agent_max_models),
910
- label=f"Number of Top Models to Display (max: {agent_max_models})",
911
- elem_classes="top-n-slider"
912
- )
913
  group_bar_visualization = gr.Plot(
914
  label="Comparative Analysis (Group Bar Chart)",
915
  elem_classes="visualization-container"
916
  )
917
  gr.Markdown(
918
- "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*",
919
  elem_classes="radar-tip"
920
  )
921
-
922
 
923
  # Hidden placeholder for group bar visualization (to maintain code references)
924
  # group_bar_visualization = gr.Plot(visible=False)
925
 
926
  # Game selection section
927
  with gr.Row():
928
- gr.Markdown("### 🎮 Game Selection")
929
  with gr.Row():
930
  # with gr.Column(): # Commented out Super Mario BrosUI
931
  # gr.Markdown("**🎮 Super Mario Bros**")
932
  # mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
933
  # mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
934
  with gr.Column(): # Added Super Mario BrosUI
935
- gr.Markdown("**🎮 Super Mario Bros**")
936
  mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
937
  mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
938
  with gr.Column(): # Sokoban is now after mario_plan
@@ -972,12 +1033,16 @@ def build_app():
972
  # Leaderboard table
973
  with gr.Row():
974
  gr.Markdown("### 📋 Detailed Results")
 
 
 
 
975
 
976
  # Add reference to Jupyter notebook
977
  with gr.Row():
978
  gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
979
 
980
- # Get initial leaderboard dataframe
981
  initial_df = get_combined_leaderboard(rank_data, {
982
  # "Super Mario Bros": True, # Commented out
983
  "Super Mario Bros": True,
@@ -987,7 +1052,7 @@ def build_app():
987
  # "Tetris(complete)": True, # Commented out
988
  "Tetris": True,
989
  "Ace Attorney": True
990
- })
991
 
992
  # Format the DataFrame for display
993
  initial_display_df = prepare_dataframe_for_display(initial_df)
@@ -996,8 +1061,14 @@ def build_app():
996
  col_widths = ["40px"] # Row number column width
997
  col_widths.append("230px") # Player column - reduced by 20px
998
  col_widths.append("120px") # Organization column
 
 
 
 
 
999
  # Add game score columns
1000
- for _ in range(len(initial_display_df.columns) - 2):
 
1001
  col_widths.append("120px")
1002
 
1003
  # Create a standard DataFrame component with enhanced styling
@@ -1062,8 +1133,8 @@ def build_app():
1062
  # Update leaderboard and visualizations when checkboxes change
1063
  for checkbox in checkbox_list:
1064
  checkbox.change(
1065
- lambda *args: update_leaderboard(*args, data_source=rank_data),
1066
- inputs=checkbox_list + [top_n_slider],
1067
  outputs=[
1068
  leaderboard_df,
1069
  detailed_visualization,
@@ -1072,22 +1143,10 @@ def build_app():
1072
  ] + checkbox_list
1073
  )
1074
 
1075
- # Update when top_n_slider changes
1076
- top_n_slider.change(
1077
- lambda *args: update_leaderboard(*args, data_source=rank_data),
1078
- inputs=checkbox_list + [top_n_slider],
1079
- outputs=[
1080
- leaderboard_df,
1081
- detailed_visualization,
1082
- radar_visualization,
1083
- group_bar_visualization
1084
- ] + checkbox_list
1085
- )
1086
-
1087
  # Update when clear button is clicked
1088
  clear_btn.click(
1089
- lambda *args: clear_filters(*args, data_source=rank_data),
1090
- inputs=[top_n_slider],
1091
  outputs=[
1092
  leaderboard_df,
1093
  detailed_visualization,
@@ -1096,7 +1155,7 @@ def build_app():
1096
  ] + checkbox_list
1097
  )
1098
 
1099
- # Initialize the app
1100
  demo.load(
1101
  lambda: clear_filters(data_source=rank_data),
1102
  inputs=[],
@@ -1119,6 +1178,20 @@ def build_app():
1119
  visible=False,
1120
  elem_classes="visualization-container"
1121
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1122
 
1123
  with gr.Column(visible=True) as model_overall_visualizations:
1124
  with gr.Tabs():
@@ -1132,17 +1205,6 @@ def build_app():
1132
  elem_classes="radar-tip"
1133
  )
1134
  with gr.Tab("📊 Group Bar Chart"):
1135
- with gr.Row():
1136
- # Calculate dynamic maximum based on total models
1137
- model_max_models = get_total_model_count(model_rank_data)
1138
- model_top_n_slider = gr.Slider(
1139
- minimum=1,
1140
- maximum=model_max_models,
1141
- step=1,
1142
- value=min(10, model_max_models),
1143
- label=f"Number of Top Models to Display (max: {model_max_models})",
1144
- elem_classes="top-n-slider"
1145
- )
1146
  model_group_bar_visualization = gr.Plot(
1147
  label="Comparative Analysis (Group Bar Chart)",
1148
  elem_classes="visualization-container"
@@ -1154,10 +1216,10 @@ def build_app():
1154
 
1155
  # Game selection section
1156
  with gr.Row():
1157
- gr.Markdown("### 🎮 Game Selection")
1158
  with gr.Row():
1159
  with gr.Column():
1160
- gr.Markdown("**🎮 Super Mario Bros**")
1161
  model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
1162
  model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
1163
  with gr.Column():
@@ -1193,8 +1255,10 @@ def build_app():
1193
  # Leaderboard table
1194
  with gr.Row():
1195
  gr.Markdown("### 📋 Detailed Results")
 
 
1196
 
1197
- # Get initial leaderboard dataframe
1198
  model_initial_df = get_combined_leaderboard(model_rank_data, {
1199
  "Super Mario Bros": True,
1200
  "Sokoban": True,
@@ -1202,7 +1266,7 @@ def build_app():
1202
  "Candy Crush": True,
1203
  "Tetris": True,
1204
  "Ace Attorney": True
1205
- })
1206
 
1207
  # Format the DataFrame for display
1208
  model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
@@ -1300,7 +1364,7 @@ def build_app():
1300
  ] + model_checkbox_list
1301
  )
1302
 
1303
- # Initialize the model leaderboard
1304
  demo.load(
1305
  lambda: clear_filters(data_source=model_rank_data),
1306
  inputs=[],
 
38
  }
39
 
40
  # Load the initial JSON file with rank data
41
+ with open(TIME_POINTS["03/25/2025"], "r", encoding='utf-8') as f:
42
  rank_data = json.load(f)
43
 
44
  # Load the model leaderboard data
45
+ with open("rank_single_model_03_25_2025.json", "r", encoding='utf-8') as f:
46
  model_rank_data = json.load(f)
47
 
48
  # Add leaderboard state at the top level
 
72
 
73
 
74
  # Load video links and news data
75
+ with open('assets/game_video_link.json', 'r', encoding='utf-8') as f:
76
  VIDEO_LINKS = json.load(f)
77
 
78
+ with open('assets/news.json', 'r', encoding='utf-8') as f:
79
  NEWS_DATA = json.load(f)
80
 
81
  def load_rank_data(time_point):
82
  """Load rank data for a specific time point"""
83
  if time_point in TIME_POINTS:
84
  try:
85
+ with open(TIME_POINTS[time_point], "r", encoding='utf-8') as f:
86
  return json.load(f)
87
  except FileNotFoundError:
88
  return None
 
105
 
106
  # Replace '_' with '-' for better display
107
  for col in display_df.columns:
108
+ if col.endswith(' Score') and col != 'Avg Normalized Score':
109
  display_df[col] = display_df[col].apply(lambda x: '-' if x == '_' else x)
110
 
111
  # If we're in detailed view, sort by score
 
120
  # Filter out models that didn't participate
121
  display_df = display_df[~display_df[score_col].isna()]
122
  else:
123
+ # For overall view, sort by average normalized score if available; otherwise fall back to the mean of the per-game scores
124
+ if 'Avg Normalized Score' in display_df.columns:
125
+ # Sort by average normalized score (already calculated in leaderboard_utils)
126
+ display_df = display_df.sort_values(by='Avg Normalized Score', ascending=False)
127
+ else:
128
+ # Calculate an internal sorting key based on average scores, but don't add it to the display_df
129
+ score_cols = [col for col in display_df.columns if col.endswith(' Score')]
130
+ if score_cols:
131
+ temp_sort_df = display_df.copy()
132
+ for col in score_cols:
133
+ temp_sort_df[col] = pd.to_numeric(temp_sort_df[col], errors='coerce')
134
+
135
+ # Create a temporary column for sorting
136
+ temp_sort_df['temp_avg_score_for_sort'] = temp_sort_df[score_cols].mean(axis=1)
137
+
138
+ # Sort by this temporary average score (higher is better for scores)
139
+ # and then by Player name as a tie-breaker
140
+ display_df = display_df.loc[temp_sort_df.sort_values(by=['temp_avg_score_for_sort', 'Player'], ascending=[False, True]).index]
141
+
142
+ # Add medal emojis for top 3 performers
143
+ if len(display_df) > 0 and 'Player' in display_df.columns:
144
+ # Reset index to get proper ranking after sorting
145
+ display_df = display_df.reset_index(drop=True)
146
 
147
+ # Add medal emojis to Player names for top 3
148
+ medal_emojis = ['🥇', '🥈', '🥉']
149
+ for i in range(min(3, len(display_df))):
150
+ original_name = display_df.loc[i, 'Player']
151
+ display_df.loc[i, 'Player'] = f"{medal_emojis[i]} {original_name}"
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  # Add line breaks to column headers
154
  new_columns = {}
155
  for col in display_df.columns:
156
+ if col.endswith(' Score') and col != 'Avg Normalized Score':
157
  # Replace 'Game Name Score' with 'Game Name\nScore'
158
  game_name = col.replace(' Score', '')
159
  new_col = f"{game_name}\nScore"
160
  new_columns[col] = new_col
161
+ elif col == 'Avg Normalized Score':
162
+ # Add line break to Avg Normalized Score column
163
+ new_columns[col] = "Avg Normalized\nScore"
164
 
165
  # Rename columns with new line breaks
166
  if new_columns:
 
175
  col_widths = ["40px"] # Row number column width
176
  col_widths.append("230px") # Player column - reduced by 20px
177
  col_widths.append("120px") # Organization column
178
+
179
+ # Check if there's an Avg Normalized Score column
180
+ if any('Avg Normalized' in col for col in df.columns):
181
+ col_widths.append("140px") # Avg Normalized Score column - slightly wider
182
+
183
  # Add game score columns
184
+ remaining_cols = len(df.columns) - len(col_widths) + 1 # +1 because we subtracted row number column
185
+ for _ in range(remaining_cols):
186
  col_widths.append("120px")
187
 
188
  return gr.update(value=df,
 
201
  # tetris_overall, tetris_details, # Commented out
202
  tetris_plan_overall, tetris_plan_details,
203
  ace_attorney_overall, ace_attorney_details,
204
+ top_n=3,
205
  data_source=None):
206
  global leaderboard_state
207
 
 
321
 
322
  # Get the appropriate DataFrame and charts based on current state
323
  if leaderboard_state["current_game"]:
324
+ # For detailed view - use slider value for both leaderboards
325
+ limit = top_n
326
  # if leaderboard_state["current_game"] == "Super Mario Bros": # Commented out
327
  # df = get_mario_leaderboard(data)
328
  if leaderboard_state["current_game"] == "Super Mario Bros":
329
+ df = get_mario_planning_leaderboard(data, limit)
330
  elif leaderboard_state["current_game"] == "Sokoban":
331
+ df = get_sokoban_leaderboard(data, limit)
332
  elif leaderboard_state["current_game"] == "2048":
333
+ df = get_2048_leaderboard(data, limit)
334
  elif leaderboard_state["current_game"] == "Candy Crush":
335
+ df = get_candy_leaderboard(data, limit)
336
  elif leaderboard_state["current_game"] == "Tetris":
337
+ df = get_tetris_planning_leaderboard(data, limit)
338
  elif leaderboard_state["current_game"] == "Ace Attorney":
339
+ df = get_ace_attorney_leaderboard(data, limit)
340
  else: # Should not happen if current_game is one of the known games
341
  df = pd.DataFrame() # Empty df
342
 
 
345
  radar_chart = chart # In detailed view, radar and group bar can be the same as the main chart
346
  group_bar_chart = chart
347
  else:
348
+ # For overall view - use slider value for both leaderboards
349
+ limit = top_n
350
+ df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n, limit)
351
  display_df = prepare_dataframe_for_display(df)
352
+ # Pass appropriate title and top_n based on data source
353
+ _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games, limit_to_top_n=limit, top_n=top_n)
354
  chart = radar_chart # In overall view, the 'detailed' chart can be the radar chart
355
 
356
  # Return values, including all four plot placeholders
 
425
  }
426
  }
427
 
428
+ def clear_filters(top_n=3, data_source=None):
429
  global leaderboard_state
430
 
431
  # Use provided data source or default to rank_data
 
440
  "Ace Attorney": True
441
  }
442
 
443
+ # Use slider value for both leaderboards
444
+ limit = top_n
445
+ df, group_bar_chart = get_combined_leaderboard_with_group_bar(data, selected_games, top_n, limit)
446
  display_df = prepare_dataframe_for_display(df)
447
+ # Pass top_n parameter for consistent titles
448
+ _, radar_chart = get_combined_leaderboard_with_single_radar(data, selected_games, limit_to_top_n=limit, top_n=top_n)
449
 
450
  leaderboard_state = get_initial_state()
451
 
 
698
  max-width: 140px !important;
699
  }
700
 
701
+ /* Avg Normalized Score column (4th column) */
702
+ .table-container th:nth-child(4),
703
+ .table-container td:nth-child(4) {
704
+ width: 140px !important;
705
+ min-width: 120px !important;
706
+ max-width: 160px !important;
707
+ text-align: center !important;
708
+ }
709
+
710
+ /* Game score columns (5th column onwards) */
711
+ .table-container th:nth-child(n+5),
712
+ .table-container td:nth-child(n+5) {
713
  width: 120px !important;
714
  min-width: 100px !important;
715
  max-width: 140px !important;
 
775
  width: 100% !important;
776
  margin-top: 40px !important;
777
  }
778
+
779
+ .welcome-message {
780
+ background: linear-gradient(135deg, #a8edea 0%, #fed6e3 100%);
781
+ color: #333;
782
+ padding: 20px;
783
+ border-radius: 10px;
784
+ margin: 20px 0;
785
+ text-align: center;
786
+ box-shadow: 0 4px 15px rgba(0,0,0,0.05);
787
+ }
788
+
789
+ .welcome-message h3 {
790
+ margin: 0 0 10px 0;
791
+ font-size: 1.3em;
792
+ }
793
+
794
+ .welcome-message p {
795
+ margin: 0;
796
+ font-size: 1.1em;
797
+ line-height: 1.5;
798
+ }
799
  """) as demo:
800
  gr.Markdown("# 🎮 Lmgame Bench: Leaderboard 🎲")
801
 
 
928
  with gr.Tabs():
929
  with gr.Tab("🏆 Agent Leaderboard"):
930
  # Visualization section
931
+
932
+ with gr.Row():
933
+ gr.Markdown("""
934
+ **🎮 Welcome to LMGame Bench!**
935
+
936
+ We welcome everyone to implement their own gaming agents by replacing our baseAgent in `customer_runner.py` and test them on our benchmark. Join the competition and see how your agent performs!
937
+ """, elem_classes="welcome-message")
938
+
939
  with gr.Row():
940
  gr.Markdown("### 📊 Data Visualization")
941
 
 
945
  visible=False,
946
  elem_classes="visualization-container"
947
  )
948
+ # with gr.Row():
949
+ # # Calculate dynamic maximum based on total models
950
+ # agent_max_models = get_total_model_count(rank_data)
951
+ # top_n_slider = gr.Slider(
952
+ # minimum=1,
953
+ # maximum=agent_max_models,
954
+ # step=1,
955
+ # value=min(3, agent_max_models),
956
+ # label=f"Number of Top Models to Display in All Views (max: {agent_max_models})",
957
+ # elem_classes="top-n-slider"
958
+ # )
959
+
960
+
961
 
962
  with gr.Column(visible=True) as overall_visualizations:
963
  with gr.Tabs():
 
968
  elem_classes="visualization-container"
969
  )
970
  gr.Markdown(
971
+ "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*⚔️ - Model with our gaming agent*",
972
  elem_classes="radar-tip"
973
  )
 
974
  with gr.Tab("📊 Group Bar Chart"):
 
 
 
 
 
 
 
 
 
 
 
975
  group_bar_visualization = gr.Plot(
976
  label="Comparative Analysis (Group Bar Chart)",
977
  elem_classes="visualization-container"
978
  )
979
  gr.Markdown(
980
+ "*💡 Click a legend entry to isolate that model. Double-click additional ones to add them for comparison.*\n\n*⚔️ - Model with our gaming agent*",
981
  elem_classes="radar-tip"
982
  )
 
983
 
984
  # Hidden placeholder for group bar visualization (to maintain code references)
985
  # group_bar_visualization = gr.Plot(visible=False)
986
 
987
  # Game selection section
988
  with gr.Row():
989
+ gr.Markdown("### 🕹️ Game Selection")
990
  with gr.Row():
991
  # with gr.Column(): # Commented out Super Mario BrosUI
992
  # gr.Markdown("**🎮 Super Mario Bros**")
993
  # mario_overall = gr.Checkbox(label="Super Mario BrosScore", value=True)
994
  # mario_details = gr.Checkbox(label="Super Mario BrosDetails", value=False)
995
  with gr.Column(): # Added Super Mario BrosUI
996
+ gr.Markdown("**🍄 Super Mario Bros**")
997
  mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
998
  mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
999
  with gr.Column(): # Sokoban is now after mario_plan
 
1033
  # Leaderboard table
1034
  with gr.Row():
1035
  gr.Markdown("### 📋 Detailed Results")
1036
+ with gr.Row():
1037
+ gr.Markdown("*⚔️ - Model with our gaming agent*", elem_classes="radar-tip")
1038
+
1039
+ # Welcome message for custom gaming agents
1040
 
1041
  # Add reference to Jupyter notebook
1042
  with gr.Row():
1043
  gr.Markdown("*All data analysis can be replicated by checking [this Jupyter notebook](https://colab.research.google.com/drive/1CYFiJGm3EoBXXI8vICPVR82J9qrmmRvc#scrollTo=qft1Oald-21J)*")
1044
 
1045
+ # Get initial leaderboard dataframe (limited by default slider value for agent leaderboard)
1046
  initial_df = get_combined_leaderboard(rank_data, {
1047
  # "Super Mario Bros": True, # Commented out
1048
  "Super Mario Bros": True,
 
1052
  # "Tetris(complete)": True, # Commented out
1053
  "Tetris": True,
1054
  "Ace Attorney": True
1055
+ }, limit_to_top_n=min(3, get_total_model_count(rank_data)))
1056
 
1057
  # Format the DataFrame for display
1058
  initial_display_df = prepare_dataframe_for_display(initial_df)
 
1061
  col_widths = ["40px"] # Row number column width
1062
  col_widths.append("230px") # Player column - reduced by 20px
1063
  col_widths.append("120px") # Organization column
1064
+
1065
+ # Check if there's an Avg Normalized Score column
1066
+ if any('Avg Normalized' in col for col in initial_display_df.columns):
1067
+ col_widths.append("140px") # Avg Normalized Score column - slightly wider
1068
+
1069
  # Add game score columns
1070
+ remaining_cols = len(initial_display_df.columns) - len(col_widths) + 1 # +1 because we subtracted row number column
1071
+ for _ in range(remaining_cols):
1072
  col_widths.append("120px")
1073
 
1074
  # Create a standard DataFrame component with enhanced styling
 
1133
  # Update leaderboard and visualizations when checkboxes change
1134
  for checkbox in checkbox_list:
1135
  checkbox.change(
1136
+ lambda *args: update_leaderboard(*args, top_n=3, data_source=rank_data),
1137
+ inputs=checkbox_list,
1138
  outputs=[
1139
  leaderboard_df,
1140
  detailed_visualization,
 
1143
  ] + checkbox_list
1144
  )
1145
 
 
 
 
 
 
 
 
 
 
 
 
 
1146
  # Update when clear button is clicked
1147
  clear_btn.click(
1148
+ lambda: clear_filters(top_n=3, data_source=rank_data),
1149
+ inputs=[],
1150
  outputs=[
1151
  leaderboard_df,
1152
  detailed_visualization,
 
1155
  ] + checkbox_list
1156
  )
1157
 
1158
+ # Initialize the agent leaderboard (clear_filters defaults to top_n=3)
1159
  demo.load(
1160
  lambda: clear_filters(data_source=rank_data),
1161
  inputs=[],
 
1178
  visible=False,
1179
  elem_classes="visualization-container"
1180
  )
1181
+
1182
+ with gr.Row():
1183
+ # Calculate dynamic maximum based on total models
1184
+ model_max_models = get_total_model_count(model_rank_data)
1185
+ model_top_n_slider = gr.Slider(
1186
+ minimum=1,
1187
+ maximum=model_max_models,
1188
+ step=1,
1189
+ value=min(5, model_max_models),
1190
+ label=f"Number of Top Models to Display in All Views (max: {model_max_models})",
1191
+ elem_classes="top-n-slider"
1192
+ )
1193
+
1194
+
1195
 
1196
  with gr.Column(visible=True) as model_overall_visualizations:
1197
  with gr.Tabs():
 
1205
  elem_classes="radar-tip"
1206
  )
1207
  with gr.Tab("📊 Group Bar Chart"):
 
 
 
 
 
 
 
 
 
 
 
1208
  model_group_bar_visualization = gr.Plot(
1209
  label="Comparative Analysis (Group Bar Chart)",
1210
  elem_classes="visualization-container"
 
1216
 
1217
  # Game selection section
1218
  with gr.Row():
1219
+ gr.Markdown("### 🕹️ Game Selection")
1220
  with gr.Row():
1221
  with gr.Column():
1222
+ gr.Markdown("**🍄 Super Mario Bros**")
1223
  model_mario_plan_overall = gr.Checkbox(label="Super Mario Bros Score", value=True)
1224
  model_mario_plan_details = gr.Checkbox(label="Super Mario Bros Details", value=False)
1225
  with gr.Column():
 
1255
  # Leaderboard table
1256
  with gr.Row():
1257
  gr.Markdown("### 📋 Detailed Results")
1258
+ with gr.Row():
1259
+ gr.Markdown("*💡 The slider above controls how many top models are shown in the radar chart, bar chart, and data table.*", elem_classes="radar-tip")
1260
 
1261
+ # Get initial leaderboard dataframe (limited by default slider value for model leaderboard)
1262
  model_initial_df = get_combined_leaderboard(model_rank_data, {
1263
  "Super Mario Bros": True,
1264
  "Sokoban": True,
 
1266
  "Candy Crush": True,
1267
  "Tetris": True,
1268
  "Ace Attorney": True
1269
+ }, limit_to_top_n=min(5, get_total_model_count(model_rank_data)))
1270
 
1271
  # Format the DataFrame for display
1272
  model_initial_display_df = prepare_dataframe_for_display(model_initial_df)
 
1364
  ] + model_checkbox_list
1365
  )
1366
 
1367
+ # Initialize the model leaderboard (with default slider limit)
1368
  demo.load(
1369
  lambda: clear_filters(data_source=model_rank_data),
1370
  inputs=[],
assets/model_color.json CHANGED
@@ -27,31 +27,31 @@
27
  "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
28
  "qwen3-235B-A22B-fp8": "#6A1B9A",
29
  "random (x30)": "#9E9E9E",
30
- "gamingagent + claude-3-7-sonnet-20250219": "#4A90E2",
31
- "gamingagent + claude-3-5-haiku-20241022": "#7FB5E6",
32
- "gamingagent + claude-3-5-sonnet-20241022": "#1A4C7C",
33
- "gamingagent + claude-opus-4-20250514": "#3A80D2",
34
- "gamingagent + claude-sonnet-4-20250514": "#5A9FE2",
35
- "gamingagent + gemini-2.0-flash": "#FF4081",
36
- "gamingagent + gemini-2.0-flash-thinking-exp-1219": "#C2185B",
37
- "gamingagent + gemini-2.5-pro-exp-03-25": "#FF80AB",
38
- "gamingagent + gemini-2.5-flash-preview-04-17": "#F06292",
39
- "gamingagent + gemini-2.5-flash-preview-05-20": "#F8BBD9",
40
- "gamingagent + gemini-2.5-pro-preview-05-06": "#AD1457",
41
- "gamingagent + gemini-2.5-pro-preview-06-05": "#EC407A",
42
- "gamingagent + gpt-4o-2024-11-20": "#00BFA5",
43
- "gamingagent + gpt-4.5-preview-2025-02-27": "#00796B",
44
- "gamingagent + gpt-4.1-2025-04-14": "#00897B",
45
- "gamingagent + o1-2024-12-17": "#4DB6AC",
46
- "gamingagent + o1-mini-2024-09-12": "#26A69A",
47
- "gamingagent + o3-mini-2025-01-31(medium)": "#80CBC4",
48
- "gamingagent + o3-2025-04-16": "#26C6DA",
49
- "gamingagent + o4-mini-2025-04-16": "#00ACC1",
50
- "gamingagent + grok-3-beta": "#FF7043",
51
- "gamingagent + grok-3-mini-beta": "#FF8A65",
52
- "gamingagent + deepseek-v3": "#FFC107",
53
- "gamingagent + deepseek-r1-0120": "#FFA000",
54
- "gamingagent + deepseek-r1-0528": "#FFB300",
55
- "gamingagent + llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
56
- "gamingagent + qwen3-235B-A22B-fp8": "#6A1B9A"
57
  }
 
27
  "llama-4-maverick-17b-128e-instruct-fp8": "#8E24AA",
28
  "qwen3-235B-A22B-fp8": "#6A1B9A",
29
  "random (x30)": "#9E9E9E",
30
+ "claude-3-7-sonnet-20250219 (⚔️)": "#4A90E2",
31
+ "claude-3-5-haiku-20241022 (⚔️)": "#7FB5E6",
32
+ "claude-3-5-sonnet-20241022 (⚔️)": "#1A4C7C",
33
+ "claude-opus-4-20250514 (⚔️)": "#3A80D2",
34
+ "claude-sonnet-4-20250514 (⚔️)": "#5A9FE2",
35
+ "gemini-2.0-flash (⚔️)": "#FF4081",
36
+ "gemini-2.0-flash-thinking-exp-1219 (⚔️)": "#C2185B",
37
+ "gemini-2.5-pro-exp-03-25 (⚔️)": "#FF80AB",
38
+ "gemini-2.5-flash-preview-04-17 (⚔️)": "#F06292",
39
+ "gemini-2.5-flash-preview-05-20 (⚔️)": "#F8BBD9",
40
+ "gemini-2.5-pro-preview-05-06 (⚔️)": "#AD1457",
41
+ "gemini-2.5-pro-preview-06-05 (⚔️)": "#EC407A",
42
+ "gpt-4o-2024-11-20 (⚔️)": "#00BFA5",
43
+ "gpt-4.5-preview-2025-02-27 (⚔️)": "#00796B",
44
+ "gpt-4.1-2025-04-14 (⚔️)": "#00897B",
45
+ "o1-2024-12-17 (⚔️)": "#4DB6AC",
46
+ "o1-mini-2024-09-12 (⚔️)": "#26A69A",
47
+ "o3-mini-2025-01-31(medium) (⚔️)": "#80CBC4",
48
+ "o3-2025-04-16 (⚔️)": "#26C6DA",
49
+ "o4-mini-2025-04-16 (⚔️)": "#00ACC1",
50
+ "grok-3-beta (⚔️)": "#FF7043",
51
+ "grok-3-mini-beta (⚔️)": "#FF8A65",
52
+ "deepseek-v3 (⚔️)": "#FFC107",
53
+ "deepseek-r1-0120 (⚔️)": "#FFA000",
54
+ "deepseek-r1-0528 (⚔️)": "#FFB300",
55
+ "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)": "#8E24AA",
56
+ "qwen3-235B-A22B-fp8 (⚔️)": "#6A1B9A"
57
  }
data_visualization.py CHANGED
@@ -2,13 +2,15 @@ import plotly.graph_objects as go
2
  import numpy as np
3
  import pandas as pd
4
  import json
 
 
5
  from leaderboard_utils import (
6
  get_combined_leaderboard,
7
  GAME_ORDER
8
  )
9
 
10
  # Load model colors
11
- with open('assets/model_color.json', 'r') as f:
12
  MODEL_COLORS = json.load(f)
13
 
14
  GAME_SCORE_COLUMNS = {
@@ -126,7 +128,7 @@ def create_radar_charts(df):
126
  categories = [c.replace(" Score", "") for c in game_cols]
127
 
128
  for col in game_cols:
129
- vals = df[col].replace("n/a", 0).astype(float)
130
  mean, std = vals.mean(), vals.std()
131
  df[f"norm_{col}"] = normalize_values(vals, mean, std)
132
 
@@ -179,7 +181,7 @@ def get_combined_leaderboard_with_radar(rank_data, selected_games):
179
  df_viz = df.copy()
180
  return df, create_radar_charts(df_viz)
181
 
182
- def create_group_bar_chart(df, top_n=10):
183
  game_cols = {}
184
  for game in GAME_ORDER:
185
  col = f"{game} Score"
@@ -330,8 +332,8 @@ def create_group_bar_chart(df, top_n=10):
330
 
331
 
332
 
333
- def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=10):
334
- df = get_combined_leaderboard(rank_data, selected_games)
335
  # Create a copy for visualization to avoid modifying the original
336
  df_viz = df.copy()
337
  return df, create_group_bar_chart(df_viz, top_n)
@@ -344,7 +346,7 @@ def hex_to_rgba(hex_color, alpha=0.2):
344
  return f'rgba({r}, {g}, {b}, {alpha})'
345
 
346
 
347
- def create_single_radar_chart(df, selected_games=None, highlight_models=None):
348
  if selected_games is None:
349
  selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
350
 
@@ -359,11 +361,25 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
359
  game_cols = [f"{game} Score" for game in selected_games]
360
  categories = formatted_games
361
 
362
- # Normalize
 
 
 
 
363
  for col in game_cols:
364
- vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
365
- mean, std = vals.mean(), vals.std()
366
- df[f"norm_{col}"] = normalize_values(vals, mean, std)
 
 
 
 
 
 
 
 
 
 
367
 
368
  # Group players by prefix and sort alphabetically
369
  model_groups = {}
@@ -411,12 +427,23 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
411
  hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
412
  ))
413
 
 
 
 
 
 
 
 
 
 
 
 
414
  fig.update_layout(
415
  autosize=True,
416
  height=550, # Reduced height for better proportion with legend
417
  margin=dict(l=400, r=100, t=20, b=20),
418
  title=dict(
419
- text="AI Normalized Performance Across Games",
420
  x=0.5,
421
  xanchor='center',
422
  yanchor='top',
@@ -462,12 +489,20 @@ def create_single_radar_chart(df, selected_games=None, highlight_models=None):
462
 
463
  return fig
464
 
465
- def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None):
466
- df = get_combined_leaderboard(rank_data, selected_games)
 
 
 
 
 
467
  selected_game_names = [g for g, sel in selected_games.items() if sel]
468
- # Create a copy for visualization to avoid modifying the original
 
469
  df_viz = df.copy()
470
- return df, create_single_radar_chart(df_viz, selected_game_names, highlight_models)
 
 
471
 
472
  def create_organization_radar_chart(rank_data):
473
  df = get_combined_leaderboard(rank_data, {g: True for g in GAME_ORDER})
@@ -477,7 +512,7 @@ def create_organization_radar_chart(rank_data):
477
 
478
  avg_df = pd.DataFrame([
479
  {
480
- **{col: df[df["Organization"] == org][col].replace("n/a", 0).infer_objects(copy=False).astype(float).mean() for col in game_cols},
481
  "Organization": org
482
  }
483
  for org in orgs
@@ -533,7 +568,10 @@ def create_top_players_radar_chart(rank_data, n=5):
533
 
534
  for col in game_cols:
535
  # Replace "n/a" with 0 and handle downcasting properly
536
- vals = top_df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
 
 
 
537
  mean, std = vals.mean(), vals.std()
538
  top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
539
 
@@ -589,8 +627,15 @@ def create_player_radar_chart(rank_data, player_name):
589
 
590
  for col in game_cols:
591
  # Replace "n/a" with 0 and handle downcasting properly
592
- vals = player_df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
593
- mean, std = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float).mean(), df[col].replace("n/a", 0).infer_objects(copy=False).astype(float).std()
 
 
 
 
 
 
 
594
  player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
595
 
596
  fig = go.Figure()
@@ -628,6 +673,281 @@ def create_player_radar_chart(rank_data, player_name):
628
  )
629
  return fig
630
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
 
632
  def save_visualization(fig, filename):
633
- fig.write_image(filename)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  import pandas as pd
4
  import json
5
+ import os
6
+ from datetime import datetime
7
  from leaderboard_utils import (
8
  get_combined_leaderboard,
9
  GAME_ORDER
10
  )
11
 
12
  # Load model colors
13
+ with open('assets/model_color.json', 'r', encoding='utf-8') as f:
14
  MODEL_COLORS = json.load(f)
15
 
16
  GAME_SCORE_COLUMNS = {
 
128
  categories = [c.replace(" Score", "") for c in game_cols]
129
 
130
  for col in game_cols:
131
+ vals = df[col].replace("n/a", 0).infer_objects(copy=False).astype(float)
132
  mean, std = vals.mean(), vals.std()
133
  df[f"norm_{col}"] = normalize_values(vals, mean, std)
134
 
 
181
  df_viz = df.copy()
182
  return df, create_radar_charts(df_viz)
183
 
184
+ def create_group_bar_chart(df, top_n=5):
185
  game_cols = {}
186
  for game in GAME_ORDER:
187
  col = f"{game} Score"
 
332
 
333
 
334
 
335
+ def get_combined_leaderboard_with_group_bar(rank_data, selected_games, top_n=5, limit_to_top_n=None):
336
+ df = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n)
337
  # Create a copy for visualization to avoid modifying the original
338
  df_viz = df.copy()
339
  return df, create_group_bar_chart(df_viz, top_n)
 
346
  return f'rgba({r}, {g}, {b}, {alpha})'
347
 
348
 
349
+ def create_single_radar_chart(df, selected_games=None, highlight_models=None, chart_title=None, top_n=None, full_df=None):
350
  if selected_games is None:
351
  selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']
352
 
 
361
  game_cols = [f"{game} Score" for game in selected_games]
362
  categories = formatted_games
363
 
364
+ # Use full dataset for normalization to keep consistent scale
365
+ # If full_df is not provided, use the current df (fallback for backward compatibility)
366
+ normalization_df = full_df if full_df is not None else df
367
+
368
+ # Normalize using the full dataset but apply to the limited df
369
  for col in game_cols:
370
+ # Get normalization parameters from full dataset
371
+ # Use where() to avoid FutureWarning about downcasting in replace()
372
+ full_series = normalization_df[col].copy()
373
+ full_series = full_series.where(full_series != "n/a", 0)
374
+ full_vals = full_series.astype(float)
375
+ mean, std = full_vals.mean(), full_vals.std()
376
+
377
+ # Apply normalization to the limited df
378
+ # Use where() to avoid FutureWarning about downcasting in replace()
379
+ limited_series = df[col].copy()
380
+ limited_series = limited_series.where(limited_series != "n/a", 0)
381
+ limited_vals = limited_series.astype(float)
382
+ df[f"norm_{col}"] = normalize_values(limited_vals, mean, std)
383
 
384
  # Group players by prefix and sort alphabetically
385
  model_groups = {}
 
427
  hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
428
  ))
429
 
430
+ # Dynamic title based on the data source and top_n
431
+ if chart_title is None:
432
+ if top_n is not None:
433
+ chart_title = f"Radar Chart - Top {top_n} Performers by Game"
434
+ else:
435
+ # Fallback title
436
+ if len(df) <= 10:
437
+ chart_title = "🎮 Agent Performance Across Games"
438
+ else:
439
+ chart_title = "🤖 Model Performance Across Games"
440
+
441
  fig.update_layout(
442
  autosize=True,
443
  height=550, # Reduced height for better proportion with legend
444
  margin=dict(l=400, r=100, t=20, b=20),
445
  title=dict(
446
+ text=chart_title,
447
  x=0.5,
448
  xanchor='center',
449
  yanchor='top',
 
489
 
490
  return fig
491
 
492
+ def get_combined_leaderboard_with_single_radar(rank_data, selected_games, highlight_models=None, limit_to_top_n=None, chart_title=None, top_n=None):
493
+ # Get full dataset for normalization
494
+ full_df = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None)
495
+
496
+ # Get limited dataset for display
497
+ df = get_combined_leaderboard(rank_data, selected_games, limit_to_top_n)
498
+
499
  selected_game_names = [g for g, sel in selected_games.items() if sel]
500
+
501
+ # Create copies for visualization to avoid modifying the original
502
  df_viz = df.copy()
503
+ full_df_viz = full_df.copy()
504
+
505
+ return df, create_single_radar_chart(df_viz, selected_game_names, highlight_models, chart_title, top_n, full_df_viz)
506
 
507
  def create_organization_radar_chart(rank_data):
508
  df = get_combined_leaderboard(rank_data, {g: True for g in GAME_ORDER})
 
512
 
513
  avg_df = pd.DataFrame([
514
  {
515
+ **{col: df[df["Organization"] == org][col].where(df[df["Organization"] == org][col] != "n/a", 0).astype(float).mean() for col in game_cols},
516
  "Organization": org
517
  }
518
  for org in orgs
 
568
 
569
  for col in game_cols:
570
  # Replace "n/a" with 0 and handle downcasting properly
571
+ # Use where() to avoid FutureWarning about downcasting in replace()
572
+ series = top_df[col].copy()
573
+ series = series.where(series != "n/a", 0)
574
+ vals = series.astype(float)
575
  mean, std = vals.mean(), vals.std()
576
  top_df[f"norm_{col}"] = normalize_values(vals, mean, std)
577
 
 
627
 
628
  for col in game_cols:
629
  # Replace "n/a" with 0 and handle downcasting properly
630
+ # Use where() to avoid FutureWarning about downcasting in replace()
631
+ player_series = player_df[col].copy()
632
+ player_series = player_series.where(player_series != "n/a", 0)
633
+ vals = player_series.astype(float)
634
+
635
+ df_series = df[col].copy()
636
+ df_series = df_series.where(df_series != "n/a", 0)
637
+ df_vals = df_series.astype(float)
638
+ mean, std = df_vals.mean(), df_vals.std()
639
  player_df[f"norm_{col}"] = normalize_values(vals, mean, std)
640
 
641
  fig = go.Figure()
 
673
  )
674
  return fig
675
 
676
def save_normalized_data(df, selected_games, filename="normalized_data.json"):
    """
    Save normalized data to a JSON file for caching

    For every "<game> Score" column of ``df`` whose game is listed in
    GAME_ORDER, the column mean/std (with "n/a" counted as 0) and the raw
    scores are stored under "games"; each player's raw and normalized score
    per game is stored under "players".

    Args:
        df (pd.DataFrame): DataFrame with raw scores; must have a "Player"
            column, "Organization" is optional (defaults to "unknown")
        selected_games (dict): Dictionary of selected games (stored as-is)
        filename (str): Output filename, created inside the "cache" directory

    Returns:
        str: Path of the file that was written
    """
    game_cols = [f"{game} Score" for game in GAME_ORDER if f"{game} Score" in df.columns]

    # Calculate normalization parameters and normalized values
    normalization_data = {
        "timestamp": datetime.now().isoformat(),
        "selected_games": selected_games,
        "games": {},
        "players": {}
    }

    # Store normalization parameters per game
    for col in game_cols:
        game_name = col.replace(" Score", "")
        # Use where() to avoid FutureWarning about downcasting in replace()
        series = df[col]
        vals = series.where(series != "n/a", 0).astype(float)

        normalization_data["games"][game_name] = {
            # Plain floats so json.dump never receives a NumPy scalar type
            "mean": float(vals.mean()),
            "std": float(vals.std()),
            "raw_scores": vals.to_dict()
        }

    # Store normalized scores per player
    for _, row in df.iterrows():
        player = row["Player"]
        player_data = {"organization": row.get("Organization", "unknown")}

        for col in game_cols:
            game_name = col.replace(" Score", "")
            raw_score = row[col]

            # A missing score is either the "n/a" marker or NaN; both are
            # cached as "n/a" with a normalized value of 0 so that the file
            # never contains a NaN for a player's score.
            if raw_score == "n/a" or pd.isna(raw_score):
                raw_score = "n/a"
                normalized = 0
            else:
                raw_score = float(raw_score)
                game_stats = normalization_data["games"][game_name]
                normalized = float(
                    normalize_values([raw_score], game_stats["mean"], game_stats["std"])[0]
                )

            player_data[f"{game_name}_raw"] = raw_score
            player_data[f"{game_name}_normalized"] = normalized

        normalization_data["players"][player] = player_data

    # Save to file
    os.makedirs("cache", exist_ok=True)
    filepath = os.path.join("cache", filename)

    # Explicit encoding keeps the cache readable regardless of the platform
    # default, matching the other open() calls in this module.
    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(normalization_data, f, indent=2)

    print(f"Normalized data saved to {filepath}")
    return filepath
739
+
740
def load_normalized_data(filename="normalized_data.json"):
    """
    Load normalized data from a JSON file

    Args:
        filename (str): Input filename, looked up inside the "cache" directory

    Returns:
        dict: Normalized data, or None if the file doesn't exist or cannot
        be read / parsed (an error message is printed in the latter case)
    """
    filepath = os.path.join("cache", filename)

    try:
        # Open directly instead of checking os.path.exists() first: the file
        # could disappear between the check and the open.
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing cache is an expected situation, so stay silent.
        return None
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading normalized data: {e}")
        return None

    print(f"Normalized data loaded from {filepath}")
    return data
763
+
764
def get_normalized_scores_from_cache(players, games, cache_data):
    """
    Extract normalized scores from cached data

    Players that are not present in the cache are left out of the result.
    A game missing from a player's cache entry is reported as "n/a" with a
    normalized score of 0.

    Args:
        players (list): List of player names
        games (list): List of game names
        cache_data (dict): Cached normalization data

    Returns:
        pd.DataFrame: DataFrame with a "Player" column followed by
        "<game> Score" and "norm_<game> Score" columns for every game.
        The columns are present even when no player is found, so callers
        can always merge on "Player".
    """
    # Fix the column layout up front; dict.fromkeys() drops duplicates while
    # keeping order, in case the same game is listed twice.
    columns = ["Player"]
    for game in games:
        columns.extend((f"{game} Score", f"norm_{game} Score"))
    columns = list(dict.fromkeys(columns))

    cached_players = cache_data.get("players", {})
    data = []

    for player in players:
        player_cache = cached_players.get(player)
        if player_cache is None:
            continue

        player_data = {"Player": player}
        for game in games:
            raw_key = f"{game}_raw"
            norm_key = f"{game}_normalized"

            if raw_key in player_cache:
                player_data[f"{game} Score"] = player_cache[raw_key]
                # Tolerate an entry that has a raw score but no normalized one
                player_data[f"norm_{game} Score"] = player_cache.get(norm_key, 0)
            else:
                player_data[f"{game} Score"] = "n/a"
                player_data[f"norm_{game} Score"] = 0

        data.append(player_data)

    return pd.DataFrame(data, columns=columns)
797
 
798
def save_visualization(fig, filename):
    """
    Write a figure to disk by calling ``fig.write_image(filename)``.

    Args:
        fig: Object exposing ``write_image`` (presumably a plotly figure,
            as built elsewhere in this module -- TODO confirm)
        filename (str): Destination path passed through to ``write_image``
    """
    fig.write_image(filename)
800
+
801
def generate_and_save_normalized_data(rank_data, filename="normalized_data.json"):
    """
    Build the combined leaderboard with every game enabled and cache its
    normalized scores.

    Args:
        rank_data (dict): Raw rank data
        filename (str): Output filename, forwarded to save_normalized_data

    Returns:
        str: Path to saved file
    """
    # Every game in GAME_ORDER is marked as selected
    every_game_selected = dict.fromkeys(GAME_ORDER, True)

    # Combined leaderboard over all games, then persisted as the cache
    leaderboard = get_combined_leaderboard(rank_data, every_game_selected)
    return save_normalized_data(leaderboard, every_game_selected, filename)
820
+
821
def create_single_radar_chart_with_cache(df, selected_games=None, highlight_models=None, use_cache=True, cache_filename="normalized_data.json"):
    """
    Create radar chart with optional caching support

    Normalized scores are taken from the cache file when ``use_cache`` is
    true, the file can be loaded, and it contains at least one of the players
    in ``df``. Otherwise the scores are normalized on the fly from ``df``.

    Args:
        df (pd.DataFrame): Leaderboard rows with a "Player" column and a
            "<game> Score" column for every selected game; the cache path
            additionally reads the "Organization" column
        selected_games (list, optional): Game names to plot; defaults to
            Super Mario Bros, 2048, Candy Crush, Sokoban and Ace Attorney
        highlight_models (list, optional): Player names drawn in red with a
            thicker line and larger markers
        use_cache (bool): Whether to try the cached normalization data first
        cache_filename (str): Cache file name passed to load_normalized_data

    Returns:
        go.Figure: Radar chart with one trace per player
    """
    if selected_games is None:
        selected_games = ['Super Mario Bros', '2048', 'Candy Crush', 'Sokoban', 'Ace Attorney']

    # Try to load from cache first
    df_normalized = None
    if use_cache:
        cached_data = load_normalized_data(cache_filename)
        if cached_data:
            players = df["Player"].tolist()
            cached_df = get_normalized_scores_from_cache(players, selected_games, cached_data)
            # A cache that knows none of these players yields an empty frame;
            # merging it on "Player" would fail, so fall through to the
            # on-the-fly path instead.
            if not cached_df.empty:
                # Merge with original df to get Organization info
                df_normalized = cached_df.merge(df[["Player", "Organization"]], on="Player", how="left")

    if df_normalized is None:
        # Fall back to on-the-fly normalization
        df_normalized = df.copy()
        for col in [f"{game} Score" for game in selected_games]:
            # Use where() to avoid FutureWarning about downcasting in replace()
            series = df_normalized[col]
            vals = series.where(series != "n/a", 0).astype(float)
            mean, std = vals.mean(), vals.std()
            df_normalized[f"norm_{col}"] = normalize_values(vals, mean, std)

    # Format game names
    categories = ['SMB' if game == 'Super Mario Bros' else game for game in selected_games]

    # Group players by prefix and sort alphabetically
    model_groups = {}
    for player in df_normalized["Player"]:
        prefix = get_model_prefix(player)
        model_groups.setdefault(prefix, []).append(player)

    # Sort each group alphabetically
    for prefix in model_groups:
        model_groups[prefix] = sorted(model_groups[prefix], key=str.lower)

    # Get sorted prefixes and create ordered player list
    sorted_prefixes = sorted(model_groups.keys(), key=str.lower)
    grouped_players = []
    for prefix in sorted_prefixes:
        grouped_players.extend(model_groups[prefix])

    fig = go.Figure()

    for player in grouped_players:
        row = df_normalized[df_normalized["Player"] == player]
        if row.empty:
            continue
        row = row.iloc[0]

        is_highlighted = highlight_models and player in highlight_models
        color = 'red' if is_highlighted else MODEL_COLORS.get(player, '#808080')
        fillcolor = 'rgba(255, 0, 0, 0.4)' if is_highlighted else hex_to_rgba(color, 0.2)

        # Both the cached and the on-the-fly path store the normalized values
        # under the same "norm_<game> Score" columns.
        r = [row[f"norm_{game} Score"] for game in selected_games]

        display_name = player.lower()

        fig.add_trace(go.Scatterpolar(
            # Repeat the first point so the polygon is closed
            r=r + [r[0]],
            theta=categories + [categories[0]],
            mode='lines+markers',
            fill='toself',
            name=display_name,
            line=dict(color=color, width=6 if is_highlighted else 2),
            marker=dict(color=color, size=10 if is_highlighted else 6),
            fillcolor=fillcolor,
            opacity=1.0 if is_highlighted else 0.7,
            hovertemplate='<b>%{fullData.name}</b><br>Game: %{theta}<br>Score: %{r:.1f}<extra></extra>'
        ))

    fig.update_layout(
        autosize=True,
        height=550,
        margin=dict(l=400, r=100, t=20, b=20),
        title=dict(
            text="AI Normalized Performance Across Games",
            x=0.5,
            xanchor='center',
            yanchor='top',
            y=0.95,
            font=dict(size=20),
            pad=dict(b=20)
        ),
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100],
                tickangle=45,
                tickfont=dict(size=12),
                gridcolor='lightgray',
                gridwidth=1,
                angle=45
            ),
            angularaxis=dict(
                tickfont=dict(size=14, weight='bold'),
                tickangle=0
            )
        ),
        legend=dict(
            font=dict(size=12),
            title="Choose your model 💡 (click / double-click)",
            itemsizing='trace',
            x=-1.4,
            y=0.8,
            yanchor='top',
            xanchor='left',
            bgcolor='rgba(255,255,255,0.6)',
            bordercolor='gray',
            borderwidth=1,
            itemclick="toggleothers",
            itemdoubleclick="toggle"
        )
    )

    return fig
generate_normalized_cache.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to generate normalized data cache for faster visualization loading.
4
+
5
+ Usage:
6
+ python generate_normalized_cache.py [input_file] [output_file]
7
+
8
+ Example:
9
+ python generate_normalized_cache.py data/rank_data.json normalized_data.json
10
+ """
11
+
12
+ import sys
13
+ import json
14
+ from data_visualization import generate_and_save_normalized_data, load_normalized_data
15
+
16
def main():
    """
    Generate the normalized-data cache from a rank data JSON file.

    Reads the optional command line arguments ``[input_file] [output_file]``,
    builds the cache via generate_and_save_normalized_data, then reloads it
    with load_normalized_data to verify it and prints a short summary.

    Returns:
        int: 0 on success, 1 if the input file is missing, any step raised,
        or the saved cache could not be loaded back. The value is used as the
        process exit status so callers (shell scripts, CI) can detect failure.
    """
    # Default files
    input_file = "data/rank_data.json"  # Update this path as needed
    output_file = "normalized_data.json"

    # Handle command line arguments
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]

    try:
        # Load rank data
        print(f"Loading rank data from {input_file}...")
        with open(input_file, 'r', encoding='utf-8') as f:
            rank_data = json.load(f)

        # Generate and save normalized data
        print("Generating normalized data...")
        saved_path = generate_and_save_normalized_data(rank_data, output_file)

        # Verify the saved data
        print("Verifying saved data...")
        cached_data = load_normalized_data(output_file)

        if not cached_data:
            print("❌ Failed to verify cached data")
            return 1

        print("✅ Successfully generated normalized data cache!")
        print(f"📁 Saved to: {saved_path}")
        print(f"🎮 Games included: {list(cached_data['games'].keys())}")
        print(f"👥 Players included: {len(cached_data['players'])}")
        print(f"📅 Generated at: {cached_data['timestamp']}")
        return 0

    except FileNotFoundError:
        print(f"❌ Error: Could not find input file '{input_file}'")
        print("Please check the file path and try again.")
        return 1
    except Exception as e:
        # Top-level boundary of a command line script: report and signal
        # failure through the exit status instead of a traceback.
        print(f"❌ Error: {str(e)}")
        return 1


if __name__ == "__main__":
    sys.exit(main())
leaderboard_utils.py CHANGED
@@ -32,7 +32,7 @@ def get_organization(model_name):
32
  return "unknown"
33
 
34
 
35
- def get_sokoban_leaderboard(rank_data):
36
  data = rank_data.get("Sokoban", {}).get("results", [])
37
  df = pd.DataFrame(data)
38
  df = df.rename(columns={
@@ -53,9 +53,12 @@ def get_sokoban_leaderboard(rank_data):
53
  if "Score" in df.columns:
54
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
55
  df = df.sort_values("Score", ascending=False)
 
 
 
56
  return df
57
 
58
- def get_2048_leaderboard(rank_data):
59
  data = rank_data.get("2048", {}).get("results", [])
60
  # --- Diagnostic Print Removed ---
61
  # if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
@@ -108,9 +111,12 @@ def get_2048_leaderboard(rank_data):
108
  if "Score" in df.columns:
109
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
110
  df = df.sort_values("Score", ascending=False)
 
 
 
111
  return df
112
 
113
- def get_candy_leaderboard(rank_data):
114
  data = rank_data.get("Candy Crush", {}).get("results", [])
115
  df = pd.DataFrame(data)
116
  df = df.rename(columns={
@@ -127,9 +133,12 @@ def get_candy_leaderboard(rank_data):
127
  if "Score" in df.columns:
128
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
129
  df = df.sort_values("Score", ascending=False)
 
 
 
130
  return df
131
 
132
- def get_tetris_planning_leaderboard(rank_data):
133
  data = rank_data.get("Tetris", {}).get("results", [])
134
  df = pd.DataFrame(data)
135
  df = df.rename(columns={
@@ -147,9 +156,12 @@ def get_tetris_planning_leaderboard(rank_data):
147
  if "Score" in df.columns:
148
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
149
  df = df.sort_values("Score", ascending=False)
 
 
 
150
  return df
151
 
152
- def get_ace_attorney_leaderboard(rank_data):
153
  data = rank_data.get("Ace Attorney", {}).get("results", [])
154
  df = pd.DataFrame(data)
155
  df = df.rename(columns={
@@ -168,9 +180,12 @@ def get_ace_attorney_leaderboard(rank_data):
168
  if "Score" in df.columns:
169
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
170
  df = df.sort_values("Score", ascending=False) # Higher score is better
 
 
 
171
  return df
172
 
173
- def get_mario_planning_leaderboard(rank_data):
174
  data = rank_data.get("Super Mario Bros", {}).get("results", [])
175
  df = pd.DataFrame(data)
176
  df = df.rename(columns={
@@ -188,6 +203,9 @@ def get_mario_planning_leaderboard(rank_data):
188
  if "Score" in df.columns:
189
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
190
  df = df.sort_values("Score", ascending=False)
 
 
 
191
  return df
192
 
193
  def calculate_rank_and_completeness(rank_data, selected_games):
@@ -285,13 +303,14 @@ def calculate_rank_and_completeness(rank_data, selected_games):
285
 
286
  return df_results
287
 
288
- def get_combined_leaderboard(rank_data, selected_games):
289
  """
290
  Get combined leaderboard for selected games
291
 
292
  Args:
293
  rank_data (dict): Dictionary containing rank data
294
  selected_games (dict): Dictionary of game names and their selection status
 
295
 
296
  Returns:
297
  pd.DataFrame: Combined leaderboard DataFrame
@@ -358,20 +377,64 @@ def get_combined_leaderboard(rank_data, selected_games):
358
  # Create DataFrame
359
  df_results = pd.DataFrame(results)
360
 
361
- # Sort by total score across all games
362
  if not df_results.empty:
363
- # Calculate total score for each player
364
- df_results["Total Score"] = 0
 
 
 
365
  for game in GAME_ORDER:
366
- if f"{game} Score" in df_results.columns:
367
- df_results["Total Score"] += df_results[f"{game} Score"].apply(
368
- lambda x: float(x) if x != 'n/a' else 0
369
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
370
 
371
- # Sort by total score in descending order
372
- df_results = df_results.sort_values("Total Score", ascending=False)
373
 
374
- # Drop the temporary total score column
375
- df_results = df_results.drop("Total Score", axis=1)
 
376
 
377
  return df_results
 
32
  return "unknown"
33
 
34
 
35
+ def get_sokoban_leaderboard(rank_data, limit_to_top_n=None):
36
  data = rank_data.get("Sokoban", {}).get("results", [])
37
  df = pd.DataFrame(data)
38
  df = df.rename(columns={
 
53
  if "Score" in df.columns:
54
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
55
  df = df.sort_values("Score", ascending=False)
56
+ # Apply limit if specified
57
+ if limit_to_top_n is not None:
58
+ df = df.head(limit_to_top_n)
59
  return df
60
 
61
+ def get_2048_leaderboard(rank_data, limit_to_top_n=None):
62
  data = rank_data.get("2048", {}).get("results", [])
63
  # --- Diagnostic Print Removed ---
64
  # if data and isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict):
 
111
  if "Score" in df.columns:
112
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
113
  df = df.sort_values("Score", ascending=False)
114
+ # Apply limit if specified
115
+ if limit_to_top_n is not None:
116
+ df = df.head(limit_to_top_n)
117
  return df
118
 
119
+ def get_candy_leaderboard(rank_data, limit_to_top_n=None):
120
  data = rank_data.get("Candy Crush", {}).get("results", [])
121
  df = pd.DataFrame(data)
122
  df = df.rename(columns={
 
133
  if "Score" in df.columns:
134
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
135
  df = df.sort_values("Score", ascending=False)
136
+ # Apply limit if specified
137
+ if limit_to_top_n is not None:
138
+ df = df.head(limit_to_top_n)
139
  return df
140
 
141
+ def get_tetris_planning_leaderboard(rank_data, limit_to_top_n=None):
142
  data = rank_data.get("Tetris", {}).get("results", [])
143
  df = pd.DataFrame(data)
144
  df = df.rename(columns={
 
156
  if "Score" in df.columns:
157
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
158
  df = df.sort_values("Score", ascending=False)
159
+ # Apply limit if specified
160
+ if limit_to_top_n is not None:
161
+ df = df.head(limit_to_top_n)
162
  return df
163
 
164
+ def get_ace_attorney_leaderboard(rank_data, limit_to_top_n=None):
165
  data = rank_data.get("Ace Attorney", {}).get("results", [])
166
  df = pd.DataFrame(data)
167
  df = df.rename(columns={
 
180
  if "Score" in df.columns:
181
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
182
  df = df.sort_values("Score", ascending=False) # Higher score is better
183
+ # Apply limit if specified
184
+ if limit_to_top_n is not None:
185
+ df = df.head(limit_to_top_n)
186
  return df
187
 
188
+ def get_mario_planning_leaderboard(rank_data, limit_to_top_n=None):
189
  data = rank_data.get("Super Mario Bros", {}).get("results", [])
190
  df = pd.DataFrame(data)
191
  df = df.rename(columns={
 
203
  if "Score" in df.columns:
204
  df["Score"] = pd.to_numeric(df["Score"], errors='coerce')
205
  df = df.sort_values("Score", ascending=False)
206
+ # Apply limit if specified
207
+ if limit_to_top_n is not None:
208
+ df = df.head(limit_to_top_n)
209
  return df
210
 
211
  def calculate_rank_and_completeness(rank_data, selected_games):
 
303
 
304
  return df_results
305
 
306
+ def get_combined_leaderboard(rank_data, selected_games, limit_to_top_n=None):
307
  """
308
  Get combined leaderboard for selected games
309
 
310
  Args:
311
  rank_data (dict): Dictionary containing rank data
312
  selected_games (dict): Dictionary of game names and their selection status
313
+ limit_to_top_n (int, optional): Limit results to top N entries. None means no limit.
314
 
315
  Returns:
316
  pd.DataFrame: Combined leaderboard DataFrame
 
377
  # Create DataFrame
378
  df_results = pd.DataFrame(results)
379
 
380
+ # Calculate normalized scores and average normalized score
381
  if not df_results.empty:
382
+ # Import the normalize_values function from data_visualization
383
+ from data_visualization import normalize_values
384
+
385
+ # Calculate normalized scores for each game
386
+ game_score_columns = []
387
  for game in GAME_ORDER:
388
+ score_col = f"{game} Score"
389
+ if score_col in df_results.columns:
390
+ game_score_columns.append(score_col)
391
+ # Get numeric values, replacing 'n/a' with NaN
392
+ # Use where() to avoid FutureWarning about downcasting in replace()
393
+ series = df_results[score_col].copy()
394
+ series = series.where(series != 'n/a', np.nan)
395
+ numeric_scores = pd.to_numeric(series, errors='coerce')
396
+
397
+ # Skip games where all scores are NaN or 0
398
+ valid_scores = numeric_scores.dropna()
399
+ if len(valid_scores) > 0 and valid_scores.sum() > 0:
400
+ mean = valid_scores.mean()
401
+ std = valid_scores.std() if len(valid_scores) > 1 else 0
402
+
403
+ # Calculate normalized scores for all players
404
+ normalized_scores = []
405
+ for _, row in df_results.iterrows():
406
+ score = row[score_col]
407
+ if score == 'n/a' or pd.isna(score):
408
+ normalized_scores.append(0)
409
+ else:
410
+ normalized_scores.append(normalize_values([float(score)], mean, std)[0])
411
+
412
+ df_results[f"norm_{score_col}"] = normalized_scores
413
+ else:
414
+ # If no valid scores, set all normalized scores to 0
415
+ df_results[f"norm_{score_col}"] = 0
416
+
417
+ # Calculate average normalized score across games
418
+ normalized_columns = [f"norm_{col}" for col in game_score_columns if f"norm_{col}" in df_results.columns]
419
+ if normalized_columns:
420
+ df_results["Avg Normalized Score"] = df_results[normalized_columns].mean(axis=1).round(2)
421
+ else:
422
+ df_results["Avg Normalized Score"] = 0.0
423
+
424
+ # Reorder columns to put Avg Normalized Score after Organization
425
+ base_columns = ["Player", "Organization", "Avg Normalized Score"]
426
+ game_columns = [col for col in df_results.columns if col.endswith(" Score") and not col.startswith("norm_") and col != "Avg Normalized Score"]
427
+ other_columns = [col for col in df_results.columns if col not in base_columns + game_columns and not col.startswith("norm_")]
428
+
429
+ # Create final column order
430
+ final_columns = base_columns + game_columns + other_columns
431
+ df_results = df_results[final_columns]
432
 
433
+ # Sort by average normalized score in descending order
434
+ df_results = df_results.sort_values("Avg Normalized Score", ascending=False)
435
 
436
+ # Apply limit if specified
437
+ if limit_to_top_n is not None:
438
+ df_results = df_results.head(limit_to_top_n)
439
 
440
  return df_results
rank_data_03_25_2025.json CHANGED
@@ -3,61 +3,61 @@
3
  "runs": 3,
4
  "results": [
5
  {
6
- "model": "gamingagent + claude-3-5-sonnet-20241022",
7
  "score": 1267.7,
8
  "detail_data": "709,1532,1562",
9
  "progress": "1-1"
10
  },
11
  {
12
- "model": "gamingagent + claude-3-7-sonnet-20250219",
13
  "score": 1418.7,
14
  "detail_data": "2015,709,1532",
15
  "progress": "1-1"
16
  },
17
  {
18
- "model": "gamingagent + gemini-2.5-flash-preview-04-17",
19
  "score": 1385.0,
20
  "detail_data": "1672,1266,1247",
21
  "progress": "1-1"
22
  },
23
  {
24
- "model": "gamingagent + gemini-2.5-pro-preview-05-06",
25
  "score": 1498.3,
26
  "detail_data": "1561,1271,1663",
27
  "progress": "1-1"
28
  },
29
  {
30
- "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
31
  "score": 1468.7,
32
  "detail_data": "898,2008,1500",
33
  "progress": "1-1"
34
  },
35
  {
36
- "model": "gamingagent + gpt-4.1-2025-04-14",
37
  "score": 2126.3,
38
  "detail_data": "1531,722,4126",
39
  "progress": "1-1"
40
  },
41
  {
42
- "model": "gamingagent + gpt-4o-2024-11-20",
43
  "score": 2047.3,
44
  "detail_data": "2017,2590,1535",
45
  "progress": "1-1"
46
  },
47
  {
48
- "model": "gamingagent + o1-2024-12-17",
49
  "score": 855,
50
  "detail_data": "855",
51
  "progress": "1-1"
52
  },
53
  {
54
- "model": "gamingagent + o3-2025-04-16",
55
  "score": 3445,
56
  "detail_data": "3445",
57
  "progress": "1-1"
58
  },
59
  {
60
- "model": "gamingagent + o4-mini-2025-04-16",
61
  "score": 1448.0,
62
  "detail_data": "1525,1263,1556",
63
  "progress": "1-1"
@@ -74,79 +74,79 @@
74
  "runs": 3,
75
  "results": [
76
  {
77
- "model": "gamingagent + claude-3-5-sonnet-20241022",
78
  "score": 1914.67,
79
  "details": "1352,2860,1532",
80
  "highest_tail": 256
81
  },
82
  {
83
- "model": "gamingagent + claude-3-7-sonnet-20250219",
84
  "score": 2624,
85
  "details": "2560,3224,2088",
86
  "highest_tail": 256
87
  },
88
  {
89
- "model": "gamingagent + deepseek-r1-0120",
90
  "score": 1873.33,
91
  "details": "700,1240,3680",
92
  "highest_tail": 256
93
  },
94
  {
95
- "model": "gamingagent + gemini-2.5-flash-preview-04-17",
96
  "score": 1697.33,
97
  "details": "1304,1316,2472",
98
  "highest_tail": 256
99
  },
100
  {
101
- "model": "gamingagent + gemini-2.5-pro-preview-05-06",
102
  "score": 3586.67,
103
  "details": "5300,2400,3060",
104
  "highest_tail": 512
105
  },
106
  {
107
- "model": "gamingagent + grok-3-mini-beta",
108
  "score": 4036,
109
  "details": "6412,2492,3204",
110
  "highest_tail": 512
111
  },
112
  {
113
- "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
114
  "score": 1586.67,
115
  "details": "1404,1272,2084",
116
  "highest_tail": 128
117
  },
118
  {
119
- "model": "gamingagent + gpt-4.1-2025-04-14",
120
  "score": 1656,
121
  "details": "1156,2664,1148",
122
  "highest_tail": 256
123
  },
124
  {
125
- "model": "gamingagent + gpt-4o-2024-11-20",
126
  "score": 1656,
127
  "details": "1604,1284,2080",
128
  "highest_tail": 256
129
  },
130
  {
131
- "model": "gamingagent + o1-2024-12-17",
132
  "score": 7580,
133
  "details": "7580",
134
  "highest_tail": 512
135
  },
136
  {
137
- "model": "gamingagent + o1-mini-2024-09-12",
138
  "score": 2757.33,
139
  "details": "3132,2004,3136",
140
  "highest_tail": 256
141
  },
142
  {
143
- "model": "gamingagent + o3-2025-04-16",
144
  "score": 7120,
145
  "details": "7120",
146
  "highest_tail": 512
147
  },
148
  {
149
- "model": "gamingagent + o4-mini-2025-04-16",
150
  "score": 4432.0,
151
  "details": "4928,5456,2912",
152
  "highest_tail": 512
@@ -158,25 +158,25 @@
158
  "highest_tail": 128
159
  },
160
  {
161
- "model": "gamingagent + claude-opus-4-20250514",
162
  "score": 3036.0,
163
  "details": "3036.0",
164
  "highest_tail": 256
165
  },
166
  {
167
- "model": "gamingagent + claude-sonnet-4-20250514",
168
  "score": 3136,
169
  "details": "2148,2360,4900",
170
  "highest_tail": 256
171
  },
172
  {
173
- "model": "gamingagent + deepseek-r1-0528",
174
  "score": 3330.0,
175
  "details": "3260,3400",
176
  "highest_tail": 256
177
  },
178
  {
179
- "model": "gamingagent + qwen3-235B-A22B-fp8",
180
  "score": 2144.0,
181
  "details": "1436,2556,2440",
182
  "highest_tail": 256
@@ -187,67 +187,67 @@
187
  "runs": 3,
188
  "results": [
189
  {
190
- "model": "gamingagent + claude-3-5-sonnet-20241022",
191
  "score": 14.7,
192
  "details": "16,14,14"
193
  },
194
  {
195
- "model": "gamingagent + claude-3-7-sonnet-20250219",
196
  "score": 16.3,
197
  "details": "19,15,15"
198
  },
199
  {
200
- "model": "gamingagent + deepseek-r1-0120",
201
  "score": 14.3,
202
  "details": "15,14,14"
203
  },
204
  {
205
- "model": "gamingagent + gemini-2.5-flash-preview-04-17",
206
  "score": 16.3,
207
  "details": "20,14,15"
208
  },
209
  {
210
- "model": "gamingagent + gemini-2.5-pro-preview-05-06",
211
  "score": 23.3,
212
  "details": "23,23,24"
213
  },
214
  {
215
- "model": "gamingagent + grok-3-mini-beta",
216
  "score": 21.3,
217
  "details": "20,15,29"
218
  },
219
  {
220
- "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
221
  "score": 10.3,
222
  "details": "9,10,12"
223
  },
224
  {
225
- "model": "gamingagent + gpt-4.1-2025-04-14",
226
  "score": 13.7,
227
  "details": "13,14,14"
228
  },
229
  {
230
- "model": "gamingagent + gpt-4o-2024-11-20",
231
  "score": 14,
232
  "details": "18,11,13"
233
  },
234
  {
235
- "model": "gamingagent + o1-2024-12-17",
236
  "score": 35,
237
  "details": "35"
238
  },
239
  {
240
- "model": "gamingagent + o1-mini-2024-09-12",
241
  "score": 11.7,
242
  "details": "11,11,13"
243
  },
244
  {
245
- "model": "gamingagent + o3-2025-04-16",
246
  "score": 42,
247
  "details": "42"
248
  },
249
  {
250
- "model": "gamingagent + o4-mini-2025-04-16",
251
  "score": 25.3,
252
  "details": "22,35,19"
253
  },
@@ -257,22 +257,22 @@
257
  "details": ""
258
  },
259
  {
260
- "model": "gamingagent + claude-opus-4-20250514",
261
  "score": 20,
262
  "details": "17,18,25"
263
  },
264
  {
265
- "model": "gamingagent + claude-sonnet-4-20250514",
266
  "score": 19.33,
267
  "details": "20,17,21"
268
  },
269
  {
270
- "model": "gamingagent + deepseek-r1-0528",
271
  "score": 33.67,
272
  "details": "26,34,41"
273
  },
274
  {
275
- "model": "gamingagent + qwen3-235B-A22B-fp8",
276
  "score": 11.67,
277
  "details": "13,14,8"
278
  }
@@ -282,67 +282,67 @@
282
  "runs": 3,
283
  "results": [
284
  {
285
- "model": "gamingagent + claude-3-5-sonnet-20241022",
286
  "score": 106,
287
  "details": "92,165,61"
288
  },
289
  {
290
- "model": "gamingagent + claude-3-7-sonnet-20250219",
291
  "score": 484,
292
  "details": "535,428,489"
293
  },
294
  {
295
- "model": "gamingagent + deepseek-r1-0120",
296
  "score": 447.3,
297
  "details": "409,436,497"
298
  },
299
  {
300
- "model": "gamingagent + gemini-2.5-flash-preview-04-17",
301
  "score": 334.7,
302
  "details": "259,372,373"
303
  },
304
  {
305
- "model": "gamingagent + gemini-2.5-pro-preview-05-06",
306
  "score": 416.3,
307
  "details": "411,414,424"
308
  },
309
  {
310
- "model": "gamingagent + grok-3-mini-beta",
311
  "score": 254,
312
  "details": "299,332,131"
313
  },
314
  {
315
- "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
316
  "score": 128.7,
317
  "details": "67,139,180"
318
  },
319
  {
320
- "model": "gamingagent + gpt-4.1-2025-04-14",
321
  "score": 182,
322
  "details": "163,215,168"
323
  },
324
  {
325
- "model": "gamingagent + gpt-4o-2024-11-20",
326
  "score": 147.3,
327
  "details": "131,104,207"
328
  },
329
  {
330
- "model": "gamingagent + o1-2024-12-17",
331
  "score": 159,
332
  "details": "159"
333
  },
334
  {
335
- "model": "gamingagent + o1-mini-2024-09-12",
336
  "score": 48,
337
  "details": "21,86,37"
338
  },
339
  {
340
- "model": "gamingagent + o3-2025-04-16",
341
  "score": 647,
342
  "details": "647"
343
  },
344
  {
345
- "model": "gamingagent + o4-mini-2025-04-16",
346
  "score": 487.3,
347
  "details": "259,591,612"
348
  },
@@ -352,22 +352,22 @@
352
  "details": ""
353
  },
354
  {
355
- "model": "gamingagent + claude-opus-4-20250514",
356
  "score": 464,
357
  "details": "593,406,393"
358
  },
359
  {
360
- "model": "gamingagent + claude-sonnet-4-20250514",
361
  "score": 478.33,
362
  "details": "545,468,422"
363
  },
364
  {
365
- "model": "gamingagent + deepseek-r1-0528",
366
  "score": 491.67,
367
  "details": "464,463,548"
368
  },
369
  {
370
- "model": "gamingagent + qwen3-235B-A22B-fp8",
371
  "score": 363.33,
372
  "details": "365,372,353"
373
  }
@@ -377,79 +377,79 @@
377
  "runs": 3,
378
  "results": [
379
  {
380
- "model": "gamingagent + claude-3-5-sonnet-20241022",
381
  "score": 0,
382
  "detail_box_on_target": "0,0,0",
383
  "cracked_levels": "0,0,0"
384
  },
385
  {
386
- "model": "gamingagent + claude-3-7-sonnet-20250219",
387
  "score": 2.33,
388
  "detail_box_on_target": "2,4,1",
389
  "cracked_levels": "1,2,0"
390
  },
391
  {
392
- "model": "gamingagent + deepseek-r1-0120",
393
  "score": 1.33,
394
  "detail_box_on_target": "2,0,2",
395
  "cracked_levels": "1,0,1"
396
  },
397
  {
398
- "model": "gamingagent + gemini-2.5-flash-preview-04-17",
399
  "score": 1.67,
400
  "detail_box_on_target": "3,0,2",
401
  "cracked_levels": "2,0,1"
402
  },
403
  {
404
- "model": "gamingagent + gemini-2.5-pro-preview-05-06",
405
  "score": 4.33,
406
  "detail_box_on_target": "4,4,5",
407
  "cracked_levels": "2,2,3"
408
  },
409
  {
410
- "model": "gamingagent + grok-3-mini-beta",
411
  "score": 5.67,
412
  "detail_box_on_target": "5,6,6",
413
  "cracked_levels": "3,3,3"
414
  },
415
  {
416
- "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
417
  "score": 0,
418
  "detail_box_on_target": "0,0,0",
419
  "cracked_levels": "0,0,0"
420
  },
421
  {
422
- "model": "gamingagent + gpt-4.1-2025-04-14",
423
  "score": 0,
424
  "detail_box_on_target": "0,0,0",
425
  "cracked_levels": "0,0,0"
426
  },
427
  {
428
- "model": "gamingagent + gpt-4o-2024-11-20",
429
  "score": 0,
430
  "detail_box_on_target": "0,0,0",
431
  "cracked_levels": "0,0,0"
432
  },
433
  {
434
- "model": "gamingagent + o1-2024-12-17",
435
  "score": 2.33,
436
  "detail_box_on_target": "2,2,3",
437
  "cracked_levels": "1,1,2"
438
  },
439
  {
440
- "model": "gamingagent + o1-mini-2024-09-12",
441
  "score": 1.33,
442
  "detail_box_on_target": "1,2,1",
443
  "cracked_levels": "0,1,0"
444
  },
445
  {
446
- "model": "gamingagent + o3-2025-04-16",
447
  "score": 8,
448
  "detail_box_on_target": "10,6",
449
  "cracked_levels": "5,3"
450
  },
451
  {
452
- "model": "gamingagent + o4-mini-2025-04-16",
453
  "score": 5.33,
454
  "detail_box_on_target": "4,6,6",
455
  "cracked_levels": "2,2,3"
@@ -461,22 +461,22 @@
461
  "cracked_levels": "0,0,0"
462
  },
463
  {
464
- "model": "gamingagent + claude-opus-4-20250514",
465
  "score": 4,
466
  "details": "4,4,4"
467
  },
468
  {
469
- "model": "gamingagent + claude-sonnet-4-20250514",
470
  "score": 3,
471
  "details": "2,2,5"
472
  },
473
  {
474
- "model": "gamingagent + deepseek-r1-0528",
475
  "score": 4.67,
476
  "details": "4,4,6"
477
  },
478
  {
479
- "model": "gamingagent + qwen3-235B-A22B-fp8",
480
  "score": 2.33,
481
  "details": "1,2,4"
482
  }
@@ -486,79 +486,79 @@
486
  "runs": 1,
487
  "results": [
488
  {
489
- "model": "gamingagent + claude-3-5-sonnet-20241022",
490
  "score": 2,
491
  "progress": "1:2/5",
492
  "evaluator result": "1/3"
493
  },
494
  {
495
- "model": "gamingagent + claude-3-7-sonnet-20250219",
496
  "score": 7,
497
  "progress": "2:2/9",
498
  "evaluator result": "5/11"
499
  },
500
  {
501
- "model": "gamingagent + deepseek-r1-0120",
502
  "score": 0,
503
  "progress": "0",
504
  "evaluator result": "1/5"
505
  },
506
  {
507
- "model": "gamingagent + gemini-2.5-flash-preview-04-17",
508
  "score": 4,
509
  "progress": "1:4/5",
510
  "evaluator result": "1/7"
511
  },
512
  {
513
- "model": "gamingagent + gemini-2.5-pro-preview-05-06",
514
  "score": 7,
515
  "progress": "2:2/9",
516
  "evaluator result": "2/3"
517
  },
518
  {
519
- "model": "gamingagent + grok-3-mini-beta",
520
  "score": 0,
521
  "progress": "0",
522
  "evaluator result": "0"
523
  },
524
  {
525
- "model": "gamingagent + llama-4-maverick-17b-128e-instruct-fp8",
526
  "score": 0,
527
  "progress": "0",
528
  "evaluator result": "0"
529
  },
530
  {
531
- "model": "gamingagent + gpt-4.1-2025-04-14",
532
  "score": 2,
533
  "progress": "1:2/5",
534
  "evaluator result": "2/3"
535
  },
536
  {
537
- "model": "gamingagent + gpt-4o-2024-11-20",
538
  "score": 0,
539
  "progress": "0",
540
  "evaluator result": "0"
541
  },
542
  {
543
- "model": "gamingagent + o1-2024-12-17",
544
  "score": 16,
545
  "progress": "3: 2/8",
546
  "evaluator result": "6/11"
547
  },
548
  {
549
- "model": "gamingagent + o1-mini-2024-09-12",
550
  "score": 0,
551
  "progress": "0",
552
  "evaluator result": "1/5"
553
  },
554
  {
555
- "model": "gamingagent + o3-2025-04-16",
556
  "score": 16,
557
  "progress": "3: 2/8",
558
  "evaluator result": "1/2"
559
  },
560
  {
561
- "model": "gamingagent + o4-mini-2025-04-16",
562
  "score": 4,
563
  "progress": "1:4/5",
564
  "evaluator result": "2/5"
@@ -570,17 +570,17 @@
570
  "evaluator result": "0"
571
  },
572
  {
573
- "model": "gamingagent + claude-opus-4-20250514",
574
  "score": 6,
575
  "details": "6"
576
  },
577
  {
578
- "model": "gamingagent + claude-sonnet-4-20250514",
579
  "score": 3.67,
580
  "details": "3,4,4"
581
  },
582
  {
583
- "model": "gamingagent + gemini-2.5-flash-preview-05-20",
584
  "score": 4.33,
585
  "details": "3,4,6"
586
  }
 
3
  "runs": 3,
4
  "results": [
5
  {
6
+ "model": "claude-3-5-sonnet-20241022 (⚔️)",
7
  "score": 1267.7,
8
  "detail_data": "709,1532,1562",
9
  "progress": "1-1"
10
  },
11
  {
12
+ "model": "claude-3-7-sonnet-20250219 (⚔️)",
13
  "score": 1418.7,
14
  "detail_data": "2015,709,1532",
15
  "progress": "1-1"
16
  },
17
  {
18
+ "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
19
  "score": 1385.0,
20
  "detail_data": "1672,1266,1247",
21
  "progress": "1-1"
22
  },
23
  {
24
+ "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
25
  "score": 1498.3,
26
  "detail_data": "1561,1271,1663",
27
  "progress": "1-1"
28
  },
29
  {
30
+ "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
31
  "score": 1468.7,
32
  "detail_data": "898,2008,1500",
33
  "progress": "1-1"
34
  },
35
  {
36
+ "model": "gpt-4.1-2025-04-14 (⚔️)",
37
  "score": 2126.3,
38
  "detail_data": "1531,722,4126",
39
  "progress": "1-1"
40
  },
41
  {
42
+ "model": "gpt-4o-2024-11-20 (⚔️)",
43
  "score": 2047.3,
44
  "detail_data": "2017,2590,1535",
45
  "progress": "1-1"
46
  },
47
  {
48
+ "model": "o1-2024-12-17 (⚔️)",
49
  "score": 855,
50
  "detail_data": "855",
51
  "progress": "1-1"
52
  },
53
  {
54
+ "model": "o3-2025-04-16 (⚔️)",
55
  "score": 3445,
56
  "detail_data": "3445",
57
  "progress": "1-1"
58
  },
59
  {
60
+ "model": "o4-mini-2025-04-16 (⚔️)",
61
  "score": 1448.0,
62
  "detail_data": "1525,1263,1556",
63
  "progress": "1-1"
 
74
  "runs": 3,
75
  "results": [
76
  {
77
+ "model": "claude-3-5-sonnet-20241022 (⚔️)",
78
  "score": 1914.67,
79
  "details": "1352,2860,1532",
80
  "highest_tail": 256
81
  },
82
  {
83
+ "model": "claude-3-7-sonnet-20250219 (⚔️)",
84
  "score": 2624,
85
  "details": "2560,3224,2088",
86
  "highest_tail": 256
87
  },
88
  {
89
+ "model": "deepseek-r1-0120 (⚔️)",
90
  "score": 1873.33,
91
  "details": "700,1240,3680",
92
  "highest_tail": 256
93
  },
94
  {
95
+ "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
96
  "score": 1697.33,
97
  "details": "1304,1316,2472",
98
  "highest_tail": 256
99
  },
100
  {
101
+ "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
102
  "score": 3586.67,
103
  "details": "5300,2400,3060",
104
  "highest_tail": 512
105
  },
106
  {
107
+ "model": "grok-3-mini-beta (⚔️)",
108
  "score": 4036,
109
  "details": "6412,2492,3204",
110
  "highest_tail": 512
111
  },
112
  {
113
+ "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
114
  "score": 1586.67,
115
  "details": "1404,1272,2084",
116
  "highest_tail": 128
117
  },
118
  {
119
+ "model": "gpt-4.1-2025-04-14 (⚔️)",
120
  "score": 1656,
121
  "details": "1156,2664,1148",
122
  "highest_tail": 256
123
  },
124
  {
125
+ "model": "gpt-4o-2024-11-20 (⚔️)",
126
  "score": 1656,
127
  "details": "1604,1284,2080",
128
  "highest_tail": 256
129
  },
130
  {
131
+ "model": "o1-2024-12-17 (⚔️)",
132
  "score": 7580,
133
  "details": "7580",
134
  "highest_tail": 512
135
  },
136
  {
137
+ "model": "o1-mini-2024-09-12 (⚔️)",
138
  "score": 2757.33,
139
  "details": "3132,2004,3136",
140
  "highest_tail": 256
141
  },
142
  {
143
+ "model": "o3-2025-04-16 (⚔️)",
144
  "score": 7120,
145
  "details": "7120",
146
  "highest_tail": 512
147
  },
148
  {
149
+ "model": "o4-mini-2025-04-16 (⚔️)",
150
  "score": 4432.0,
151
  "details": "4928,5456,2912",
152
  "highest_tail": 512
 
158
  "highest_tail": 128
159
  },
160
  {
161
+ "model": "claude-opus-4-20250514 (⚔️)",
162
  "score": 3036.0,
163
  "details": "3036.0",
164
  "highest_tail": 256
165
  },
166
  {
167
+ "model": "claude-sonnet-4-20250514 (⚔️)",
168
  "score": 3136,
169
  "details": "2148,2360,4900",
170
  "highest_tail": 256
171
  },
172
  {
173
+ "model": "deepseek-r1-0528 (⚔️)",
174
  "score": 3330.0,
175
  "details": "3260,3400",
176
  "highest_tail": 256
177
  },
178
  {
179
+ "model": "qwen3-235B-A22B-fp8 (⚔️)",
180
  "score": 2144.0,
181
  "details": "1436,2556,2440",
182
  "highest_tail": 256
 
187
  "runs": 3,
188
  "results": [
189
  {
190
+ "model": "claude-3-5-sonnet-20241022 (⚔️)",
191
  "score": 14.7,
192
  "details": "16,14,14"
193
  },
194
  {
195
+ "model": "claude-3-7-sonnet-20250219 (⚔️)",
196
  "score": 16.3,
197
  "details": "19,15,15"
198
  },
199
  {
200
+ "model": "deepseek-r1-0120 (⚔️)",
201
  "score": 14.3,
202
  "details": "15,14,14"
203
  },
204
  {
205
+ "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
206
  "score": 16.3,
207
  "details": "20,14,15"
208
  },
209
  {
210
+ "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
211
  "score": 23.3,
212
  "details": "23,23,24"
213
  },
214
  {
215
+ "model": "grok-3-mini-beta (⚔️)",
216
  "score": 21.3,
217
  "details": "20,15,29"
218
  },
219
  {
220
+ "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
221
  "score": 10.3,
222
  "details": "9,10,12"
223
  },
224
  {
225
+ "model": "gpt-4.1-2025-04-14 (⚔️)",
226
  "score": 13.7,
227
  "details": "13,14,14"
228
  },
229
  {
230
+ "model": "gpt-4o-2024-11-20 (⚔️)",
231
  "score": 14,
232
  "details": "18,11,13"
233
  },
234
  {
235
+ "model": "o1-2024-12-17 (⚔️)",
236
  "score": 35,
237
  "details": "35"
238
  },
239
  {
240
+ "model": "o1-mini-2024-09-12 (⚔️)",
241
  "score": 11.7,
242
  "details": "11,11,13"
243
  },
244
  {
245
+ "model": "o3-2025-04-16 (⚔️)",
246
  "score": 42,
247
  "details": "42"
248
  },
249
  {
250
+ "model": "o4-mini-2025-04-16 (⚔️)",
251
  "score": 25.3,
252
  "details": "22,35,19"
253
  },
 
257
  "details": ""
258
  },
259
  {
260
+ "model": "claude-opus-4-20250514 (⚔️)",
261
  "score": 20,
262
  "details": "17,18,25"
263
  },
264
  {
265
+ "model": "claude-sonnet-4-20250514 (⚔️)",
266
  "score": 19.33,
267
  "details": "20,17,21"
268
  },
269
  {
270
+ "model": "deepseek-r1-0528 (⚔️)",
271
  "score": 33.67,
272
  "details": "26,34,41"
273
  },
274
  {
275
+ "model": "qwen3-235B-A22B-fp8 (⚔️)",
276
  "score": 11.67,
277
  "details": "13,14,8"
278
  }
 
282
  "runs": 3,
283
  "results": [
284
  {
285
+ "model": "claude-3-5-sonnet-20241022 (⚔️)",
286
  "score": 106,
287
  "details": "92,165,61"
288
  },
289
  {
290
+ "model": "claude-3-7-sonnet-20250219 (⚔️)",
291
  "score": 484,
292
  "details": "535,428,489"
293
  },
294
  {
295
+ "model": "deepseek-r1-0120 (⚔️)",
296
  "score": 447.3,
297
  "details": "409,436,497"
298
  },
299
  {
300
+ "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
301
  "score": 334.7,
302
  "details": "259,372,373"
303
  },
304
  {
305
+ "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
306
  "score": 416.3,
307
  "details": "411,414,424"
308
  },
309
  {
310
+ "model": "grok-3-mini-beta (⚔️)",
311
  "score": 254,
312
  "details": "299,332,131"
313
  },
314
  {
315
+ "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
316
  "score": 128.7,
317
  "details": "67,139,180"
318
  },
319
  {
320
+ "model": "gpt-4.1-2025-04-14 (⚔️)",
321
  "score": 182,
322
  "details": "163,215,168"
323
  },
324
  {
325
+ "model": "gpt-4o-2024-11-20 (⚔️)",
326
  "score": 147.3,
327
  "details": "131,104,207"
328
  },
329
  {
330
+ "model": "o1-2024-12-17 (⚔️)",
331
  "score": 159,
332
  "details": "159"
333
  },
334
  {
335
+ "model": "o1-mini-2024-09-12 (⚔️)",
336
  "score": 48,
337
  "details": "21,86,37"
338
  },
339
  {
340
+ "model": "o3-2025-04-16 (⚔️)",
341
  "score": 647,
342
  "details": "647"
343
  },
344
  {
345
+ "model": "o4-mini-2025-04-16 (⚔️)",
346
  "score": 487.3,
347
  "details": "259,591,612"
348
  },
 
352
  "details": ""
353
  },
354
  {
355
+ "model": "claude-opus-4-20250514 (⚔️)",
356
  "score": 464,
357
  "details": "593,406,393"
358
  },
359
  {
360
+ "model": "claude-sonnet-4-20250514 (⚔️)",
361
  "score": 478.33,
362
  "details": "545,468,422"
363
  },
364
  {
365
+ "model": "deepseek-r1-0528 (⚔️)",
366
  "score": 491.67,
367
  "details": "464,463,548"
368
  },
369
  {
370
+ "model": "qwen3-235B-A22B-fp8 (⚔️)",
371
  "score": 363.33,
372
  "details": "365,372,353"
373
  }
 
377
  "runs": 3,
378
  "results": [
379
  {
380
+ "model": "claude-3-5-sonnet-20241022 (⚔️)",
381
  "score": 0,
382
  "detail_box_on_target": "0,0,0",
383
  "cracked_levels": "0,0,0"
384
  },
385
  {
386
+ "model": "claude-3-7-sonnet-20250219 (⚔️)",
387
  "score": 2.33,
388
  "detail_box_on_target": "2,4,1",
389
  "cracked_levels": "1,2,0"
390
  },
391
  {
392
+ "model": "deepseek-r1-0120 (⚔️)",
393
  "score": 1.33,
394
  "detail_box_on_target": "2,0,2",
395
  "cracked_levels": "1,0,1"
396
  },
397
  {
398
+ "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
399
  "score": 1.67,
400
  "detail_box_on_target": "3,0,2",
401
  "cracked_levels": "2,0,1"
402
  },
403
  {
404
+ "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
405
  "score": 4.33,
406
  "detail_box_on_target": "4,4,5",
407
  "cracked_levels": "2,2,3"
408
  },
409
  {
410
+ "model": "grok-3-mini-beta (⚔️)",
411
  "score": 5.67,
412
  "detail_box_on_target": "5,6,6",
413
  "cracked_levels": "3,3,3"
414
  },
415
  {
416
+ "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
417
  "score": 0,
418
  "detail_box_on_target": "0,0,0",
419
  "cracked_levels": "0,0,0"
420
  },
421
  {
422
+ "model": "gpt-4.1-2025-04-14 (⚔️)",
423
  "score": 0,
424
  "detail_box_on_target": "0,0,0",
425
  "cracked_levels": "0,0,0"
426
  },
427
  {
428
+ "model": "gpt-4o-2024-11-20 (⚔️)",
429
  "score": 0,
430
  "detail_box_on_target": "0,0,0",
431
  "cracked_levels": "0,0,0"
432
  },
433
  {
434
+ "model": "o1-2024-12-17 (⚔️)",
435
  "score": 2.33,
436
  "detail_box_on_target": "2,2,3",
437
  "cracked_levels": "1,1,2"
438
  },
439
  {
440
+ "model": "o1-mini-2024-09-12 (⚔️)",
441
  "score": 1.33,
442
  "detail_box_on_target": "1,2,1",
443
  "cracked_levels": "0,1,0"
444
  },
445
  {
446
+ "model": "o3-2025-04-16 (⚔️)",
447
  "score": 8,
448
  "detail_box_on_target": "10,6",
449
  "cracked_levels": "5,3"
450
  },
451
  {
452
+ "model": "o4-mini-2025-04-16 (⚔️)",
453
  "score": 5.33,
454
  "detail_box_on_target": "4,6,6",
455
  "cracked_levels": "2,2,3"
 
461
  "cracked_levels": "0,0,0"
462
  },
463
  {
464
+ "model": "claude-opus-4-20250514 (⚔️)",
465
  "score": 4,
466
  "details": "4,4,4"
467
  },
468
  {
469
+ "model": "claude-sonnet-4-20250514 (⚔️)",
470
  "score": 3,
471
  "details": "2,2,5"
472
  },
473
  {
474
+ "model": "deepseek-r1-0528 (⚔️)",
475
  "score": 4.67,
476
  "details": "4,4,6"
477
  },
478
  {
479
+ "model": "qwen3-235B-A22B-fp8 (⚔️)",
480
  "score": 2.33,
481
  "details": "1,2,4"
482
  }
 
486
  "runs": 1,
487
  "results": [
488
  {
489
+ "model": "claude-3-5-sonnet-20241022 (⚔️)",
490
  "score": 2,
491
  "progress": "1:2/5",
492
  "evaluator result": "1/3"
493
  },
494
  {
495
+ "model": "claude-3-7-sonnet-20250219 (⚔️)",
496
  "score": 7,
497
  "progress": "2:2/9",
498
  "evaluator result": "5/11"
499
  },
500
  {
501
+ "model": "deepseek-r1-0120 (⚔️)",
502
  "score": 0,
503
  "progress": "0",
504
  "evaluator result": "1/5"
505
  },
506
  {
507
+ "model": "gemini-2.5-flash-preview-04-17 (⚔️)",
508
  "score": 4,
509
  "progress": "1:4/5",
510
  "evaluator result": "1/7"
511
  },
512
  {
513
+ "model": "gemini-2.5-pro-preview-05-06 (⚔️)",
514
  "score": 7,
515
  "progress": "2:2/9",
516
  "evaluator result": "2/3"
517
  },
518
  {
519
+ "model": "grok-3-mini-beta (⚔️)",
520
  "score": 0,
521
  "progress": "0",
522
  "evaluator result": "0"
523
  },
524
  {
525
+ "model": "llama-4-maverick-17b-128e-instruct-fp8 (⚔️)",
526
  "score": 0,
527
  "progress": "0",
528
  "evaluator result": "0"
529
  },
530
  {
531
+ "model": "gpt-4.1-2025-04-14 (⚔️)",
532
  "score": 2,
533
  "progress": "1:2/5",
534
  "evaluator result": "2/3"
535
  },
536
  {
537
+ "model": "gpt-4o-2024-11-20 (⚔️)",
538
  "score": 0,
539
  "progress": "0",
540
  "evaluator result": "0"
541
  },
542
  {
543
+ "model": "o1-2024-12-17 (⚔️)",
544
  "score": 16,
545
  "progress": "3: 2/8",
546
  "evaluator result": "6/11"
547
  },
548
  {
549
+ "model": "o1-mini-2024-09-12 (⚔️)",
550
  "score": 0,
551
  "progress": "0",
552
  "evaluator result": "1/5"
553
  },
554
  {
555
+ "model": "o3-2025-04-16 (⚔️)",
556
  "score": 16,
557
  "progress": "3: 2/8",
558
  "evaluator result": "1/2"
559
  },
560
  {
561
+ "model": "o4-mini-2025-04-16 (⚔️)",
562
  "score": 4,
563
  "progress": "1:4/5",
564
  "evaluator result": "2/5"
 
570
  "evaluator result": "0"
571
  },
572
  {
573
+ "model": "claude-opus-4-20250514 (⚔️)",
574
  "score": 6,
575
  "details": "6"
576
  },
577
  {
578
+ "model": "claude-sonnet-4-20250514 (⚔️)",
579
  "score": 3.67,
580
  "details": "3,4,4"
581
  },
582
  {
583
+ "model": "gemini-2.5-flash-preview-05-20 (⚔️)",
584
  "score": 4.33,
585
  "details": "3,4,6"
586
  }