jmercat committed
Commit a4277ef · 1 parent: f7fb142

handle slow charts
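The speedups in this commit come mostly from memoizing the expensive helpers with Streamlit's `st.cache_data` and showing a spinner while they run. A minimal, self-contained sketch of that pattern (the function `slow_consensus_ranks` and the random data below are illustrative stand-ins, not code from app.py; assumes Streamlit ≥ 1.18, where `st.cache_data` is available):

```python
import time

import numpy as np
import pandas as pd
import streamlit as st


@st.cache_data  # memoize on the hashed input DataFrame
def slow_consensus_ranks(df: pd.DataFrame) -> pd.Series:
    """Toy stand-in for an expensive ranking computation."""
    time.sleep(2)  # simulate heavy work
    return df.rank(ascending=False).median(axis=1).sort_values()


df = pd.DataFrame(np.random.rand(50, 5),
                  columns=[f"bench_{i}" for i in range(5)])

with st.spinner("Computing consensus rankings..."):
    ranks = slow_consensus_ranks(df)  # slow on first run, instant on reruns

st.dataframe(ranks)
```

On a rerun with an identical DataFrame, Streamlit returns the cached result instead of re-executing the body, which is the effect this commit aims for by decorating `estimate_missing_ranks`, `create_consensus_ranking`, and `create_optimized_radar_chart`.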

Files changed (1): app.py (+174 -92)
app.py CHANGED
@@ -19,6 +19,7 @@ import io
 import base64
 from itertools import combinations
 import warnings
+import time
 warnings.filterwarnings('ignore')

 # Configure page
@@ -64,7 +65,19 @@ def load_comprehensive_data():
     df = pd.read_csv("comprehensive_benchmark_scores.csv", index_col=0, encoding='utf-8')

     # Clean the data - handle list-like values stored as strings
-    for col in df.columns:
+    # Process in batches for better performance with large datasets
+    total_cols = len(df.columns)
+
+    if total_cols > 20:
+        # Show progress for large datasets
+        progress_text = st.empty()
+        progress_bar = st.progress(0)
+
+    for i, col in enumerate(df.columns):
+        if total_cols > 20:
+            progress_text.text(f"Processing column {i+1}/{total_cols}: {col}")
+            progress_bar.progress((i+1) / total_cols)
+
         def extract_value(x):
             if pd.isna(x):
                 return np.nan
@@ -85,6 +98,10 @@ def load_comprehensive_data():
         df[col] = df[col].apply(extract_value)
         df[col] = pd.to_numeric(df[col], errors='coerce')

+    if total_cols > 20:
+        progress_text.empty()
+        progress_bar.empty()
+
     # Filter to only models that have data for at least a few benchmarks
     min_benchmarks = 3
     df_filtered = df.dropna(thresh=min_benchmarks, axis=0)
@@ -334,6 +351,7 @@ def filter_target_benchmarks(df):

     return df[available_benchmarks].copy()

+@st.cache_data
 def estimate_missing_ranks(df, method='spearman', min_corr=0.3, min_benchmarks=3):
     """
     Estimate missing benchmark ranks using rank correlation-based imputation.
@@ -351,7 +369,7 @@ def estimate_missing_ranks(df, method='spearman', min_corr=0.3, min_benchmarks=3):
     df_ranks = df.rank(method='min', ascending=False, na_option='keep')
     df_ranks_imputed = df_ranks.copy()

-    # Compute rank correlation matrix
+    # Compute rank correlation matrix once
     if method == 'spearman':
         rank_corr_matrix = df_ranks.corr(method='spearman')
     elif method == 'kendall':
@@ -359,23 +377,42 @@ def estimate_missing_ranks(df, method='spearman', min_corr=0.3, min_benchmarks=3):
     else:
         rank_corr_matrix = df_ranks.corr(method='pearson')  # fallback

+    # Pre-compute correlation thresholds to avoid repeated calculations
+    valid_correlations = {}
+    for benchmark in df.columns:
+        valid_correlations[benchmark] = []
+        for other_bench in df.columns:
+            if benchmark != other_bench:
+                corr_val = rank_corr_matrix.loc[benchmark, other_bench]
+                if not pd.isna(corr_val) and abs(corr_val) >= min_corr:
+                    valid_correlations[benchmark].append((other_bench, abs(corr_val)))
+        # Sort by correlation strength for better prediction
+        valid_correlations[benchmark].sort(key=lambda x: x[1], reverse=True)
+
     # For each model and benchmark combination with missing data
+    missing_count = 0
+    total_missing = df_ranks.isna().sum().sum()
+
     for model_idx in df.index:
-        for benchmark in df.columns:
-            if pd.isna(df_ranks.loc[model_idx, benchmark]):
-                # Find benchmarks this model has ranks for
-                available_benchmarks = df_ranks.columns[df_ranks.loc[model_idx].notna()].tolist()
-
-                if len(available_benchmarks) >= min_benchmarks:
-                    # Get rank correlations between target benchmark and available benchmarks
+        available_benchmarks = df_ranks.columns[df_ranks.loc[model_idx].notna()].tolist()
+
+        if len(available_benchmarks) >= min_benchmarks:
+            for benchmark in df.columns:
+                if pd.isna(df_ranks.loc[model_idx, benchmark]):
+                    # Get pre-computed valid correlations for this benchmark
+                    valid_pairs = valid_correlations[benchmark]
+
                     correlations = []
                     ranks = []

-                    for avail_bench in available_benchmarks:
-                        corr_val = rank_corr_matrix.loc[benchmark, avail_bench]
-                        if not pd.isna(corr_val) and abs(corr_val) >= min_corr:
-                            correlations.append(abs(corr_val))  # Use absolute correlation as weight
-                            ranks.append(df_ranks.loc[model_idx, avail_bench])
+                    for other_bench, corr_strength in valid_pairs:
+                        if other_bench in available_benchmarks:
+                            correlations.append(corr_strength)
+                            ranks.append(df_ranks.loc[model_idx, other_bench])
+
+                        # Limit to top 5 most correlated benchmarks for efficiency
+                        if len(correlations) >= 5:
+                            break

                     if len(correlations) > 0:
                         # Weighted average of ranks using correlations as weights
@@ -387,9 +424,11 @@ def estimate_missing_ranks(df, method='spearman', min_corr=0.3, min_benchmarks=3):
                         estimated_rank = np.average(ranks, weights=weights)

                         df_ranks_imputed.loc[model_idx, benchmark] = estimated_rank
+                        missing_count += 1

     return df_ranks_imputed

+@st.cache_data
 def create_consensus_ranking(df, method='spearman', use_rank_imputation=True):
     """
     Create a consensus ranking using rank correlation-based estimation.
@@ -461,6 +500,81 @@ def create_consensus_ranking(df, method='spearman', use_rank_imputation=True):

     return ranking_df, df_ranks, metadata

+@st.cache_data
+def create_optimized_radar_chart(df_display, selected_models, selected_benchmarks_for_radar):
+    """Create an optimized radar chart for the selected models and benchmarks."""
+    if not selected_benchmarks_for_radar or not selected_models:
+        return None
+
+    # Pre-filter data to only what we need
+    filtered_data = df_display.loc[selected_models, selected_benchmarks_for_radar]
+    clean_benchmark_names = [clean_benchmark_name(b) for b in selected_benchmarks_for_radar]
+
+    # Define colors for different models
+    colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
+                   '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
+
+    fig = go.Figure()
+
+    # Calculate dynamic range for better visualization
+    all_values = filtered_data.values.flatten()
+    all_values = all_values[~pd.isna(all_values)]
+
+    if len(all_values) > 0:
+        min_val = float(np.min(all_values))
+        max_val = float(np.max(all_values))
+        # Add some padding
+        range_padding = (max_val - min_val) * 0.1
+        radar_min = max(0, min_val - range_padding)
+        radar_max = min(1, max_val + range_padding)
+    else:
+        radar_min, radar_max = 0, 1
+
+    for i, model in enumerate(selected_models):
+        # Get model data for selected benchmarks only
+        model_scores = []
+        for benchmark in selected_benchmarks_for_radar:
+            score = filtered_data.loc[model, benchmark]
+            # Convert to float, use 0.0 for any remaining NaN values
+            model_scores.append(0.0 if pd.isna(score) else float(score))
+
+        # Close the radar chart by adding the first value at the end
+        radar_values = model_scores + [model_scores[0]]
+        radar_benchmarks = clean_benchmark_names + [clean_benchmark_names[0]]
+
+        # Create model name for legend (remove path prefix if present)
+        model_display_name = model.split('/')[-1] if '/' in model else model
+
+        # Use color from list, cycling if needed
+        model_color = colors_list[i % len(colors_list)]
+
+        fig.add_trace(go.Scatterpolar(
+            r=radar_values,
+            theta=radar_benchmarks,
+            fill='toself',
+            name=model_display_name,
+            line_color=model_color,
+            hovertemplate='<b>%{theta}</b><br>Score: %{r:.3f}<extra></extra>'
+        ))
+
+    # Adjust chart size based on number of models
+    chart_height = 600 if len(selected_models) <= 3 else 700
+
+    fig.update_layout(
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                range=[radar_min, radar_max],
+                tickformat='.2f'
+            )),
+        showlegend=True,
+        title=f"Model Performance Radar Chart ({len(selected_benchmarks_for_radar)} benchmarks, {len(selected_models)} models)",
+        width=700,
+        height=chart_height
+    )
+
+    return fig
+
 def weighted_correlation(x, y, weights):
     """Compute weighted Pearson correlation coefficient."""
     # Remove NaN values
@@ -821,9 +935,11 @@ def main():
     st.markdown('<h1 class="main-header">OpenThoughts Evalchemy Benchmark Explorer</h1>',
                 unsafe_allow_html=True)

-    # Load data
+    # Load data with timing
+    start_time = time.time()
     df = load_comprehensive_data()
     stderr_df = load_stderr_data()
+    load_time = time.time() - start_time

     # Debug information (hidden in an expander)
     # with st.expander("🔧 Debug Information", expanded=False):
@@ -907,6 +1023,16 @@ def main():
             valid_benchmarks.append(col)
     df_display = df_display[valid_benchmarks]

+    # Performance info
+    st.sidebar.markdown("---")
+    st.sidebar.subheader("⚡ Performance")
+    if load_time > 0:
+        st.sidebar.metric("Data Load Time", f"{load_time:.2f}s")
+    st.sidebar.metric("Dataset Size", f"{len(df_display)} × {len(df_display.columns)}")
+    if not df_display.empty:
+        data_coverage = (df_display.notna().sum().sum() / (len(df_display) * len(df_display.columns))) * 100
+        st.sidebar.metric("Data Coverage", f"{data_coverage:.1f}%")
+
     # Main content based on analysis mode
     if analysis_mode == "📊 Overview Dashboard":
         show_overview_dashboard(df_display, stderr_df)
@@ -1347,7 +1473,13 @@ def show_model_performance(df):
     # Model search
     search_term = st.text_input("🔍 Search for models", placeholder="Enter model name or part of name")

-    if search_term:
+    # Performance optimization: limit initial display for very large datasets
+    if not search_term and len(df) > 100:
+        st.info(f"📊 **Large dataset detected** ({len(df)} models). Showing top 100 models by average performance. Use search to find specific models.")
+        # Get top 100 models by average score across benchmarks
+        avg_scores = df.mean(axis=1, skipna=True).sort_values(ascending=False)
+        df_display = df.loc[avg_scores.head(100).index]
+    elif search_term:
         matching_models = df.index[df.index.str.contains(search_term, case=False, na=False)]
         if len(matching_models) > 0:
             df_display = df.loc[matching_models]
@@ -1393,12 +1525,20 @@ def show_model_performance(df):
     else:
         min_corr = 0.3

-    # Generate rankings
-    ranking_df, rank_matrix, metadata = create_consensus_ranking(
-        df_display,
-        method=rank_method,
-        use_rank_imputation=use_rank_imputation
-    )
+    # Generate rankings with progress indicator
+    if use_rank_imputation and len(df_display) > 50:
+        with st.spinner(f"Computing consensus rankings for {len(df_display)} models..."):
+            ranking_df, rank_matrix, metadata = create_consensus_ranking(
+                df_display,
+                method=rank_method,
+                use_rank_imputation=use_rank_imputation
+            )
+    else:
+        ranking_df, rank_matrix, metadata = create_consensus_ranking(
+            df_display,
+            method=rank_method,
+            use_rank_imputation=use_rank_imputation
+        )

     # Display ranking information
     col1, col2 = st.columns(2)
@@ -1451,6 +1591,11 @@ def show_model_performance(df):
     4. Weights based on rank correlation strength (min threshold: {min_corr})
     5. Final consensus rank = median rank across all benchmarks

+    **Optimizations**:
+    - Pre-compute correlation matrices for efficiency
+    - Limit to top 5 most correlated benchmarks per prediction
+    - Cache results to avoid recomputation
+
     **Upsides**:
     - Eliminates bias from models tested only on easier/harder benchmarks
     - Uses the correlation structure to make informed predictions
@@ -1548,78 +1693,15 @@ def show_model_performance(df):
             st.warning(f"Too many models selected ({len(selected_models)}). Please select 10 or fewer models for the radar chart.")
             st.info("💡 **Tip**: Use the search box above to filter models, then select a smaller subset for comparison.")
         else:
-            # Show radar chart for 1-10 models
-            fig = go.Figure()
-
-            # Use only selected benchmarks
-            clean_benchmark_names = [clean_benchmark_name(b) for b in selected_benchmarks_for_radar]
-
-            # Define colors for different models
-            colors_list = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
-                           '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
-
-            for i, model in enumerate(selected_models):
-                # Get model data for selected benchmarks only
-                model_scores = []
-                for benchmark in selected_benchmarks_for_radar:
-                    score = df_display.loc[model, benchmark]
-                    # Convert to float, use 0.0 for any remaining NaN values
-                    model_scores.append(0.0 if pd.isna(score) else float(score))
-
-                # Close the radar chart by adding the first value at the end
-                radar_values = model_scores + [model_scores[0]]
-                radar_benchmarks = clean_benchmark_names + [clean_benchmark_names[0]]
-
-                # Create model name for legend (remove path prefix if present)
-                model_display_name = model.split('/')[-1] if '/' in model else model
-
-                # Use color from list, cycling if needed
-                model_color = colors_list[i % len(colors_list)]
-
-                fig.add_trace(go.Scatterpolar(
-                    r=radar_values,
-                    theta=radar_benchmarks,
-                    fill='toself',
-                    name=model_display_name,
-                    line_color=model_color,
-                    hovertemplate='<b>%{theta}</b><br>Score: %{r:.3f}<extra></extra>'
-                ))
-
-            # Calculate dynamic range for better visualization
-            all_values = []
-            for model in selected_models:
-                for benchmark in selected_benchmarks_for_radar:
-                    score = df_display.loc[model, benchmark]
-                    if not pd.isna(score):
-                        all_values.append(score)
-
-            if all_values:
-                min_val = min(all_values)
-                max_val = max(all_values)
-                # Add some padding
-                range_padding = (max_val - min_val) * 0.1
-                radar_min = max(0, min_val - range_padding)
-                radar_max = min(1, max_val + range_padding)
+            # Show radar chart for 1-10 models with optimization
+            if len(selected_models) > 3 or len(selected_benchmarks_for_radar) > 8:
+                with st.spinner("Generating radar chart..."):
+                    fig = create_optimized_radar_chart(df_display, selected_models, selected_benchmarks_for_radar)
             else:
-                radar_min, radar_max = 0, 1
-
-            # Adjust chart size based on number of models
-            chart_height = 600 if len(selected_models) <= 3 else 700
-
-            fig.update_layout(
-                polar=dict(
-                    radialaxis=dict(
-                        visible=True,
-                        range=[radar_min, radar_max],
-                        tickformat='.2f'
-                    )),
-                showlegend=True,
-                title=f"Model Performance Radar Chart ({len(selected_benchmarks_for_radar)} benchmarks, {len(selected_models)} models)",
-                width=700,
-                height=chart_height
-            )
+                fig = create_optimized_radar_chart(df_display, selected_models, selected_benchmarks_for_radar)

-            st.plotly_chart(fig, use_container_width=True)
+            if fig:
+                st.plotly_chart(fig, use_container_width=True)

     # Add explanation about missing values (only if not using complete data only)
     if not complete_data_only:
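For readers tracing the imputation logic, the correlation-weighted estimate that `estimate_missing_ranks` produces (steps 4–5 of the methodology text in the diff) reduces to a weighted average of a model's known ranks, with absolute rank correlations as weights. A standalone sketch on made-up data (the helper `impute_rank` and the toy benchmark names are illustrative, not part of app.py):

```python
import numpy as np
import pandas as pd

# Toy scores: rows are models, columns are benchmarks (NaN = not evaluated).
scores = pd.DataFrame(
    {"bench_a": [0.9, 0.7, 0.5, 0.3],
     "bench_b": [0.8, 0.6, 0.4, np.nan],
     "bench_c": [0.85, 0.65, np.nan, 0.25]},
    index=["m1", "m2", "m3", "m4"],
)

# Rank within each benchmark (1 = best), keeping NaNs where data is missing.
ranks = scores.rank(method="min", ascending=False, na_option="keep")
corr = ranks.corr(method="spearman")  # rank correlation between benchmarks


def impute_rank(model: str, target: str, min_corr: float = 0.3) -> float:
    """Estimate a missing rank as a correlation-weighted average of the
    model's known ranks on other benchmarks."""
    weights, known = [], []
    for other in ranks.columns:
        if other == target or pd.isna(ranks.loc[model, other]):
            continue
        c = corr.loc[target, other]
        if pd.notna(c) and abs(c) >= min_corr:
            weights.append(abs(c))
            known.append(ranks.loc[model, other])
    return np.average(known, weights=weights) if known else np.nan


print(impute_rank("m4", "bench_b"))  # estimated rank of m4 on bench_b
```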
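The progress readout added to `load_comprehensive_data` follows the usual Streamlit placeholder pattern: create `st.empty()` and `st.progress()` elements, update them inside the loop, and clear them when done. A reduced sketch with dummy per-column work (none of the names below are taken from app.py):

```python
import time

import pandas as pd
import streamlit as st

# Dummy wide DataFrame standing in for the benchmark CSV.
df = pd.DataFrame({f"col_{i}": range(100) for i in range(30)})

progress_text = st.empty()   # placeholder that can be updated or cleared
progress_bar = st.progress(0)

for i, col in enumerate(df.columns):
    progress_text.text(f"Processing column {i + 1}/{len(df.columns)}: {col}")
    progress_bar.progress((i + 1) / len(df.columns))
    time.sleep(0.05)         # stand-in for per-column cleaning work

progress_text.empty()        # clear both widgets once the loop finishes
progress_bar.empty()
```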