Update app.py
app.py
CHANGED
@@ -8,13 +8,9 @@ import math
 import ast
 import logging
 import numpy as np
-
-from …
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 from scipy import stats
-from scipy.stats import entropy
-from scipy.signal import correlate
-import networkx as nx
-from matplotlib.widgets import Cursor
 
 # Set up logging
 logging.basicConfig(level=logging.DEBUG)
@@ -63,8 +59,8 @@ def ensure_float(value):
         return float(value)
     return None
 
-# Function to process and visualize log probs with …
-def visualize_logprobs(json_input, prob_filter=-1e9):
+# Function to process and visualize log probs with interactive Plotly plots
+def visualize_logprobs(json_input, prob_filter=-1e9, page_size=50, page=0):
     try:
         # Parse the input (handles both JSON and Python dictionaries)
         data = parse_input(json_input)
@@ -81,18 +77,11 @@ def visualize_logprobs(json_input, prob_filter=-1e9):
         tokens = []
         logprobs = []
         top_alternatives = []  # List to store top 3 log probs (selected token + 2 alternatives)
-        token_types = []  # Simplified token type categorization
         for entry in content:
             logprob = ensure_float(entry.get("logprob", None))
             if logprob is not None and math.isfinite(logprob) and logprob >= prob_filter:
                 tokens.append(entry["token"])
                 logprobs.append(logprob)
-                # Categorize token type (simple heuristic)
-                token = entry["token"].lower().strip()
-                if token in ["the", "a", "an"]: token_types.append("article")
-                elif token in ["is", "are", "was", "were"]: token_types.append("verb")
-                elif token in ["top", "so", "need", "figure"]: token_types.append("noun")
-                else: token_types.append("other")
                 # Get top_logprobs, default to empty dict if None
                 top_probs = entry.get("top_logprobs", {})
                 # Ensure all values in top_logprobs are floats
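
The filtering loop above drops any entry whose logprob is missing, non-numeric, non-finite, or below the slider threshold. A minimal sketch of the same rule in isolation; the keep helper and the sample entries are illustrative, not part of the commit:

    import math

    def keep(entry, prob_filter=-1e9):
        # Mirrors ensure_float + math.isfinite + the threshold test above
        lp = entry.get("logprob")
        try:
            lp = float(lp)
        except (TypeError, ValueError):
            return False
        return math.isfinite(lp) and lp >= prob_filter

    entries = [
        {"token": "Hello", "logprob": -0.01},
        {"token": "mystery", "logprob": None},         # dropped: not a number
        {"token": "rare", "logprob": float("-inf")},   # dropped: not finite
    ]
    print([e["token"] for e in entries if keep(e)])    # -> ['Hello']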
@@ -112,505 +101,76 @@ def visualize_logprobs(json_input, prob_filter=-1e9):
 
         # Check if there's valid data after filtering
         if not logprobs or not tokens:
-            return ("No finite log probabilities or tokens to visualize after filtering …
-
-        # …
-        …
-                if contains and abs(event.xdata - x) < 0.5 and abs(event.ydata - y) < 0.5:
-                    token_annotations[i].set_text(tokens[i])
-                    token_annotations[i].set_visible(True)
-                    fig_main.canvas.draw_idle()
-                else:
-                    token_annotations[i].set_visible(False)
-                    fig_main.canvas.draw_idle()
-
-            fig_main.canvas.mpl_connect('button_press_event', on_click)
-
-            buf_main = io.BytesIO()
-            plt.savefig(buf_main, format="png", bbox_inches="tight", dpi=100)
-            buf_main.seek(0)
-            plt.close(fig_main)
-            return buf_main
-
-        # 2. K-Means Clustering of Log Probabilities
-        def create_cluster_plot():
-            if not logprobs:
-                raise ValueError("No data for clustering plot")
-            kmeans = KMeans(n_clusters=3, random_state=42)
-            cluster_labels = kmeans.fit_predict(np.array(logprobs).reshape(-1, 1))
-            fig_cluster, ax_cluster = plt.subplots(figsize=(10, 5))
-            scatter = ax_cluster.scatter(range(len(logprobs)), logprobs, c=cluster_labels, cmap='viridis')
-            ax_cluster.set_title("K-Means Clustering of Log Probabilities")
-            ax_cluster.set_xlabel("Token Position")
-            ax_cluster.set_ylabel("Log Probability")
-            ax_cluster.grid(True)
-            plt.colorbar(scatter, ax=ax_cluster, label="Cluster")
-            buf_cluster = io.BytesIO()
-            plt.savefig(buf_cluster, format="png", bbox_inches="tight", dpi=100)
-            buf_cluster.seek(0)
-            plt.close(fig_cluster)
-            return buf_cluster
-
-        # 3. Probability Drop Analysis
-        def create_drops_plot():
-            if not logprobs or len(logprobs) < 2:
-                raise ValueError("Insufficient data for probability drops")
-            drops = [logprobs[i+1] - logprobs[i] if i < len(logprobs)-1 else 0 for i in range(len(logprobs))]
-            fig_drops, ax_drops = plt.subplots(figsize=(10, 5))
-            ax_drops.bar(range(len(drops)), drops, color='red', alpha=0.5)
-            ax_drops.set_title("Significant Probability Drops")
-            ax_drops.set_xlabel("Token Position")
-            ax_drops.set_ylabel("Log Probability Drop")
-            ax_drops.grid(True)
-            buf_drops = io.BytesIO()
-            plt.savefig(buf_drops, format="png", bbox_inches="tight", dpi=100)
-            buf_drops.seek(0)
-            plt.close(fig_drops)
-            return buf_drops
-
-        # 4. N-Gram Analysis (Bigrams for simplicity)
-        def create_ngram_plot():
-            if not logprobs or len(logprobs) < 2:
-                raise ValueError("Insufficient data for N-gram analysis")
-            bigrams = [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]
-            bigram_probs = [logprobs[i] + logprobs[i+1] for i in range(len(tokens)-1)]
-            fig_ngram, ax_ngram = plt.subplots(figsize=(10, 5))
-            ax_ngram.bar(range(len(bigrams)), bigram_probs, color='green')
-            ax_ngram.set_title("N-Gram (Bigrams) Probability Sum")
-            ax_ngram.set_xlabel("Bigram Position")
-            ax_ngram.set_ylabel("Sum of Log Probabilities")
-            ax_ngram.set_xticks(range(len(bigrams)))
-            ax_ngram.set_xticklabels([f"{b[0]}->{b[1]}" for b in bigrams], rotation=45, ha="right")
-            ax_ngram.grid(True)
-            buf_ngram = io.BytesIO()
-            plt.savefig(buf_ngram, format="png", bbox_inches="tight", dpi=100)
-            buf_ngram.seek(0)
-            plt.close(fig_ngram)
-            return buf_ngram
+            return (gr.update(value="No finite log probabilities or tokens to visualize after filtering"), None, None, None, 1, 0)
+
+        # Paginate data for large inputs
+        total_pages = max(1, (len(logprobs) + page_size - 1) // page_size)
+        start_idx = page * page_size
+        end_idx = min((page + 1) * page_size, len(logprobs))
+        paginated_tokens = tokens[start_idx:end_idx]
+        paginated_logprobs = logprobs[start_idx:end_idx]
+        paginated_alternatives = top_alternatives[start_idx:end_idx] if top_alternatives else []
+
+        # 1. Main Log Probability Plot (Interactive Plotly)
+        main_fig = go.Figure()
+        main_fig.add_trace(go.Scatter(x=list(range(len(paginated_logprobs))), y=paginated_logprobs, mode='markers+lines', name='Log Prob', marker=dict(color='blue')))
+        main_fig.update_layout(
+            title="Log Probabilities of Generated Tokens",
+            xaxis_title="Token Position",
+            yaxis_title="Log Probability",
+            hovermode="closest",
+            clickmode='event+select'
+        )
+        main_fig.update_traces(
+            customdata=[f"Token: {tok}, Log Prob: {prob:.4f}, Position: {i+start_idx}" for i, (tok, prob) in enumerate(zip(paginated_tokens, paginated_logprobs))],
+            hovertemplate='<b>%{customdata}</b><extra></extra>'
+        )
 
-        # …
-        …
-        for i in range(len( …
-        …
+        # 2. Probability Drop Analysis (Interactive Plotly)
+        if len(paginated_logprobs) < 2:
+            drops_fig = go.Figure()
+            drops_fig.add_trace(go.Bar(x=list(range(len(paginated_logprobs)-1)), y=[0], name='Drop', marker_color='red'))
+        else:
+            drops = [paginated_logprobs[i+1] - paginated_logprobs[i] for i in range(len(paginated_logprobs)-1)]
+            drops_fig = go.Figure()
+            drops_fig.add_trace(go.Bar(x=list(range(len(drops))), y=drops, name='Drop', marker_color='red'))
+        drops_fig.update_layout(
+            title="Significant Probability Drops",
+            xaxis_title="Token Position",
+            yaxis_title="Log Probability Drop",
+            hovermode="closest",
+            clickmode='event+select'
+        )
+        drops_fig.update_traces(
+            customdata=[f"Drop: {drop:.4f}, From: {paginated_tokens[i]} to {paginated_tokens[i+1]}, Position: {i+start_idx}" for i, drop in enumerate(drops)],
+            hovertemplate='<b>%{customdata}</b><extra></extra>'
+        )
 
-        # …
-        …
+        # 3. Anomaly Detection (Interactive Plotly)
+        if not paginated_logprobs:
+            anomaly_fig = go.Figure()
+            anomaly_fig.add_trace(go.Scatter(x=[], y=[], mode='markers+lines', name='Log Prob', marker_color='blue'))
+        else:
+            z_scores = np.abs(stats.zscore(paginated_logprobs))
             outliers = z_scores > 2  # Threshold for outliers
+            anomaly_fig = go.Figure()
+            anomaly_fig.add_trace(go.Scatter(x=list(range(len(paginated_logprobs))), y=paginated_logprobs, mode='markers+lines', name='Log Prob', marker_color='blue'))
+            anomaly_fig.add_trace(go.Scatter(x=np.where(outliers)[0], y=[paginated_logprobs[i] for i in np.where(outliers)[0]], mode='markers', name='Outliers', marker_color='red'))
+        anomaly_fig.update_layout(
+            title="Log Probabilities with Outliers",
+            xaxis_title="Token Position",
+            yaxis_title="Log Probability",
+            hovermode="closest",
+            clickmode='event+select'
+        )
+        anomaly_fig.update_traces(
+            customdata=[f"Token: {tok}, Log Prob: {prob:.4f}, Position: {i+start_idx}, Outlier: {out}" for i, (tok, prob, out) in enumerate(zip(paginated_tokens, paginated_logprobs, outliers))],
+            hovertemplate='<b>%{customdata}</b><extra></extra>'
+        )
-        …
-        # 7. Autocorrelation
-        def create_autocorr_plot():
-            if not logprobs:
-                raise ValueError("No data for autocorrelation")
-            autocorr = correlate(logprobs, logprobs, mode='full')
-            autocorr = autocorr[len(autocorr)//2:] / len(logprobs)  # Normalize
-            fig_autocorr, ax_autocorr = plt.subplots(figsize=(10, 5))
-            ax_autocorr.plot(range(len(autocorr)), autocorr, color='purple')
-            ax_autocorr.set_title("Autocorrelation of Log Probabilities")
-            ax_autocorr.set_xlabel("Lag")
-            ax_autocorr.set_ylabel("Autocorrelation")
-            ax_autocorr.grid(True)
-            buf_autocorr = io.BytesIO()
-            plt.savefig(buf_autocorr, format="png", bbox_inches="tight", dpi=100)
-            buf_autocorr.seek(0)
-            plt.close(fig_autocorr)
-            return buf_autocorr
-
-        # 8. Smoothing (Moving Average)
-        def create_smoothing_plot():
-            if not logprobs:
-                raise ValueError("No data for smoothing")
-            window_size = 3
-            moving_avg = np.convolve(logprobs, np.ones(window_size)/window_size, mode='valid')
-            fig_smoothing, ax_smoothing = plt.subplots(figsize=(10, 5))
-            ax_smoothing.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Original")
-            ax_smoothing.plot(range(window_size-1, len(logprobs)), moving_avg, color="orange", label="Moving Average")
-            ax_smoothing.set_title("Log Probabilities with Moving Average")
-            ax_smoothing.set_xlabel("Token Position")
-            ax_smoothing.set_ylabel("Log Probability")
-            ax_smoothing.grid(True)
-            ax_smoothing.legend()
-            ax_smoothing.set_xticks([])  # Hide X-axis labels
-            buf_smoothing = io.BytesIO()
-            plt.savefig(buf_smoothing, format="png", bbox_inches="tight", dpi=100)
-            buf_smoothing.seek(0)
-            plt.close(fig_smoothing)
-            return buf_smoothing
-
-        # 9. Uncertainty Propagation (Variance of Top Logprobs)
-        def create_uncertainty_plot():
-            if not logprobs or not top_alternatives:
-                raise ValueError("No data for uncertainty propagation")
-            variances = []
-            for probs in top_alternatives:
-                if len(probs) > 1:
-                    values = [p[1] for p in probs]
-                    variances.append(np.var(values))
-                else:
-                    variances.append(0)
-            fig_uncertainty, ax_uncertainty = plt.subplots(figsize=(10, 5))
-            ax_uncertainty.plot(range(len(logprobs)), logprobs, marker="o", linestyle="-", color="b", label="Log Prob")
-            ax_uncertainty.fill_between(range(len(logprobs)), [lp - v for lp, v in zip(logprobs, variances)],
-                                        [lp + v for lp, v in zip(logprobs, variances)], color='gray', alpha=0.3, label="Uncertainty")
-            ax_uncertainty.set_title("Log Probabilities with Uncertainty Propagation")
-            ax_uncertainty.set_xlabel("Token Position")
-            ax_uncertainty.set_ylabel("Log Probability")
-            ax_uncertainty.grid(True)
-            ax_uncertainty.legend()
-            ax_uncertainty.set_xticks([])  # Hide X-axis labels
-            buf_uncertainty = io.BytesIO()
-            plt.savefig(buf_uncertainty, format="png", bbox_inches="tight", dpi=100)
-            buf_uncertainty.seek(0)
-            plt.close(fig_uncertainty)
-            return buf_uncertainty
-
-        # 10. Correlation Heatmap
-        def create_corr_plot():
-            if not logprobs or len(logprobs) < 2:
-                raise ValueError("Insufficient data for correlation heatmap")
-            corr_matrix = np.corrcoef(logprobs, rowvar=False)
-            fig_corr, ax_corr = plt.subplots(figsize=(10, 5))
-            im = ax_corr.imshow(corr_matrix, cmap='coolwarm', interpolation='nearest')
-            ax_corr.set_title("Correlation of Log Probabilities Across Positions")
-            ax_corr.set_xlabel("Token Position")
-            ax_corr.set_ylabel("Token Position")
-            plt.colorbar(im, ax=ax_corr, label="Correlation")
-            buf_corr = io.BytesIO()
-            plt.savefig(buf_corr, format="png", bbox_inches="tight", dpi=100)
-            buf_corr.seek(0)
-            plt.close(fig_corr)
-            return buf_corr
-
-        # 11. Token Type Correlation
-        def create_type_plot():
-            if not logprobs or not token_types:
-                raise ValueError("No data for token type correlation")
-            type_probs = {t: [] for t in set(token_types)}
-            for t, p in zip(token_types, logprobs):
-                type_probs[t].append(p)
-            fig_type, ax_type = plt.subplots(figsize=(10, 5))
-            for t in type_probs:
-                ax_type.bar(t, np.mean(type_probs[t]), yerr=np.std(type_probs[t]), capsize=5, label=t)
-            ax_type.set_title("Average Log Probability by Token Type")
-            ax_type.set_xlabel("Token Type")
-            ax_type.set_ylabel("Average Log Probability")
-            ax_type.grid(True)
-            ax_type.legend()
-            buf_type = io.BytesIO()
-            plt.savefig(buf_type, format="png", bbox_inches="tight", dpi=100)
-            buf_type.seek(0)
-            plt.close(fig_type)
-            return buf_type
-
-        # 12. Token Embedding Similarity vs. Probability (Simulated)
-        def create_embed_plot():
-            if not logprobs or not tokens:
-                raise ValueError("No data for embedding similarity")
-            simulated_embeddings = np.random.rand(len(tokens), 2)  # 2D embeddings
-            fig_embed, ax_embed = plt.subplots(figsize=(10, 5))
-            ax_embed.scatter(simulated_embeddings[:, 0], simulated_embeddings[:, 1], c=logprobs, cmap='viridis')
-            ax_embed.set_title("Token Embedding Similarity vs. Log Probability")
-            ax_embed.set_xlabel("Embedding Dimension 1")
-            ax_embed.set_ylabel("Embedding Dimension 2")
-            plt.colorbar(ax_embed.collections[0], ax=ax_embed, label="Log Probability")
-            buf_embed = io.BytesIO()
-            plt.savefig(buf_embed, format="png", bbox_inches="tight", dpi=100)
-            buf_embed.seek(0)
-            plt.close(fig_embed)
-            return buf_embed
-
-        # 13. Bayesian Inference (Simplified as Inferred Probabilities)
-        def create_bayesian_plot():
-            if not top_alternatives:
-                raise ValueError("No data for Bayesian inference")
-            entropies = [entropy([p[1] for p in probs], base=2) for probs in top_alternatives if len(probs) > 1]
-            fig_bayesian, ax_bayesian = plt.subplots(figsize=(10, 5))
-            ax_bayesian.bar(range(len(entropies)), entropies, color='orange')
-            ax_bayesian.set_title("Bayesian Inferred Uncertainty (Entropy)")
-            ax_bayesian.set_xlabel("Token Position")
-            ax_bayesian.set_ylabel("Entropy")
-            ax_bayesian.grid(True)
-            buf_bayesian = io.BytesIO()
-            plt.savefig(buf_bayesian, format="png", bbox_inches="tight", dpi=100)
-            buf_bayesian.seek(0)
-            plt.close(fig_bayesian)
-            return buf_bayesian
-
-        # 14. Graph-Based Analysis
-        def create_graph_plot():
-            if not tokens or len(tokens) < 2:
-                raise ValueError("Insufficient data for graph analysis")
-            G = nx.DiGraph()
-            for i in range(len(tokens)-1):
-                G.add_edge(tokens[i], tokens[i+1], weight=logprobs[i+1] - logprobs[i])
-            fig_graph, ax_graph = plt.subplots(figsize=(10, 5))
-            pos = nx.spring_layout(G)
-            nx.draw(G, pos, with_labels=True, node_color='lightblue', node_size=500, edge_color='gray', width=1, ax=ax_graph)
-            ax_graph.set_title("Graph of Token Transitions")
-            buf_graph = io.BytesIO()
-            plt.savefig(buf_graph, format="png", bbox_inches="tight", dpi=100)
-            buf_graph.seek(0)
-            plt.close(fig_graph)
-            return buf_graph
-
-        # 15. Dimensionality Reduction (t-SNE)
-        def create_tsne_plot():
-            if not logprobs or not top_alternatives:
-                raise ValueError("No data for t-SNE")
-            features = np.array([logprobs + [p[1] for p in alts[:2]] for logprobs, alts in zip([logprobs], top_alternatives)])
-            tsne = TSNE(n_components=2, random_state=42)
-            tsne_result = tsne.fit_transform(features.T)
-            fig_tsne, ax_tsne = plt.subplots(figsize=(10, 5))
-            scatter = ax_tsne.scatter(tsne_result[:, 0], tsne_result[:, 1], c=logprobs, cmap='viridis')
-            ax_tsne.set_title("t-SNE of Log Probabilities and Top Alternatives")
-            ax_tsne.set_xlabel("t-SNE Dimension 1")
-            ax_tsne.set_ylabel("t-SNE Dimension 2")
-            plt.colorbar(scatter, ax=ax_tsne, label="Log Probability")
-            buf_tsne = io.BytesIO()
-            plt.savefig(buf_tsne, format="png", bbox_inches="tight", dpi=100)
-            buf_tsne.seek(0)
-            plt.close(fig_tsne)
-            return buf_tsne
-
-        # 16. Interactive Heatmap
-        def create_heatmap_plot():
-            if not logprobs:
-                raise ValueError("No data for heatmap")
-            fig_heatmap, ax_heatmap = plt.subplots(figsize=(10, 5))
-            im = ax_heatmap.imshow([logprobs], cmap='viridis', aspect='auto')
-            ax_heatmap.set_title("Interactive Heatmap of Log Probabilities")
-            ax_heatmap.set_xlabel("Token Position")
-            ax_heatmap.set_ylabel("Probability Level")
-            plt.colorbar(im, ax=ax_heatmap, label="Log Probability")
-            buf_heatmap = io.BytesIO()
-            plt.savefig(buf_heatmap, format="png", bbox_inches="tight", dpi=100)
-            buf_heatmap.seek(0)
-            plt.close(fig_heatmap)
-            return buf_heatmap
-
-        # 17. Probability Distribution Plots (Box Plots for Top Logprobs)
-        def create_dist_plot():
-            if not logprobs or not top_alternatives:
-                raise ValueError("No data for probability distribution")
-            all_top_probs = [p[1] for alts in top_alternatives for p in alts]
-            fig_dist, ax_dist = plt.subplots(figsize=(10, 5))
-            ax_dist.boxplot([logprobs] + [p[1] for alts in top_alternatives for p in alts[:2]], labels=["Selected"] + ["Alt1", "Alt2"])
-            ax_dist.set_title("Probability Distribution of Top Tokens")
-            ax_dist.set_xlabel("Token Type")
-            ax_dist.set_ylabel("Log Probability")
-            ax_dist.grid(True)
-            buf_dist = io.BytesIO()
-            plt.savefig(buf_dist, format="png", bbox_inches="tight", dpi=100)
-            buf_dist.seek(0)
-            plt.close(fig_dist)
-            return buf_dist
-
-        # Create all plots safely
-        img_main_html = "Placeholder for Log Probability Plot"
-        img_cluster_html = "Placeholder for K-Means Clustering"
-        img_drops_html = "Placeholder for Probability Drops"
-        img_ngram_html = "Placeholder for N-Gram Analysis"
-        img_markov_html = "Placeholder for Markov Chain"
-        img_anomaly_html = "Placeholder for Anomaly Detection"
-        img_autocorr_html = "Placeholder for Autocorrelation"
-        img_smoothing_html = "Placeholder for Smoothing (Moving Average)"
-        img_uncertainty_html = "Placeholder for Uncertainty Propagation"
-        img_corr_html = "Placeholder for Correlation Heatmap"
-        img_type_html = "Placeholder for Token Type Correlation"
-        img_embed_html = "Placeholder for Embedding Similarity vs. Probability"
-        img_bayesian_html = "Placeholder for Bayesian Inference (Entropy)"
-        img_graph_html = "Placeholder for Graph of Token Transitions"
-        img_tsne_html = "Placeholder for t-SNE of Log Probabilities"
-        img_heatmap_html = "Placeholder for Interactive Heatmap"
-        img_dist_html = "Placeholder for Probability Distribution"
-
-        try:
-            buf_main = create_main_plot()
-            img_main_bytes = buf_main.getvalue()
-            img_main_base64 = base64.b64encode(img_main_bytes).decode("utf-8")
-            img_main_html = f'<img src="data:image/png;base64,{img_main_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create main plot: %s", str(e))
-
-        try:
-            buf_cluster = create_cluster_plot()
-            img_cluster_bytes = buf_cluster.getvalue()
-            img_cluster_base64 = base64.b64encode(img_cluster_bytes).decode("utf-8")
-            img_cluster_html = f'<img src="data:image/png;base64,{img_cluster_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create cluster plot: %s", str(e))
-
-        try:
-            buf_drops = create_drops_plot()
-            img_drops_bytes = buf_drops.getvalue()
-            img_drops_base64 = base64.b64encode(img_drops_bytes).decode("utf-8")
-            img_drops_html = f'<img src="data:image/png;base64,{img_drops_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create drops plot: %s", str(e))
-
-        try:
-            buf_ngram = create_ngram_plot()
-            img_ngram_bytes = buf_ngram.getvalue()
-            img_ngram_base64 = base64.b64encode(img_ngram_bytes).decode("utf-8")
-            img_ngram_html = f'<img src="data:image/png;base64,{img_ngram_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create ngram plot: %s", str(e))
-
-        try:
-            buf_markov = create_markov_plot()
-            img_markov_bytes = buf_markov.getvalue()
-            img_markov_base64 = base64.b64encode(img_markov_bytes).decode("utf-8")
-            img_markov_html = f'<img src="data:image/png;base64,{img_markov_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create markov plot: %s", str(e))
-
-        try:
-            buf_anomaly = create_anomaly_plot()
-            img_anomaly_bytes = buf_anomaly.getvalue()
-            img_anomaly_base64 = base64.b64encode(img_anomaly_bytes).decode("utf-8")
-            img_anomaly_html = f'<img src="data:image/png;base64,{img_anomaly_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create anomaly plot: %s", str(e))
-
-        try:
-            buf_autocorr = create_autocorr_plot()
-            img_autocorr_bytes = buf_autocorr.getvalue()
-            img_autocorr_base64 = base64.b64encode(img_autocorr_bytes).decode("utf-8")
-            img_autocorr_html = f'<img src="data:image/png;base64,{img_autocorr_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create autocorr plot: %s", str(e))
-
-        try:
-            buf_smoothing = create_smoothing_plot()
-            img_smoothing_bytes = buf_smoothing.getvalue()
-            img_smoothing_base64 = base64.b64encode(img_smoothing_bytes).decode("utf-8")
-            img_smoothing_html = f'<img src="data:image/png;base64,{img_smoothing_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create smoothing plot: %s", str(e))
-
-        try:
-            buf_uncertainty = create_uncertainty_plot()
-            img_uncertainty_bytes = buf_uncertainty.getvalue()
-            img_uncertainty_base64 = base64.b64encode(img_uncertainty_bytes).decode("utf-8")
-            img_uncertainty_html = f'<img src="data:image/png;base64,{img_uncertainty_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create uncertainty plot: %s", str(e))
-
-        try:
-            buf_corr = create_corr_plot()
-            img_corr_bytes = buf_corr.getvalue()
-            img_corr_base64 = base64.b64encode(img_corr_bytes).decode("utf-8")
-            img_corr_html = f'<img src="data:image/png;base64,{img_corr_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create correlation plot: %s", str(e))
-
-        try:
-            buf_type = create_type_plot()
-            img_type_bytes = buf_type.getvalue()
-            img_type_base64 = base64.b64encode(img_type_bytes).decode("utf-8")
-            img_type_html = f'<img src="data:image/png;base64,{img_type_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create type plot: %s", str(e))
-
-        try:
-            buf_embed = create_embed_plot()
-            img_embed_bytes = buf_embed.getvalue()
-            img_embed_base64 = base64.b64encode(img_embed_bytes).decode("utf-8")
-            img_embed_html = f'<img src="data:image/png;base64,{img_embed_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create embed plot: %s", str(e))
-
-        try:
-            buf_bayesian = create_bayesian_plot()
-            img_bayesian_bytes = buf_bayesian.getvalue()
-            img_bayesian_base64 = base64.b64encode(img_bayesian_bytes).decode("utf-8")
-            img_bayesian_html = f'<img src="data:image/png;base64,{img_bayesian_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create bayesian plot: %s", str(e))
-
-        try:
-            buf_graph = create_graph_plot()
-            img_graph_bytes = buf_graph.getvalue()
-            img_graph_base64 = base64.b64encode(img_graph_bytes).decode("utf-8")
-            img_graph_html = f'<img src="data:image/png;base64,{img_graph_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create graph plot: %s", str(e))
-
-        try:
-            buf_tsne = create_tsne_plot()
-            img_tsne_bytes = buf_tsne.getvalue()
-            img_tsne_base64 = base64.b64encode(img_tsne_bytes).decode("utf-8")
-            img_tsne_html = f'<img src="data:image/png;base64,{img_tsne_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create tsne plot: %s", str(e))
-
-        try:
-            buf_heatmap = create_heatmap_plot()
-            img_heatmap_bytes = buf_heatmap.getvalue()
-            img_heatmap_base64 = base64.b64encode(img_heatmap_bytes).decode("utf-8")
-            img_heatmap_html = f'<img src="data:image/png;base64,{img_heatmap_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create heatmap plot: %s", str(e))
-
-        try:
-            buf_dist = create_dist_plot()
-            img_dist_bytes = buf_dist.getvalue()
-            img_dist_base64 = base64.b64encode(img_dist_bytes).decode("utf-8")
-            img_dist_html = f'<img src="data:image/png;base64,{img_dist_base64}" style="max-width: 100%; height: auto;">'
-        except Exception as e:
-            logger.error("Failed to create distribution plot: %s", str(e))
 
-        # Create DataFrame for the table
+        # Create DataFrame for the table (paginated)
         table_data = []
-        for i, entry in enumerate(content):
+        for i, entry in enumerate(content[start_idx:end_idx]):
             logprob = ensure_float(entry.get("logprob", None))
             if logprob is not None and math.isfinite(logprob) and logprob >= prob_filter and "top_logprobs" in entry and entry["top_logprobs"] is not None:
                 token = entry["token"]
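
The added block chains three small techniques: ceil-division pagination, per-point Plotly hover text via customdata, and the |z| > 2 outlier rule. A self-contained sketch of all three on made-up data; the names and values here are illustrative, not the app's:

    import numpy as np
    import plotly.graph_objects as go
    from scipy import stats

    tokens = [f"tok{i}" for i in range(130)]                 # fake tokens
    logprobs = [-0.05 * (i % 7) - 0.1 for i in range(130)]   # fake log probs
    page, page_size = 2, 50

    # Ceil division: 130 items at 50 per page -> 3 pages
    total_pages = max(1, (len(logprobs) + page_size - 1) // page_size)
    start = page * page_size
    end = min((page + 1) * page_size, len(logprobs))
    page_toks, page_probs = tokens[start:end], logprobs[start:end]  # items 100..129

    # Same outlier rule as the diff: |z-score| > 2
    outliers = np.abs(stats.zscore(page_probs)) > 2

    fig = go.Figure(go.Scatter(
        x=list(range(len(page_probs))), y=page_probs, mode="markers+lines",
        # One customdata entry per point; %{customdata} splices it into the tooltip
        customdata=[f"Token: {t}, Position: {start + i}, Outlier: {o}"
                    for i, (t, o) in enumerate(zip(page_toks, outliers))],
        hovertemplate="<b>%{customdata}</b><extra></extra>",
    ))
    fig.show()

The <extra></extra> suffix in the hovertemplate suppresses the secondary box Plotly would otherwise append with the trace name.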
@@ -645,75 +205,52 @@ def visualize_logprobs(json_input, prob_filter=-1e9):
                 else None
             )
 
-        # Generate colored text
-        if …
-            min_logprob = min( …
-            max_logprob = max( …
+        # Generate colored text (paginated)
+        if paginated_logprobs:
+            min_logprob = min(paginated_logprobs)
+            max_logprob = max(paginated_logprobs)
             if max_logprob == min_logprob:
-                normalized_probs = [0.5] * len( …
+                normalized_probs = [0.5] * len(paginated_logprobs)
             else:
                 normalized_probs = [
-                    (lp - min_logprob) / (max_logprob - min_logprob) for lp in …
+                    (lp - min_logprob) / (max_logprob - min_logprob) for lp in paginated_logprobs
                 ]
 
             colored_text = ""
-            for i, (token, norm_prob) in enumerate(zip( …
+            for i, (token, norm_prob) in enumerate(zip(paginated_tokens, normalized_probs)):
                 r = int(255 * (1 - norm_prob))  # Red for low confidence
                 g = int(255 * norm_prob)  # Green for high confidence
                 b = 0
                 color = f"rgb({r}, {g}, {b})"
                 colored_text += f'<span style="color: {color}; font-weight: bold;">{token}</span>'
-                if i < len( …
+                if i < len(paginated_tokens) - 1:
                     colored_text += " "
             colored_text_html = f"<p>{colored_text}</p>"
         else:
             colored_text_html = "No finite log probabilities to display."
 
-        # Top 3 Token Log Probabilities
+        # Top 3 Token Log Probabilities (paginated)
         alt_viz_html = ""
-        if …
-            alt_viz_html = "<h3>Top 3 Token Log Probabilities</h3><ul>"
-            for i, (token, probs) in enumerate(zip( …
-                alt_viz_html += f"<li>Position {i} (Token: {token}):<br>"
+        if paginated_logprobs and paginated_alternatives:
+            alt_viz_html = "<h3>Top 3 Token Log Probabilities (Paginated)</h3><ul>"
+            for i, (token, probs) in enumerate(zip(paginated_tokens, paginated_alternatives)):
+                alt_viz_html += f"<li>Position {i+start_idx} (Token: {token}):<br>"
                 for tok, prob in probs:
                     alt_viz_html += f"{tok}: {prob:.4f}<br>"
                 alt_viz_html += "</li>"
            alt_viz_html += "</ul>"
 
-
-        def buffer_to_html(buf):
-            if isinstance(buf, str):  # If it's an error message
-                return buf
-            img_bytes = buf.getvalue()
-            img_base64 = base64.b64encode(img_bytes).decode("utf-8")
-            return f'<img src="data:image/png;base64,{img_base64}" style="max-width: 100%; height: auto;">'
-
-        return (
-            buffer_to_html(img_main_html), df, colored_text_html, alt_viz_html,
-            buffer_to_html(img_cluster_html), buffer_to_html(img_drops_html), buffer_to_html(img_ngram_html),
-            buffer_to_html(img_markov_html), buffer_to_html(img_anomaly_html), buffer_to_html(img_autocorr_html),
-            buffer_to_html(img_smoothing_html), buffer_to_html(img_uncertainty_html), buffer_to_html(img_corr_html),
-            buffer_to_html(img_type_html), buffer_to_html(img_embed_html), buffer_to_html(img_bayesian_html),
-            buffer_to_html(img_graph_html), buffer_to_html(img_tsne_html), buffer_to_html(img_heatmap_html),
-            buffer_to_html(img_dist_html)
-        )
+        return (main_fig, df, colored_text_html, alt_viz_html, drops_fig, anomaly_fig, total_pages, page)
 
     except Exception as e:
         logger.error("Visualization failed: %s", str(e))
-        return (
-            f"Error: {str(e)}", None, None, None, "Placeholder for K-Means Clustering", "Placeholder for Probability Drops",
-            "Placeholder for N-Gram Analysis", "Placeholder for Markov Chain", "Placeholder for Anomaly Detection",
-            "Placeholder for Autocorrelation", "Placeholder for Smoothing (Moving Average)", "Placeholder for Uncertainty Propagation",
-            "Placeholder for Correlation Heatmap", "Placeholder for Token Type Correlation", "Placeholder for Embedding Similarity vs. Probability",
-            "Placeholder for Bayesian Inference (Entropy)", "Placeholder for Graph of Token Transitions", "Placeholder for t-SNE of Log Probabilities",
-            "Placeholder for Interactive Heatmap", "Placeholder for Probability Distribution"
-        )
+        return (gr.update(value=f"Error: {str(e)}"), None, "No finite log probabilities to display.", None, gr.update(value="No data for probability drops."), gr.update(value="No data for anomalies."), 1, 0)
 
-# Gradio interface with …
+# Gradio interface with interactive layout and pagination
 with gr.Blocks(title="Log Probability Visualizer") as app:
     gr.Markdown("# Log Probability Visualizer")
     gr.Markdown(
-        "Paste your JSON or Python dictionary log prob data below to visualize the tokens and their probabilities. Use the filter …
+        "Paste your JSON or Python dictionary log prob data below to visualize the tokens and their probabilities. Use the filter and pagination to navigate large inputs."
     )
 
     with gr.Row():
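
The colored-text block min-max normalizes each page's log probs into [0, 1] and linearly blends red (low confidence) to green (high confidence). A tiny worked example of that mapping; the logprob_to_rgb helper and sample values are illustrative, not from the commit:

    def logprob_to_rgb(lp, lo, hi):
        # Min-max normalize, with the same 0.5 fallback for a constant page
        t = 0.5 if hi == lo else (lp - lo) / (hi - lo)
        return f"rgb({int(255 * (1 - t))}, {int(255 * t)}, 0)"

    lps = [-4.0, -1.0, -0.05]
    lo, hi = min(lps), max(lps)
    print([logprob_to_rgb(lp, lo, hi) for lp in lps])
    # -> ['rgb(255, 0, 0)', 'rgb(61, 193, 0)', 'rgb(0, 255, 0)']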
@@ -725,61 +262,54 @@ with gr.Blocks(title="Log Probability Visualizer") as app:
         )
         with gr.Column(scale=1):
             prob_filter = gr.Slider(minimum=-1e9, maximum=0, value=-1e9, label="Log Probability Filter (≥)")
+            page_size = gr.Number(value=50, label="Page Size", precision=0, minimum=10, maximum=1000)
+            page = gr.Number(value=0, label="Page Number", precision=0, minimum=0)
 
-    with gr. …
-    …
-        plot_output = gr.HTML(label="Log Probability Plot (Click for Tokens)", value="Placeholder for Log Probability Plot")
-        table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives", value=None)
-        with gr.Row():
-            text_output = gr.HTML(label="Colored Text (Confidence Visualization)", value="Placeholder for Colored Text (Confidence Visualization)")
-            alt_viz_output = gr.HTML(label="Top 3 Token Log Probabilities", value="Placeholder for Top 3 Token Log Probabilities")
-
-    with gr.Tab("Clustering & Patterns"):
-        with gr.Row():
-            cluster_output = gr.HTML(label="K-Means Clustering", value="Placeholder for K-Means Clustering")
-            drops_output = gr.HTML(label="Probability Drops", value="Placeholder for Probability Drops")
-        with gr.Row():
-            ngram_output = gr.HTML(label="N-Gram Analysis", value="Placeholder for N-Gram Analysis")
-            markov_output = gr.HTML(label="Markov Chain", value="Placeholder for Markov Chain")
-
-    with gr.Tab("Time Series & Anomalies"):
-        with gr.Row():
-            anomaly_output = gr.HTML(label="Anomaly Detection", value="Placeholder for Anomaly Detection")
-            autocorr_output = gr.HTML(label="Autocorrelation", value="Placeholder for Autocorrelation")
-        with gr.Row():
-            smoothing_output = gr.HTML(label="Smoothing (Moving Average)", value="Placeholder for Smoothing (Moving Average)")
-            uncertainty_output = gr.HTML(label="Uncertainty Propagation", value="Placeholder for Uncertainty Propagation")
-
-    with gr.Tab("Correlation & Types"):
-        with gr.Row():
-            corr_output = gr.HTML(label="Correlation Heatmap", value="Placeholder for Correlation Heatmap")
-            type_output = gr.HTML(label="Token Type Correlation", value="Placeholder for Token Type Correlation")
+    with gr.Row():
+        plot_output = gr.Plot(label="Log Probability Plot (Click for Tokens)")
+        drops_output = gr.Plot(label="Probability Drops (Click for Details)")
 
-        …
-            bayesian_output = gr.HTML(label="Bayesian Inference (Entropy)", value="Placeholder for Bayesian Inference (Entropy)")
-        with gr.Row():
-            graph_output = gr.HTML(label="Graph of Token Transitions", value="Placeholder for Graph of Token Transitions")
-            tsne_output = gr.HTML(label="t-SNE of Log Probabilities", value="Placeholder for t-SNE of Log Probabilities")
+    with gr.Row():
+        anomaly_output = gr.Plot(label="Anomaly Detection (Click for Details)")
+        table_output = gr.Dataframe(label="Token Log Probabilities and Top Alternatives")
 
-        …
-            dist_output = gr.HTML(label="Probability Distribution", value="Placeholder for Probability Distribution")
+    with gr.Row():
+        text_output = gr.HTML(label="Colored Text (Confidence Visualization)")
+        alt_viz_output = gr.HTML(label="Top 3 Token Log Probabilities")
 
     btn = gr.Button("Visualize")
     btn.click(
         fn=visualize_logprobs,
-        inputs=[json_input, prob_filter],
-        outputs=[
-            …
+        inputs=[json_input, prob_filter, page_size, page],
+        outputs=[plot_output, table_output, text_output, alt_viz_output, drops_output, anomaly_output, gr.State(visible=False), gr.State(visible=False)],
+    )
+
+    # Pagination controls
+    with gr.Row():
+        prev_btn = gr.Button("Previous Page")
+        next_btn = gr.Button("Next Page")
+        total_pages_output = gr.Number(label="Total Pages", interactive=False, visible=False)
+        current_page_output = gr.Number(label="Current Page", interactive=False, visible=False)
+
+    def update_page(json_input, prob_filter, page_size, current_page, action):
+        if action == "prev" and current_page > 0:
+            current_page -= 1
+        elif action == "next":
+            total_pages = visualize_logprobs(json_input, prob_filter, page_size, 0)[6]  # Get total pages
+            if current_page < total_pages - 1:
+                current_page += 1
+        return gr.update(value=current_page), gr.update(value=total_pages)
+
+    prev_btn.click(
+        fn=lambda *args: update_page(*args, "prev"),
+        inputs=[json_input, prob_filter, page_size, page, gr.State()],
+        outputs=[page, total_pages_output]
+    )
+
+    next_btn.click(
+        fn=lambda *args: update_page(*args, "next"),
+        inputs=[json_input, prob_filter, page_size, page, gr.State()],
+        outputs=[page, total_pages_output]
     )
 
 app.launch()
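
The pagination handlers above re-run visualize_logprobs to learn the page count and step the page index with edge checks. A stripped-down sketch of the same prev/next wiring, assuming current Gradio Blocks semantics; this toy demo is not the app's code:

    import gradio as gr

    with gr.Blocks() as demo:
        page = gr.Number(value=0, label="Page Number", precision=0)
        total = gr.Number(value=3, label="Total Pages", interactive=False)
        prev_btn = gr.Button("Previous Page")
        next_btn = gr.Button("Next Page")

        def step(cur, tot, delta):
            # Clamp to [0, tot - 1] so clicks past either end are no-ops
            return min(max(int(cur) + delta, 0), int(tot) - 1)

        prev_btn.click(lambda c, t: step(c, t, -1), inputs=[page, total], outputs=page)
        next_btn.click(lambda c, t: step(c, t, +1), inputs=[page, total], outputs=page)

    demo.launch()

Clamping in one helper keeps both buttons safe no-ops at the boundaries, which is simpler than writing separate prev and next branches.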