# Source: Hugging Face Space gsaltintas/tokenizer-comparison
# Space status when captured: Running
# File size: 17,414 bytes

import html
from collections import Counter

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import tokenize_with_hf, tokenize_with_tiktoken


def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize ``text`` with every selected model and build all UI outputs.

    Args:
        text: The text to tokenize.
        selected_models: Iterable of model keys. ``"gpt-4"`` and ``"gpt-2"``
            are routed to ``tokenize_with_tiktoken``; every other key goes to
            ``tokenize_with_hf``.
        show_details: When true, the detailed analysis markdown is generated;
            otherwise that slot is an empty string.

    Returns:
        A 6-tuple ``(efficiency_markdown, tokenization_html,
        token_ids_markdown, detailed_markdown, efficiency_chart,
        token_distribution_chart)``. If ``text`` is empty or whitespace-only,
        a prompt message is returned in the first slot, the other text slots
        are empty strings and both chart slots are ``None``.
    """
    if not text.strip():
        return ("Please enter some text to tokenize.", "", "", "", None, None)

    tiktoken_models = ("gpt-4", "gpt-2")
    results = {
        name: (
            tokenize_with_tiktoken(text, name)
            if name in tiktoken_models
            else tokenize_with_hf(text, name)
        )
        for name in selected_models
    }

    # Build each output in turn from the shared results mapping.
    ranking_md, tokens_html, ids_md = generate_basic_comparison(results)

    details_md = ""
    if show_details:
        details_md = generate_detailed_analysis(results)

    count_chart = create_efficiency_chart(results)
    type_chart = create_token_distribution_chart(results)

    return (ranking_md, tokens_html, ids_md, details_md, count_chart, type_chart)


def generate_basic_comparison(results):
    """Build the efficiency ranking plus the tokenization and token-ID views.

    Args:
        results: Mapping of model key to a tokenizer result dict. Successful
            results are read for ``model``, ``token_count`` and
            ``compression_ratio``; failed results (those containing an
            ``error`` key) are read for ``model`` and ``error``.

    Returns:
        A 3-tuple ``(ranking_markdown, tokenization_html, token_ids_markdown)``.
        For empty ``results`` the tuple is
        ``("No results to display.", "", "")``.
    """
    if not results:
        return "No results to display.", "", ""

    def _rank_key(item):
        # Failed tokenizers sort after every successful one; otherwise a
        # missing or placeholder token count would either raise KeyError or
        # rank the failure as the "most efficient" tokenizer.
        _, result = item
        return ("error" in result, result.get("token_count", 0))

    ranked = sorted(results.items(), key=_rank_key)

    ranking_output = ["## 🏆 Efficiency Ranking (Fewer tokens = more efficient)"]
    for position, (_, result) in enumerate(ranked, start=1):
        if "error" in result:
            ranking_output.append(
                f"{position}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            ranking_output.append(
                f"{position}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    # Generate interactive tokenization display
    tokenization_html = generate_interactive_tokenization(results)

    # Generate token ID tables
    token_ids_display = generate_token_ids_display(results)

    return "\n".join(ranking_output), tokenization_html, token_ids_display


def generate_interactive_tokenization(results):
    """Generate HTML with hover highlighting across tokenizers.

    Every value interpolated into the markup (model name, error text, token
    text, token type, IDs) is HTML-escaped, so characters such as ``<``, ``&``
    or quotes in the input text cannot break or inject markup.

    Args:
        results: Mapping of model key to a tokenizer result dict. Failed
            results carry ``model`` and ``error``. Successful results carry
            ``model``, ``token_count``, ``encoding``, ``compression_ratio``
            and ``tokens``; each token is a dict with ``text``, ``id``,
            ``type`` and ``is_subword``.

    Returns:
        An HTML string: a ``<style>`` block followed by one container per
        tokenizer. For empty ``results`` a short placeholder paragraph.
    """
    if not results:
        return "<p>No tokenization results to display.</p>"

    def esc(value):
        # Escape for both element content and double-quoted attribute values.
        return html.escape(str(value), quote=True)

    # The hover handlers are self-contained and read the token text from the
    # element's own data attribute. This avoids embedding token text inside a
    # JavaScript string literal (which breaks on quotes, backslashes and
    # newlines) and does not depend on a separate <script> block, which
    # gr.HTML is documented as not executing.
    on_hover = (
        "document.querySelectorAll('.token').forEach("
        "t => t.classList.toggle('highlighted', "
        "t.dataset.text === this.dataset.text))"
    )
    on_leave = (
        "document.querySelectorAll('.token.highlighted').forEach("
        "t => t.classList.remove('highlighted'))"
    )

    html_parts = []
    html_parts.append("""
    <style>
    .tokenizer-container {
        margin-bottom: 20px;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        padding: 15px;
        background: white;
    }
    .tokenizer-header {
        font-weight: bold;
        font-size: 18px;
        margin-bottom: 10px;
        color: #2c3e50;
    }
    .token-display {
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
        line-height: 1.8;
        word-wrap: break-word;
    }
    .token {
        display: inline-block;
        margin: 2px;
        padding: 4px 8px;
        border-radius: 4px;
        border: 1px solid;
        cursor: pointer;
        transition: all 0.2s ease;
        position: relative;
        font-size: 14px;
    }
    .token:hover {
        transform: scale(1.1);
        z-index: 10;
        box-shadow: 0 2px 8px rgba(0,0,0,0.2);
    }
    .token.highlighted {
        background: #ff6b6b !important;
        border-color: #e55353 !important;
        color: white !important;
        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
    }
    .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
    .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
    .token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
    .token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
    .token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
    .token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
    .token-subword {
        background: #fff8e1 !important;
        border-color: #ffc107 !important;
        border-style: dashed !important;
    }
    .token-stats {
        display: inline-block;
        margin-left: 10px;
        padding: 2px 6px;
        background: #f8f9fa;
        border-radius: 3px;
        font-size: 12px;
        color: #666;
    }
    </style>
    """)

    for result in results.values():
        model_name = esc(result["model"])

        if "error" in result:
            error_text = esc(result["error"])
            html_parts.append(f"""
            <div class="tokenizer-container">
                <div class="tokenizer-header">{model_name} ❌</div>
                <div style="color: #d32f2f; font-style: italic;">Error: {error_text}</div>
            </div>
            """)
            continue

        tokens = result["tokens"]
        token_count = esc(result["token_count"])
        encoding = esc(result["encoding"])
        compression = f"{result['compression_ratio']:.2f}"

        html_parts.append(f"""
        <div class="tokenizer-container">
            <div class="tokenizer-header">
                {model_name}
                <span class="token-stats">
                    {token_count} tokens |
                    {encoding} |
                    {compression}x compression
                </span>
            </div>
            <div class="token-display">
        """)

        # Add tokens with hover functionality
        subword_count = 0
        for position, token in enumerate(tokens):
            token_text = token["text"]
            # Whitespace-only tokens would be invisible, so show a dot instead.
            display_text = token_text if token_text.strip() else "·"

            token_class = f"token token-{token['type']}"
            if token["is_subword"]:
                token_class += " token-subword"
                subword_count += 1

            tooltip = (
                f"Text: '{token_text}' | ID: {token['id']} | "
                f"Type: {token['type']} | Subword: {token['is_subword']}"
            )
            token_id = esc(token["id"])

            html_parts.append(f"""
                <span class="{esc(token_class)}"
                      data-text="{esc(token_text)}"
                      data-id="{token_id}"
                      data-position="{position}"
                      title="{esc(tooltip)}"
                      onmouseover="{on_hover}"
                      onmouseout="{on_leave}">
                    {esc(display_text)}
                </span>
            """)

        total = len(tokens)
        # Guard the percentage against an empty token list.
        subword_pct = subword_count / total * 100 if total else 0
        html_parts.append(f"""
            </div>
            <div style="margin-top: 8px; font-size: 12px; color: #666;">
                Subwords: {subword_count}/{total}
                ({subword_pct:.1f}%)
            </div>
        </div>
        """)

    return "".join(html_parts)


def generate_token_ids_display(results):
    """Render each tokenizer's token IDs as a markdown report.

    Args:
        results: Mapping of model key to a tokenizer result dict. Failed
            results (containing ``error``) produce an error section; the
            others are read for ``model``, ``vocab_size``, ``encoding`` and
            ``tokens`` (each token providing an ``id``).

    Returns:
        Markdown text with one section per tokenizer: a vocab/encoding line,
        the IDs in a fenced block (ten per row), total/unique counts and,
        when there is at least one token, the min-max ID range. For empty
        ``results`` a short placeholder message.
    """
    if not results:
        return "No token IDs to display."

    ids_per_row = 10
    sections = ["## 🔢 Token IDs by Tokenizer"]

    for entry in results.values():
        if "error" in entry:
            sections.extend(
                [f"\n### {entry['model']} ❌", f"Error: {entry['error']}"]
            )
            continue

        sections.append(f"\n### {entry['model']}")
        sections.append(
            f"**Vocab Size**: {entry['vocab_size']:,} | **Encoding**: {entry['encoding']}"
        )

        raw_ids = [tok["id"] for tok in entry["tokens"]]
        id_strings = [str(value) for value in raw_ids]

        # Fenced block with the IDs wrapped at a fixed number per row.
        rows = [
            " ".join(id_strings[start : start + ids_per_row])
            for start in range(0, len(id_strings), ids_per_row)
        ]
        sections += ["```", "\n".join(rows), "```"]

        distinct = len(set(id_strings))
        sections.append(
            f"**Stats**: {len(id_strings)} total tokens, {distinct} unique IDs"
        )

        # min()/max() need at least one value, so the range is optional.
        if raw_ids:
            sections.append(f"**ID Range**: {min(raw_ids)} - {max(raw_ids)}")

    return "\n".join(sections)


def generate_detailed_analysis(results):
    """Build the markdown for the detailed cross-tokenizer analysis.

    The report has three sections: token texts common to every successful
    tokenizer (a sample of at most 15, in sorted order so the output is
    stable between runs), the per-tokenizer token type counts, and the
    per-tokenizer subword counts with percentages.

    Args:
        results: Mapping of model key to a tokenizer result dict. Results
            containing an ``error`` key are skipped; the others are read for
            ``model`` and ``tokens`` (each token providing ``text``, ``type``
            and ``is_subword``).

    Returns:
        Markdown text, or a short notice when fewer than two results are
        given.
    """
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    successful = [result for result in results.values() if "error" not in result]

    output = ["## 🔍 Detailed Analysis"]

    # Find common tokens
    token_sets = [
        {token["text"] for token in result["tokens"]} for result in successful
    ]
    if token_sets:
        common_tokens = set.intersection(*token_sets)
        output.append(f"\n### Common Tokens ({len(common_tokens)})")
        if common_tokens:
            # Sets have no stable iteration order, so sort before sampling.
            # Whitespace-only tokens are shown as a dot, matching the
            # interactive tokenization view.
            common_display = [
                f"`{token}`" if token.strip() else "`·`"
                for token in sorted(common_tokens)[:15]
            ]
            output.append(" ".join(common_display))
        else:
            output.append("No common tokens found.")

    # Token type distribution
    output.append("\n### Token Type Distribution")
    for result in successful:
        type_counts = Counter(token["type"] for token in result["tokens"])
        type_display = ", ".join(
            f"{type_}: {count}" for type_, count in type_counts.items()
        )
        output.append(f"**{result['model']}**: {type_display}")

    # Subword analysis
    output.append("\n### Subword Analysis")
    for result in successful:
        tokens = result["tokens"]
        subword_total = sum(1 for token in tokens if token["is_subword"])
        subword_ratio = subword_total / len(tokens) * 100 if tokens else 0
        output.append(
            f"**{result['model']}**: {subword_total} subwords ({subword_ratio:.1f}%)"
        )

    return "\n".join(output)


def create_efficiency_chart(results):
    """Create a bar chart of token counts per tokenizer.

    Args:
        results: Mapping of model key to a tokenizer result dict. Results
            containing an ``error`` key are left out; the others are read for
            ``model`` and ``token_count``.

    Returns:
        A plotly ``go.Figure`` with one bar per successful tokenizer, or
        ``None`` when ``results`` is empty or every tokenizer failed.
    """
    if not results:
        return None

    successful = [result for result in results.values() if "error" not in result]
    if not successful:
        return None

    # Only the display name and the token count are plotted; the previously
    # collected compression ratios were never used and have been dropped.
    models = [result["model"] for result in successful]
    token_counts = [result["token_count"] for result in successful]

    fig = go.Figure()

    # Add token count bars
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )

    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )

    return fig


def create_token_distribution_chart(results):
    """Create a bar chart of token type counts, grouped by tokenizer.

    Args:
        results: Mapping of model key to a tokenizer result dict. Results
            containing an ``error`` key are skipped; the others are read for
            ``model`` and ``tokens`` (each token providing a ``type``).

    Returns:
        A plotly express bar figure coloured by token type, or ``None`` when
        there are no results or no rows to plot.
    """
    if not results:
        return None

    rows = []
    for entry in results.values():
        if "error" in entry:
            continue
        # One row per (tokenizer, token type) pair.
        per_type = Counter(tok["type"] for tok in entry["tokens"])
        rows.extend(
            {
                "Tokenizer": entry["model"],
                "Token Type": kind,
                "Count": total,
            }
            for kind, total in per_type.items()
        )

    if not rows:
        return None

    frame = pd.DataFrame(rows)
    return px.bar(
        frame,
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )


# Custom CSS passed to gr.Blocks(css=...) below: sets the app-wide font on
# .gradio-container and styles .token-display (monospace font, light
# background, padding). NOTE: generate_interactive_tokenization() also emits a
# .token-display rule in its own <style> block.
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.token-display {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    background: #f8f9fa;
    padding: 8px;
    border-radius: 4px;
    font-size: 0.9em;
}
"""

# Create the Gradio interface.
# Layout, top to bottom: intro markdown, an input row (text box + tokenizer
# selection), then one row each for the efficiency ranking, the interactive
# tokenization HTML, the token IDs, the (initially hidden) detailed analysis
# and the two charts, followed by a static "about" section.
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
    # 🔤 Advanced Tokenizer Comparison Tool
    
    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
    
    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
    """)

    # Input row: wide text box on the left, tokenizer selection on the right.
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )

        with gr.Column(scale=1):
            # compare_tokenizers() sends "gpt-4" and "gpt-2" to the tiktoken
            # helper and every other choice to the Hugging Face helper.
            model_selector = gr.CheckboxGroup(
                choices=[
                    "gpt-4",
                    "gpt-2",
                    "llama-2",
                    "llama-3",
                    "gemma-2",
                    "qwen3",
                    "qwen2.5",
                    "bert",
                    "bloom",
                    "aya-expanse",
                    "comma",
                    "roberta",
                    "distilbert",
                    "tokenmonster",
                    "byt5",
                ],
                value=["gpt-4", "llama-3", "gpt-2"],
                label="Select tokenizers to compare",
            )

            show_details = gr.Checkbox(label="Show detailed analysis", value=False)

    # Output rows; each starts with placeholder text until a comparison runs.
    with gr.Row():
        with gr.Column():
            efficiency_output = gr.Markdown(
                label="Efficiency Ranking",
                value="Enter text above to see efficiency comparison...",
            )

    with gr.Row():
        with gr.Column():
            tokenization_display = gr.HTML(
                label="Interactive Tokenization (Hover to highlight across tokenizers)",
                value="<p>Enter text above to see interactive tokenization...</p>",
            )

    with gr.Row():
        with gr.Column():
            token_ids_output = gr.Markdown(
                label="Token IDs", value="Token IDs will appear here..."
            )

    with gr.Row():
        with gr.Column():
            # Hidden by default; shown via toggle_details() when the
            # "Show detailed analysis" checkbox is ticked.
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)

    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")

    # Update visibility of detailed analysis
    def toggle_details(show_details):
        """Return a Gradio update that shows or hides the detailed analysis."""
        return gr.update(visible=show_details)

    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)

    # Main comparison function
    def update_comparison(text, models, details):
        """Run compare_tokenizers() and return its six outputs unchanged.

        The returned order matches the ``outputs`` list of the change
        handlers registered below.
        """
        efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart = (
            compare_tokenizers(text, models, details)
        )
        return efficiency, tokenization_html, token_ids, detailed, eff_chart, dist_chart

    # Auto-update on changes: any change to the text, the tokenizer selection
    # or the details checkbox re-runs the full comparison. The checkbox thus
    # has two change handlers: toggle_details above and this one.
    for component in [text_input, model_selector, show_details]:
        component.change(
            fn=update_comparison,
            inputs=[text_input, model_selector, show_details],
            outputs=[
                efficiency_output,
                tokenization_display,
                token_ids_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )

    # Static reference section shown below the outputs.
    gr.Markdown("""
    ---
    ### About the Models
    
    - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2/3**: Meta's models using SentencePiece
    - **Gemma-2**: Google's model with SentencePiece
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT/DistilBERT**: Google's models with WordPiece
    - **RoBERTa**: Facebook's model with BPE
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma (Common Pile)**: Common Pile's model with BPE
    
    ### Features
    - **Efficiency Ranking**: Compare token counts across models
    - **Subword Analysis**: See how models handle subwords
    - **Token Types**: Classification of word/number/punctuation tokens
    - **Visual Charts**: Interactive plots for comparison
    - **Detailed Analysis**: Common tokens and distribution stats
    """)

if __name__ == "__main__":
    # Launch the app once; the original repeated this call three times.
    demo.launch()