import html
from collections import Counter

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import tokenize_with_hf, tokenize_with_tiktoken
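# Expected result schema for tokenize_with_tiktoken / tokenize_with_hf, inferred from
# how results are consumed below (utils itself is not shown here): a dict with
# "model", "encoding", "vocab_size", "token_count", "compression_ratio", and
# "tokens" (each token a dict with "text", "id", "type", "is_subword"),
# or a dict containing an "error" key on failure.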
def compare_tokenizers(text, selected_models, show_details=False):
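    """Tokenize the input with every selected model and build all UI outputs.

    Returns a 6-tuple: (efficiency ranking markdown, interactive tokenization HTML,
    token-ID markdown, detailed-analysis markdown, efficiency chart,
    token-type distribution chart).
    """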
if not text.strip():
return "Please enter some text to tokenize.", "", "", "", None, None
results = {}
for model in selected_models:
if model in ["gpt-4", "gpt-2"]:
results[model] = tokenize_with_tiktoken(text, model)
else:
results[model] = tokenize_with_hf(text, model)
# Generate outputs
efficiency_output, tokenization_html, token_ids_output = generate_basic_comparison(
results
)
detailed_output = generate_detailed_analysis(results) if show_details else ""
efficiency_chart = create_efficiency_chart(results)
token_distribution_chart = create_token_distribution_chart(results)
return (
efficiency_output,
tokenization_html,
token_ids_output,
detailed_output,
efficiency_chart,
token_distribution_chart,
)
def generate_basic_comparison(results):
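    """Build the efficiency ranking, interactive tokenization HTML, and token-ID display."""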
if not results:
return "No results to display.", "", ""
# Efficiency ranking
    # Rank by token count; error results may lack a count, so sort them last
    sorted_models = sorted(
        results.items(), key=lambda x: x[1].get("token_count", float("inf"))
    )
ranking_output = []
ranking_output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
for i, (model, result) in enumerate(sorted_models):
if "error" in result:
ranking_output.append(
f"{i + 1}. **{result['model']}**: ❌ Error - {result['error']}"
)
else:
ranking_output.append(
f"{i + 1}. **{result['model']}**: {result['token_count']} tokens "
f"({result['compression_ratio']:.2f}x compression)"
)
# Generate interactive tokenization display
tokenization_html = generate_interactive_tokenization(results)
# Generate token ID tables
token_ids_display = generate_token_ids_display(results)
return "\n".join(ranking_output), tokenization_html, token_ids_display
def generate_interactive_tokenization(results):
"""Generate HTML with hover highlighting across tokenizers"""
if not results:
return "<p>No tokenization results to display.</p>"
html_parts = []
html_parts.append("""
<style>
.tokenizer-container {
margin-bottom: 20px;
border: 1px solid #e0e0e0;
border-radius: 8px;
padding: 15px;
background: white;
}
.tokenizer-header {
font-weight: bold;
font-size: 18px;
margin-bottom: 10px;
color: #2c3e50;
}
.token-display {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
line-height: 1.8;
word-wrap: break-word;
}
.token {
display: inline-block;
margin: 2px;
padding: 4px 8px;
border-radius: 4px;
border: 1px solid;
cursor: pointer;
transition: all 0.2s ease;
position: relative;
font-size: 14px;
}
.token:hover {
transform: scale(1.1);
z-index: 10;
box-shadow: 0 2px 8px rgba(0,0,0,0.2);
}
.token.highlighted {
background: #ff6b6b !important;
border-color: #e55353 !important;
color: white !important;
box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
}
.token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
.token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
.token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
.token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
.token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
.token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
.token-subword {
background: #fff8e1 !important;
border-color: #ffc107 !important;
border-style: dashed !important;
}
.token-stats {
display: inline-block;
margin-left: 10px;
padding: 2px 6px;
background: #f8f9fa;
border-radius: 3px;
font-size: 12px;
color: #666;
}
</style>
<script>
    function highlightToken(text) {
        // Toggle the highlight on every token whose text matches the hovered one,
        // clearing it on all others in a single pass
        document.querySelectorAll('.token').forEach(token => {
            token.classList.toggle('highlighted', token.dataset.text === text);
        });
    }
function clearHighlights() {
document.querySelectorAll('.token').forEach(token => {
token.classList.remove('highlighted');
});
}
</script>
""")
for model, result in results.items():
if "error" in result:
html_parts.append(f"""
<div class="tokenizer-container">
<div class="tokenizer-header">{result["model"]} ❌</div>
<div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
</div>
""")
continue
html_parts.append(f"""
<div class="tokenizer-container">
<div class="tokenizer-header">
{result["model"]}
<span class="token-stats">
{result["token_count"]} tokens |
{result["encoding"]} |
{result["compression_ratio"]:.2f}x compression
</span>
</div>
<div class="token-display">
""")
# Add tokens with hover functionality
subword_count = 0
for i, token in enumerate(result["tokens"]):
token_text = token["text"]
display_text = (
token_text if token_text.strip() else "·"
) # Show space as dot
# Determine token class
token_class = f"token token-{token['type']}"
if token["is_subword"]:
token_class += " token-subword"
subword_count += 1
            # Escape for safe embedding in HTML attributes and text content
            # (quote-only escaping breaks on tokens containing &, < or >)
            escaped_text = html.escape(token_text, quote=True)
            escaped_display = html.escape(display_text, quote=True)
            html_parts.append(f"""
            <span class="{token_class}"
                  data-text="{escaped_text}"
                  data-id="{token["id"]}"
                  data-position="{i}"
                  title="Text: {escaped_text} | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
                  onmouseover="highlightToken(this.dataset.text)"
                  onmouseout="clearHighlights()">
                {escaped_display}
            </span>
            """)
html_parts.append(f"""
</div>
<div style="margin-top: 8px; font-size: 12px; color: #666;">
Subwords: {subword_count}/{len(result["tokens"])}
({subword_count / len(result["tokens"]) * 100:.1f}%)
</div>
</div>
""")
return "".join(html_parts)
def generate_token_ids_display(results):
"""Generate a clean display of token IDs for each tokenizer"""
if not results:
return "No token IDs to display."
output = []
output.append("## 🔢 Token IDs by Tokenizer")
for model, result in results.items():
if "error" in result:
output.append(f"\n### {result['model']} ❌")
output.append(f"Error: {result['error']}")
continue
output.append(f"\n### {result['model']}")
output.append(
f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
)
# Display token IDs in a readable format
token_ids = [str(token["id"]) for token in result["tokens"]]
# Group IDs for better readability (10 per line)
lines = []
for i in range(0, len(token_ids), 10):
line_ids = token_ids[i : i + 10]
lines.append(" ".join(line_ids))
output.append("```")
output.append("\n".join(lines))
output.append("```")
# Add some statistics
unique_ids = len(set(token_ids))
output.append(
f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
)
# Show ID ranges
id_values = [token["id"] for token in result["tokens"]]
if id_values:
output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")
return "\n".join(output)
def generate_detailed_analysis(results):
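    """Compare tokenizers side by side: common tokens, token-type counts, and subword ratios."""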
if not results or len(results) < 2:
return "Need at least 2 tokenizers for detailed analysis."
output = []
output.append("## 🔍 Detailed Analysis")
# Find common tokens
all_token_sets = []
for model, result in results.items():
if "error" not in result:
token_texts = {token["text"] for token in result["tokens"]}
all_token_sets.append(token_texts)
if all_token_sets:
common_tokens = set.intersection(*all_token_sets)
output.append(f"\n### Common Tokens ({len(common_tokens)})")
if common_tokens:
            common_display = [
                "`·`" if not token.strip() else f"`{token}`"
                for token in sorted(common_tokens)[:15]
            ]
output.append(" ".join(common_display))
else:
output.append("No common tokens found.")
# Token type distribution
output.append("\n### Token Type Distribution")
for model, result in results.items():
if "error" not in result:
type_counts = Counter(token["type"] for token in result["tokens"])
type_display = [f"{type_}: {count}" for type_, count in type_counts.items()]
output.append(f"**{result['model']}**: {', '.join(type_display)}")
# Subword analysis
output.append("\n### Subword Analysis")
for model, result in results.items():
if "error" not in result:
subwords = [token for token in result["tokens"] if token["is_subword"]]
subword_ratio = (
len(subwords) / len(result["tokens"]) * 100 if result["tokens"] else 0
)
output.append(
f"**{result['model']}**: {len(subwords)} subwords ({subword_ratio:.1f}%)"
)
return "\n".join(output)
def create_efficiency_chart(results):
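    """Bar chart of token counts per tokenizer; lower bars mean fewer tokens for the same text."""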
if not results:
return None
models = []
token_counts = []
compression_ratios = []
for model, result in results.items():
if "error" not in result:
models.append(result["model"])
token_counts.append(result["token_count"])
compression_ratios.append(result["compression_ratio"])
if not models:
return None
fig = go.Figure()
# Add token count bars
fig.add_trace(
go.Bar(
x=models,
y=token_counts,
name="Token Count",
marker_color="lightblue",
text=token_counts,
textposition="auto",
)
)
fig.update_layout(
title="Token Count Comparison (Lower = More Efficient)",
xaxis_title="Tokenizer",
yaxis_title="Number of Tokens",
template="plotly_white",
)
return fig
def create_token_distribution_chart(results):
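    """Bar chart of token-type counts per tokenizer, colored by token type."""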
if not results:
return None
all_data = []
for model, result in results.items():
if "error" not in result:
type_counts = Counter(token["type"] for token in result["tokens"])
for token_type, count in type_counts.items():
all_data.append(
{
"Tokenizer": result["model"],
"Token Type": token_type,
"Count": count,
}
)
if not all_data:
return None
df = pd.DataFrame(all_data)
fig = px.bar(
df,
x="Tokenizer",
y="Count",
color="Token Type",
title="Token Type Distribution by Tokenizer",
template="plotly_white",
)
return fig
# Custom CSS for better styling
css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.token-display {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
background: #f8f9fa;
padding: 8px;
border-radius: 4px;
font-size: 0.9em;
}
"""
# Create the Gradio interface
with gr.Blocks(
title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
gr.Markdown("""
# 🔤 Advanced Tokenizer Comparison Tool
Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
    **Legend**: tokens are color-coded by type (word, number, punctuation, whitespace, special, mixed), subwords get a dashed border, and spaces are shown as ·
""")
with gr.Row():
with gr.Column(scale=2):
text_input = gr.Textbox(
label="Text to tokenize",
placeholder="Enter your text here...",
lines=4,
value="Hello world! This is a test with some subwords and punctuation.",
)
with gr.Column(scale=1):
model_selector = gr.CheckboxGroup(
choices=[
"gpt-4",
"gpt-2",
"llama-2",
"llama-3",
"gemma-2",
"qwen3",
"qwen2.5",
"bert",
"bloom",
"aya-expanse",
"comma",
"roberta",
"distilbert",
"tokenmonster",
"byt5",
],
value=["gpt-4", "llama-3", "gpt-2"],
label="Select tokenizers to compare",
)
show_details = gr.Checkbox(label="Show detailed analysis", value=False)
with gr.Row():
with gr.Column():
efficiency_output = gr.Markdown(
label="Efficiency Ranking",
value="Enter text above to see efficiency comparison...",
)
with gr.Row():
with gr.Column():
tokenization_display = gr.HTML(
label="Interactive Tokenization (Hover to highlight across tokenizers)",
value="<p>Enter text above to see interactive tokenization...</p>",
)
with gr.Row():
with gr.Column():
token_ids_output = gr.Markdown(
label="Token IDs", value="Token IDs will appear here..."
)
with gr.Row():
with gr.Column():
detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)
with gr.Row():
with gr.Column():
efficiency_chart = gr.Plot(label="Efficiency Comparison")
with gr.Column():
distribution_chart = gr.Plot(label="Token Type Distribution")
# Update visibility of detailed analysis
def toggle_details(show_details):
return gr.update(visible=show_details)
show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)
    # Main comparison function: thin wrapper so the outputs list below maps 1:1
    # to the tuple returned by compare_tokenizers
    def update_comparison(text, models, details):
        return compare_tokenizers(text, models, details)
# Auto-update on changes
for component in [text_input, model_selector, show_details]:
component.change(
fn=update_comparison,
inputs=[text_input, model_selector, show_details],
outputs=[
efficiency_output,
tokenization_display,
token_ids_output,
detailed_output,
efficiency_chart,
distribution_chart,
],
)
gr.Markdown("""
---
### About the Models
- **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2**: Meta's model using SentencePiece
    - **LLaMA-3**: Meta's model using a tiktoken-style BPE tokenizer with a larger vocabulary
- **Gemma-2**: Google's model with SentencePiece
- **Qwen3/2.5**: Alibaba's models with BPE
- **BERT/DistilBERT**: Google's models with WordPiece
- **RoBERTa**: Facebook's model with BPE
- **BLOOM**: BigScience's multilingual model with BPE
- **Aya Expanse**: Cohere's multilingual model with SentencePiece
- **Comma (Common Pile)**: Common Pile's model with BPE
### Features
- **Efficiency Ranking**: Compare token counts across models
- **Subword Analysis**: See how models handle subwords
- **Token Types**: Classification of word/number/punctuation tokens
- **Visual Charts**: Interactive plots for comparison
- **Detailed Analysis**: Common tokens and distribution stats
""")
if __name__ == "__main__":
    demo.launch()