Gül Sena Altıntaş
Added support for showing newlines
d9779a0
raw
history blame
26.2 kB
from collections import Counter
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from utils import (
get_normalization_methods,
normalize_text,
tokenize_with_hf,
tokenize_with_tiktoken,
)
def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize *text* with every selected model and build all UI outputs.

    Returns a 6-tuple: (efficiency markdown, interactive tokenization HTML,
    token-id markdown, detailed-analysis markdown, efficiency chart figure,
    token-type distribution chart figure).
    """
    if not text.strip():
        return "Please enter some text to tokenize.", "", "", "", None, None

    # OpenAI models go through tiktoken; everything else through HuggingFace.
    results = {
        model: (
            tokenize_with_tiktoken(text, model)
            if model in ("gpt-4", "gpt-2")
            else tokenize_with_hf(text, model)
        )
        for model in selected_models
    }

    ranking_md, token_html, ids_md = generate_basic_comparison(results)
    details_md = generate_detailed_analysis(results) if show_details else ""

    return (
        ranking_md,
        token_html,
        ids_md,
        details_md,
        create_efficiency_chart(results),
        create_token_distribution_chart(results),
    )
def generate_basic_comparison(results):
    """Build the efficiency-ranking markdown plus the HTML and token-id views.

    Returns a 3-tuple: (ranking markdown, interactive tokenization HTML,
    token-id display markdown).
    """
    if not results:
        return "No results to display.", "", ""

    # Rank tokenizers by token count, fewest (most efficient) first.
    by_count = sorted(results.items(), key=lambda item: item[1]["token_count"])

    lines = ["## 🏆 Efficiency Ranking (Fewer tokens = more efficient)"]
    for rank, (model, result) in enumerate(by_count, start=1):
        if "error" in result:
            lines.append(
                f"{rank}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            lines.append(
                f"{rank}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    return (
        "\n".join(lines),
        generate_interactive_tokenization(results),
        generate_token_ids_display(results),
    )
def generate_interactive_tokenization(results):
    """Generate HTML with working hover highlighting across tokenizers"""
    if not results:
        return "<p>No tokenization results to display.</p>"
    html_parts = []
    # Add styles first
    # One shared <style>/<script> block is emitted before the per-tokenizer
    # sections; the inline on* handlers on each token span call these helpers.
    html_parts.append("""
<div id="tokenizer-container">
<style>
.tokenizer-section {
margin-bottom: 20px;
border: 1px solid #e0e0e0;
border-radius: 8px;
padding: 15px;
background: white;
}
.tokenizer-header {
font-weight: bold;
font-size: 18px;
margin-bottom: 10px;
color: #2c3e50;
}
.token-display {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
line-height: 1.8;
word-wrap: break-word;
}
.token {
display: inline-block;
margin: 2px;
padding: 4px 8px;
border-radius: 4px;
border: 1px solid;
cursor: pointer;
transition: all 0.2s ease;
position: relative;
font-size: 14px;
user-select: none;
}
.token:hover {
transform: scale(1.05);
z-index: 10;
box-shadow: 0 2px 8px rgba(0,0,0,0.2);
}
.token.highlighted {
background: #ff6b6b !important;
border-color: #e55353 !important;
color: white !important;
box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
transform: scale(1.1) !important;
z-index: 100 !important;
}
.token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
.token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
.token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
.token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
.token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
.token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
.token-subword {
background: #fff8e1 !important;
border-color: #ffc107 !important;
border-style: dashed !important;
}
.token-stats {
display: inline-block;
margin-left: 10px;
padding: 2px 6px;
background: #f8f9fa;
border-radius: 3px;
font-size: 12px;
color: #666;
}
.highlight-info {
position: fixed;
top: 10px;
right: 10px;
background: #333;
color: white;
padding: 8px 12px;
border-radius: 4px;
font-size: 12px;
display: none;
z-index: 1000;
}
</style>
<div class="highlight-info" id="highlight-info"></div>
<script>
function highlightTokens(targetText) {
// Clear all highlights
document.querySelectorAll('.token').forEach(function(token) {
token.classList.remove('highlighted');
});
// Highlight matching tokens
let count = 0;
document.querySelectorAll('.token').forEach(function(token) {
if (token.getAttribute('data-text') === targetText) {
token.classList.add('highlighted');
count++;
}
});
// Show info
const info = document.getElementById('highlight-info');
if (info) {
const displayText = targetText === ' ' ? '(space)' : targetText;
info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
info.style.display = 'block';
}
}
function clearHighlights() {
document.querySelectorAll('.token').forEach(function(token) {
token.classList.remove('highlighted');
});
const info = document.getElementById('highlight-info');
if (info) {
info.style.display = 'none';
}
}
</script>
""")
    # Generate tokenizer sections with inline event handlers
    for model, result in results.items():
        # Failed tokenizers get an error card instead of a token display.
        if "error" in result:
            html_parts.append(f"""
<div class="tokenizer-section">
<div class="tokenizer-header">{result["model"]} ❌</div>
<div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
</div>
""")
            continue
        # Section header with per-tokenizer stats.
        html_parts.append(f"""
<div class="tokenizer-section">
<div class="tokenizer-header">
{result["model"]}
<span class="token-stats">
{result["token_count"]} tokens |
{result["encoding"]} |
{result["compression_ratio"]:.2f}x compression
</span>
</div>
<div class="token-display">
""")
        # Add tokens with inline event handlers
        subword_count = 0
        for i, token in enumerate(result["tokens"]):
            token_text = token["text"]
            # Whitespace-only tokens are rendered as a visible middle dot.
            # NOTE(review): display_text is computed before the <newline>
            # short-circuit below, so it is dead work for newline tokens.
            display_text = token_text if token_text.strip() else "·"
            # Sentinel emitted by the tokenizer wrappers to show line breaks
            # (see "Added support for showing newlines" in the file header).
            if token_text == "<newline>":
                html_parts.append("<br>")
                continue
            # Determine token class
            token_class = f"token token-{token['type']}"
            if token["is_subword"]:
                token_class += " token-subword"
                subword_count += 1
            # Create unique identifier for this token occurrence
            token_id = f"token_{model}_{i}"
            # Escape text for HTML and JavaScript - be very careful with quotes
            # (backslashes first, then quotes, then control characters).
            escaped_text = (
                token_text.replace("\\", "\\\\")
                .replace("'", "\\'")
                .replace('"', '\\"')
                .replace("\r", "\\r")
                .replace("\n", "\\n")
            )
            escaped_display = (
                display_text.replace('"', "&quot;")
                .replace("'", "&#39;")
                .replace("\r", "\n")
            )
            # Use inline event handlers that work in Gradio
            # NOTE(review): the title attribute interpolates raw token_text;
            # a token containing '"' would break the attribute — consider
            # reusing the data-text escaping here.
            html_parts.append(f"""<span class="{token_class}"
id="{token_id}"
data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
data-id="{token["id"]}"
data-position="{i}"
data-model="{model}"
title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
onmouseover="highlightTokens('{escaped_text}')"
onmouseout="clearHighlights()"
onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
        # Per-section subword summary.
        # NOTE(review): divides by len(result["tokens"]) — raises
        # ZeroDivisionError if a tokenizer returns an empty token list;
        # confirm upstream guarantees at least one token.
        html_parts.append(f"""
</div>
<div style="margin-top: 8px; font-size: 12px; color: #666;">
Subwords: {subword_count}/{len(result["tokens"])}
({subword_count / len(result["tokens"]) * 100:.1f}%)
</div>
</div>
""")
    html_parts.append("</div>")
    return "".join(html_parts)
def generate_token_ids_display(results):
    """Generate a clean display of token IDs for each tokenizer"""
    if not results:
        return "No token IDs to display."

    parts = ["## 🔢 Token IDs by Tokenizer"]
    for model, result in results.items():
        if "error" in result:
            parts.append(f"\n### {result['model']} ❌")
            parts.append(f"Error: {result['error']}")
            continue

        parts.append(f"\n### {result['model']}")
        parts.append(
            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
        )

        ids = [str(tok["id"]) for tok in result["tokens"]]
        # Ten IDs per line inside a fenced code block for readability.
        grouped = [
            " ".join(ids[start : start + 10]) for start in range(0, len(ids), 10)
        ]
        parts.append("```")
        parts.append("\n".join(grouped))
        parts.append("```")

        # Quick duplicate/uniqueness summary.
        parts.append(
            f"**Stats**: {len(ids)} total tokens, {len(set(ids))} unique IDs"
        )

    return "\n".join(parts)
def compare_with_normalization(
    text, selected_models, normalization_method, show_details=False
):
    """Compare tokenizers with optional normalization.

    Tokenizes *text* with every selected model and, when
    ``normalization_method`` is not ``"none"``, also tokenizes the
    normalized form of the text.

    Args:
        text: raw input text to tokenize.
        selected_models: model keys; "gpt-4"/"gpt-2" use tiktoken,
            everything else goes through the HuggingFace wrapper.
        normalization_method: key understood by utils.normalize_text;
            "none" skips the normalized pass.
        show_details: accepted for signature compatibility with the other
            comparison entry points; not used here.

    Returns:
        (original_results, normalized_results, normalized_text) — the
        second dict is empty when normalization is "none".
    """
    # Fixes: removed a leftover debug print of the input/normalized text,
    # and deduplicated the tiktoken-vs-HF branch that was repeated for the
    # original and normalized passes.
    normalized_text = normalize_text(text, normalization_method)
    original_results = {}
    normalized_results = {}
    for model in selected_models:
        # Pick the backend once per model instead of per pass.
        tokenize = (
            tokenize_with_tiktoken
            if model in ("gpt-4", "gpt-2")
            else tokenize_with_hf
        )
        original_results[model] = tokenize(text, model)
        if normalization_method != "none":
            normalized_results[model] = tokenize(normalized_text, model)
    return original_results, normalized_results, normalized_text
def generate_detailed_analysis(results):
    """Produce markdown covering shared tokens, token-type counts, and
    subword ratios across all successfully tokenized models."""
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    lines = ["## 🔍 Detailed Analysis"]

    # Intersection of token texts across every tokenizer that succeeded.
    token_sets = [
        {tok["text"] for tok in res["tokens"]}
        for res in results.values()
        if "error" not in res
    ]
    if token_sets:
        shared = set.intersection(*token_sets)
        lines.append(f"\n### Common Tokens ({len(shared)})")
        if shared:
            # Show at most 15; render a bare space as a visible dot.
            shown = [
                "`·`" if tok == " " else f"`{tok}`" for tok in list(shared)[:15]
            ]
            lines.append(" ".join(shown))
        else:
            lines.append("No common tokens found.")

    # Per-model breakdown of token types (word/number/punctuation/...).
    lines.append("\n### Token Type Distribution")
    for res in results.values():
        if "error" in res:
            continue
        counts = Counter(tok["type"] for tok in res["tokens"])
        summary = ", ".join(f"{kind}: {n}" for kind, n in counts.items())
        lines.append(f"**{res['model']}**: {summary}")

    # Per-model subword counts and percentage.
    lines.append("\n### Subword Analysis")
    for res in results.values():
        if "error" in res:
            continue
        n_sub = sum(1 for tok in res["tokens"] if tok["is_subword"])
        pct = n_sub / len(res["tokens"]) * 100 if res["tokens"] else 0
        lines.append(f"**{res['model']}**: {n_sub} subwords ({pct:.1f}%)")

    return "\n".join(lines)
def create_efficiency_chart(results):
    """Build a bar chart comparing token counts across tokenizers.

    Args:
        results: mapping of model key -> tokenization result dict; entries
            containing an "error" key are skipped.

    Returns:
        A plotly Figure, or None when there is nothing to plot.
    """
    # Fix: dropped the `compression_ratios` list the original collected but
    # never used (dead code — only token counts are plotted).
    if not results:
        return None

    models = []
    token_counts = []
    for result in results.values():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])

    if not models:
        return None

    fig = go.Figure()
    # Single bar trace, one bar per tokenizer, labeled with its count.
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig
def create_token_distribution_chart(results):
    """Stacked bar chart of token-type counts per tokenizer, or None when
    there is no plottable data."""
    if not results:
        return None

    # One row per (tokenizer, token type) pair.
    rows = []
    for result in results.values():
        if "error" in result:
            continue
        type_counts = Counter(tok["type"] for tok in result["tokens"])
        for token_type, count in type_counts.items():
            rows.append(
                {
                    "Tokenizer": result["model"],
                    "Token Type": token_type,
                    "Count": count,
                }
            )

    if not rows:
        return None

    frame = pd.DataFrame(rows)
    return px.bar(
        frame,
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
# Custom CSS for better styling
# Injected into gr.Blocks below: sets the app font and a monospace style
# for token displays (complements the inline CSS emitted by
# generate_interactive_tokenization).
css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.token-display {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
background: #f8f9fa;
padding: 8px;
border-radius: 4px;
font-size: 0.9em;
}
"""
# Create the Gradio interface
# Layout: intro markdown, then a row with (sample picker + text input) on the
# left and (model / normalization controls) on the right, followed by the
# result panels and two charts.
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
# 🔤 Advanced Tokenizer Comparison Tool
Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
**Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
💡 **Try the sample texts** to see how tokenizers handle different challenges like:
- Mixed languages and scripts
- Programming code and JSON
- Long compound words
- Special characters and emojis
- Technical terminology
""")
    with gr.Row():
        with gr.Column(scale=2):
            # Sample texts dropdown
            # Entries are "label: text"; update_text_from_sample strips the
            # label before filling the text box.
            sample_texts = gr.Dropdown(
                choices=[
                    "Custom text (enter below)",
                    "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
                    "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
                    "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
                    "turkish: Hızlı kahverengi tilki tembel köpeğin üstunden atlar. 1234.56'dır ve 789$ tutar.",
                    "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56,价格为789美元。",
                    "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
                    "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
                    "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
                    "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
                    "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
                    "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
                    "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
                    "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
                    "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
                    "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
                    "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
                    'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
                    "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
                ],
                value="Custom text (enter below)",
                label="Choose a sample text or enter your own",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here or select a sample above...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Models"):
                    # Model keys; "gpt-4"/"gpt-2" are routed to tiktoken,
                    # the rest to the HuggingFace tokenizer wrapper.
                    model_selector = gr.CheckboxGroup(
                        choices=[
                            "gpt-4",
                            "gpt-2",
                            "llama-2",
                            "llama-3",
                            "gemma-2",
                            "qwen3",
                            "qwen2.5",
                            "bert",
                            "bloom",
                            "aya-expanse",
                            "comma",
                            "tokenmonster",
                            "byt5",
                        ],
                        value=["gpt-4", "llama-3", "gpt-2"],
                        label="Select tokenizers to compare",
                    )
                    show_details = gr.Checkbox(
                        label="Show detailed analysis", value=False
                    )
                with gr.TabItem("Normalization"):
                    # Choices come from utils.get_normalization_methods();
                    # assumes each entry is a (value, ...) pair — TODO confirm.
                    normalization_method = gr.Dropdown(
                        choices=[method[0] for method in get_normalization_methods()],
                        value="none",
                        label="Normalization Method",
                    )
                    show_normalization = gr.Checkbox(
                        label="Show normalized results", value=False
                    )
    with gr.Row():
        with gr.Column():
            efficiency_output = gr.Markdown(
                label="Efficiency Ranking",
                value="Enter text above to see efficiency comparison...",
            )
    with gr.Row():
        with gr.Column():
            tokenization_display = gr.HTML(
                label="Interactive Tokenization (Hover to highlight across tokenizers)",
                value="<p>Enter text above to see interactive tokenization...</p>",
            )
    with gr.Row():
        with gr.Column():
            # Hidden by default; intended for the normalized view.
            normalized_display = gr.HTML(
                label="Normalized Tokenization",
                value="<p>Enable normalization to see results...</p>",
                visible=False,
            )
    with gr.Row():
        with gr.Column():
            token_ids_output = gr.Markdown(
                label="Token IDs", value="Token IDs will appear here..."
            )
    with gr.Row():
        with gr.Column():
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)
    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")
# Function to update text input when sample is selected
def update_text_from_sample(sample_choice):
if sample_choice == "Custom text (enter below)":
return gr.update() # Don't change the text input
else:
# Extract the text after the colon
sample_text = (
sample_choice.split(": ", 1)[1]
if ": " in sample_choice
else sample_choice
)
return gr.update(value=sample_text)
# Update text input when sample is selected
sample_texts.change(
fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
)
# Main comparison function
def update_comparison_with_norm(text, models, details, norm_method, show_norm):
    """Run the tokenizer comparison, optionally alongside normalized text.

    Args:
        text: input text from the textbox.
        models: selected model keys.
        details: whether to build the detailed analysis.
        norm_method: normalization method key ("none" disables it).
        show_norm: whether the normalized view is enabled.

    Returns:
        The 6-tuple expected by the output components.
    """
    # Bug fix: the original tested `normalization_method == "none"`, which
    # referenced the enclosing-scope Gradio Dropdown *component* (never equal
    # to a string), not the `norm_method` argument — so normalization was
    # gated solely by `show_norm`. Compare the string parameter instead.
    if norm_method == "none" or not show_norm:
        # Plain comparison path — returns the full 6-tuple directly.
        return compare_tokenizers(text, models, details)

    # Normalization path: tokenize both the original and normalized text.
    original_results, normalized_results, normalized_text = (
        compare_with_normalization(text, models, norm_method, details)
    )
    orig_eff, orig_html, orig_ids = generate_basic_comparison(original_results)
    _, norm_html, _ = generate_basic_comparison(normalized_results)

    # Show normalized output first, then the original, in one HTML panel.
    combined_html = (
        f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}"
        f"\n<h2>Original</h2>{orig_html}"
    )
    return (
        orig_eff,
        gr.update(value=combined_html, visible=True),
        orig_ids,
        "",
        None,
        None,
    )
def update_comparison(text, models, details):
    """Thin wrapper passing the inputs straight to compare_tokenizers."""
    return compare_tokenizers(text, models, details)
# Auto-update on changes
# Every input control re-runs the same handler; its 6-tuple return maps 1:1
# onto the six output components listed below.
for component in [
    text_input,
    model_selector,
    show_details,
    normalization_method,
    show_normalization,
]:
    component.change(
        fn=update_comparison_with_norm,
        inputs=[
            text_input,
            model_selector,
            show_details,
            normalization_method,
            show_normalization,
        ],
        outputs=[
            efficiency_output,
            tokenization_display,
            token_ids_output,
            detailed_output,
            efficiency_chart,
            distribution_chart,
        ],
    )
gr.Markdown("""
---
### About the Models
- **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
- **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
- **Gemma-2**: Google's model with SentencePiece (though HuggingFace uses BPE)
- **Qwen3/2.5**: Alibaba's models with BPE
- **BERT/DistilBERT**: Google's models with WordPiece
- **BLOOM**: BigScience's multilingual model with BPE
- **Aya Expanse**: Cohere's multilingual model with SentencePiece
- **Comma (Common Pile)**: Common Pile's model with BPE
- **Byt5**: Google's byte-level model
### Features
- **Efficiency Ranking**: Compare token counts across models
- **Subword Analysis**: See how models handle subwords
- **Token Types**: Classification of word/number/punctuation tokens
- **Visual Charts**: Interactive plots for comparison
- **Detailed Analysis**: Common tokens and distribution stats
""")
if __name__ == "__main__":
demo.launch()