"""Advanced tokenizer comparison tool — a Gradio app for comparing LLM tokenizers."""
from collections import Counter

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import tokenize_with_hf, tokenize_with_tiktoken
def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize *text* with each selected model and build every UI output.

    Args:
        text: The input string to tokenize.
        selected_models: Model keys chosen in the UI checkbox group.
        show_details: When True, also build the detailed-analysis markdown.

    Returns:
        A 6-tuple of (efficiency markdown, tokenization HTML, token-ID
        markdown, detailed-analysis markdown, efficiency chart,
        token-type distribution chart).
    """
    if not text.strip():
        return "Please enter some text to tokenize.", "", "", "", None, None

    results = {}
    for model in selected_models:
        # OpenAI models go through tiktoken; everything else through HF tokenizers.
        if model in ("gpt-4", "gpt-2"):
            results[model] = tokenize_with_tiktoken(text, model)
        else:
            results[model] = tokenize_with_hf(text, model)

    # Generate outputs
    efficiency_output, tokenization_html, token_ids_output = (
        generate_basic_comparison(results)
    )
    detailed_output = generate_detailed_analysis(results) if show_details else ""
    efficiency_chart = create_efficiency_chart(results)
    token_distribution_chart = create_token_distribution_chart(results)

    return (
        efficiency_output,
        tokenization_html,
        token_ids_output,
        detailed_output,
        efficiency_chart,
        token_distribution_chart,
    )
def generate_basic_comparison(results):
    """Build the efficiency-ranking markdown plus the HTML/ID displays.

    Args:
        results: Mapping of model key -> tokenization result dict; a result
            that failed contains an ``"error"`` entry instead of counts.

    Returns:
        A 3-tuple of (ranking markdown, interactive tokenization HTML,
        token-ID markdown).
    """
    if not results:
        return "No results to display.", "", ""

    # Efficiency ranking. Errored results have no "token_count" key, so use
    # .get() with +inf to push them to the bottom instead of raising KeyError.
    sorted_models = sorted(
        results.items(),
        key=lambda item: item[1].get("token_count", float("inf")),
    )

    ranking_output = []
    ranking_output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
    for rank, (model, result) in enumerate(sorted_models, start=1):
        if "error" in result:
            ranking_output.append(
                f"{rank}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            ranking_output.append(
                f"{rank}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    # Generate interactive tokenization display
    tokenization_html = generate_interactive_tokenization(results)
    # Generate token ID tables
    token_ids_display = generate_token_ids_display(results)

    return "\n".join(ranking_output), tokenization_html, token_ids_display
def _escape_html(text):
    """Escape characters that would break HTML attributes or content.

    Ampersand must be replaced first so already-produced entities are not
    double-escaped.
    """
    return (
        text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
        .replace("'", "&#x27;")
    )


def generate_interactive_tokenization(results):
    """Generate HTML with hover highlighting across tokenizers.

    Args:
        results: Mapping of model key -> tokenization result dict. Failed
            results carry an ``"error"`` entry and are rendered as an error
            card instead of a token strip.

    Returns:
        One HTML string containing shared CSS/JS plus one container per
        tokenizer; hovering a token highlights identical token text in
        every tokenizer's strip.
    """
    if not results:
        return "<p>No tokenization results to display.</p>"

    html_parts = []
    # Shared stylesheet and highlight script, emitted once before the cards.
    html_parts.append("""
    <style>
    .tokenizer-container {
        margin-bottom: 20px;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        padding: 15px;
        background: white;
    }
    .tokenizer-header {
        font-weight: bold;
        font-size: 18px;
        margin-bottom: 10px;
        color: #2c3e50;
    }
    .token-display {
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
        line-height: 1.8;
        word-wrap: break-word;
    }
    .token {
        display: inline-block;
        margin: 2px;
        padding: 4px 8px;
        border-radius: 4px;
        border: 1px solid;
        cursor: pointer;
        transition: all 0.2s ease;
        position: relative;
        font-size: 14px;
    }
    .token:hover {
        transform: scale(1.1);
        z-index: 10;
        box-shadow: 0 2px 8px rgba(0,0,0,0.2);
    }
    .token.highlighted {
        background: #ff6b6b !important;
        border-color: #e55353 !important;
        color: white !important;
        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
    }
    .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
    .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
    .token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
    .token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
    .token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
    .token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
    .token-subword {
        background: #fff8e1 !important;
        border-color: #ffc107 !important;
        border-style: dashed !important;
    }
    .token-stats {
        display: inline-block;
        margin-left: 10px;
        padding: 2px 6px;
        background: #f8f9fa;
        border-radius: 3px;
        font-size: 12px;
        color: #666;
    }
    </style>
    <script>
    function highlightToken(text, allTokenizers) {
        // Remove existing highlights
        document.querySelectorAll('.token').forEach(token => {
            token.classList.remove('highlighted');
        });
        // Highlight tokens with same text across all tokenizers
        document.querySelectorAll('.token').forEach(token => {
            if (token.dataset.text === text) {
                token.classList.add('highlighted');
            }
        });
    }
    function clearHighlights() {
        document.querySelectorAll('.token').forEach(token => {
            token.classList.remove('highlighted');
        });
    }
    </script>
    """)

    for model, result in results.items():
        if "error" in result:
            html_parts.append(f"""
            <div class="tokenizer-container">
                <div class="tokenizer-header">{result["model"]} ❌</div>
                <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
            </div>
            """)
            continue

        html_parts.append(f"""
        <div class="tokenizer-container">
            <div class="tokenizer-header">
                {result["model"]}
                <span class="token-stats">
                    {result["token_count"]} tokens |
                    {result["encoding"]} |
                    {result["compression_ratio"]:.2f}x compression
                </span>
            </div>
            <div class="token-display">
        """)

        # Add tokens with hover functionality
        subword_count = 0
        for i, token in enumerate(result["tokens"]):
            token_text = token["text"]
            display_text = (
                token_text if token_text.strip() else "·"
            )  # Show space as dot

            # Determine token class
            token_class = f"token token-{token['type']}"
            if token["is_subword"]:
                token_class += " token-subword"
                subword_count += 1

            # Escape for HTML — the escaped form is used everywhere the text
            # lands inside an attribute (data-text, title, onmouseover arg)
            # so quotes in a token cannot break out of the attribute.
            escaped_text = _escape_html(token_text)
            escaped_display = _escape_html(display_text)

            html_parts.append(f"""
            <span class="{token_class}"
                  data-text="{escaped_text}"
                  data-id="{token["id"]}"
                  data-position="{i}"
                  title="Text: '{escaped_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
                  onmouseover="highlightToken('{escaped_text}', true)"
                  onmouseout="clearHighlights()">
                {escaped_display}
            </span>
            """)

        total_tokens = len(result["tokens"])
        # Guard: an empty token list must not divide by zero.
        subword_pct = (subword_count / total_tokens * 100) if total_tokens else 0.0
        html_parts.append(f"""
            </div>
            <div style="margin-top: 8px; font-size: 12px; color: #666;">
                Subwords: {subword_count}/{total_tokens}
                ({subword_pct:.1f}%)
            </div>
        </div>
        """)

    return "".join(html_parts)
def generate_token_ids_display(results):
    """Generate a clean markdown display of token IDs for each tokenizer.

    Args:
        results: Mapping of model key -> tokenization result dict; errored
            results (containing ``"error"``) get a short error section.

    Returns:
        A markdown string with one section per tokenizer: vocab/encoding
        info, the IDs in fenced code blocks (10 per line), and basic stats.
    """
    if not results:
        return "No token IDs to display."

    output = []
    output.append("## 🔢 Token IDs by Tokenizer")
    for model, result in results.items():
        if "error" in result:
            output.append(f"\n### {result['model']} ❌")
            output.append(f"Error: {result['error']}")
            continue

        output.append(f"\n### {result['model']}")
        output.append(
            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
        )

        # Display token IDs in a readable format
        token_ids = [str(token["id"]) for token in result["tokens"]]

        # Group IDs for better readability (10 per line)
        lines = []
        for i in range(0, len(token_ids), 10):
            lines.append(" ".join(token_ids[i : i + 10]))

        output.append("```")
        output.append("\n".join(lines))
        output.append("```")

        # Add some statistics
        unique_ids = len(set(token_ids))
        output.append(
            f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
        )

        # Show ID ranges
        id_values = [token["id"] for token in result["tokens"]]
        if id_values:
            output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")

    return "\n".join(output)
def generate_detailed_analysis(results):
    """Build the detailed-analysis markdown comparing several tokenizers.

    Covers tokens common to every successful tokenizer, token-type
    distribution per tokenizer, and per-tokenizer subword ratios.

    Args:
        results: Mapping of model key -> tokenization result dict; results
            containing ``"error"`` are skipped.

    Returns:
        A markdown string, or an explanatory message when fewer than two
        tokenizers were supplied.
    """
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    output = []
    output.append("## 🔍 Detailed Analysis")

    # Find common tokens
    all_token_sets = []
    for model, result in results.items():
        if "error" not in result:
            token_texts = {token["text"] for token in result["tokens"]}
            all_token_sets.append(token_texts)

    if all_token_sets:
        common_tokens = set.intersection(*all_token_sets)
        output.append(f"\n### Common Tokens ({len(common_tokens)})")
        if common_tokens:
            # Show a bare space as a visible dot; cap the list at 15 entries.
            common_display = [
                f"`{token}`" if token != " " else "`·`"
                for token in list(common_tokens)[:15]
            ]
            output.append(" ".join(common_display))
        else:
            output.append("No common tokens found.")

    # Token type distribution
    output.append("\n### Token Type Distribution")
    for model, result in results.items():
        if "error" not in result:
            type_counts = Counter(token["type"] for token in result["tokens"])
            type_display = [f"{type_}: {count}" for type_, count in type_counts.items()]
            output.append(f"**{result['model']}**: {', '.join(type_display)}")

    # Subword analysis
    output.append("\n### Subword Analysis")
    for model, result in results.items():
        if "error" not in result:
            subwords = [token for token in result["tokens"] if token["is_subword"]]
            # Guard against empty token lists to avoid ZeroDivisionError.
            subword_ratio = (
                len(subwords) / len(result["tokens"]) * 100 if result["tokens"] else 0
            )
            output.append(
                f"**{result['model']}**: {len(subwords)} subwords ({subword_ratio:.1f}%)"
            )

    return "\n".join(output)
def create_efficiency_chart(results):
    """Build a bar chart of token counts per tokenizer (lower = better).

    Args:
        results: Mapping of model key -> tokenization result dict; errored
            results are skipped.

    Returns:
        A ``plotly.graph_objects.Figure``, or ``None`` when there is
        nothing to plot.
    """
    if not results:
        return None

    models = []
    token_counts = []
    compression_ratios = []
    for model, result in results.items():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])
            compression_ratios.append(result["compression_ratio"])

    if not models:
        return None

    fig = go.Figure()
    # Add token count bars
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig
def create_token_distribution_chart(results):
    """Build a stacked bar chart of token-type counts per tokenizer.

    Args:
        results: Mapping of model key -> tokenization result dict; errored
            results are skipped.

    Returns:
        A ``plotly.express`` bar figure, or ``None`` when there is nothing
        to plot.
    """
    if not results:
        return None

    all_data = []
    for model, result in results.items():
        if "error" not in result:
            type_counts = Counter(token["type"] for token in result["tokens"])
            for token_type, count in type_counts.items():
                all_data.append(
                    {
                        "Tokenizer": result["model"],
                        "Token Type": token_type,
                        "Count": count,
                    }
                )

    if not all_data:
        return None

    df = pd.DataFrame(all_data)
    fig = px.bar(
        df,
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
    return fig
# Custom CSS applied to the whole Gradio app (fonts + token display styling).
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.token-display {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    background: #f8f9fa;
    padding: 8px;
    border-radius: 4px;
    font-size: 0.9em;
}
"""
# Create the Gradio interface
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
    # 🔤 Advanced Tokenizer Comparison Tool

    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.

    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
    """)

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
        with gr.Column(scale=1):
            model_selector = gr.CheckboxGroup(
                choices=[
                    "gpt-4",
                    "gpt-2",
                    "llama-2",
                    "llama-3",
                    "gemma-2",
                    "qwen3",
                    "qwen2.5",
                    "bert",
                    "bloom",
                    "aya-expanse",
                    "comma",
                    "roberta",
                    "distilbert",
                    "tokenmonster",
                    "byt5",
                ],
                value=["gpt-4", "llama-3", "gpt-2"],
                label="Select tokenizers to compare",
            )
            show_details = gr.Checkbox(label="Show detailed analysis", value=False)

    with gr.Row():
        with gr.Column():
            efficiency_output = gr.Markdown(
                label="Efficiency Ranking",
                value="Enter text above to see efficiency comparison...",
            )

    with gr.Row():
        with gr.Column():
            tokenization_display = gr.HTML(
                label="Interactive Tokenization (Hover to highlight across tokenizers)",
                value="<p>Enter text above to see interactive tokenization...</p>",
            )

    with gr.Row():
        with gr.Column():
            token_ids_output = gr.Markdown(
                label="Token IDs", value="Token IDs will appear here..."
            )

    with gr.Row():
        with gr.Column():
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)

    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")

    # Show/hide the detailed-analysis panel when the checkbox changes.
    def toggle_details(show_details):
        return gr.update(visible=show_details)

    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)

    # Auto-update all outputs whenever any input changes. compare_tokenizers
    # already has the exact (text, models, details) signature, so it is wired
    # directly — no pass-through wrapper needed.
    for component in [text_input, model_selector, show_details]:
        component.change(
            fn=compare_tokenizers,
            inputs=[text_input, model_selector, show_details],
            outputs=[
                efficiency_output,
                tokenization_display,
                token_ids_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )

    gr.Markdown("""
    ---
    ### About the Models
    - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2/3**: Meta's models using SentencePiece
    - **Gemma-2**: Google's model with SentencePiece
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT/DistilBERT**: Google's models with WordPiece
    - **RoBERTa**: Facebook's model with BPE
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma (Common Pile)**: Common Pile's model with BPE

    ### Features
    - **Efficiency Ranking**: Compare token counts across models
    - **Subword Analysis**: See how models handle subwords
    - **Token Types**: Classification of word/number/punctuation tokens
    - **Visual Charts**: Interactive plots for comparison
    - **Detailed Analysis**: Common tokens and distribution stats
    """)
if __name__ == "__main__":
    # Launch the app exactly once; the source had demo.launch() repeated
    # three times, which would block on the first call and never reach
    # (or redundantly re-invoke) the rest.
    demo.launch()