Gül Sena Altıntaş
Added support for showing newlines
d9779a0
raw
history blame
26.2 kB
from collections import Counter
import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from utils import (
get_normalization_methods,
normalize_text,
tokenize_with_hf,
tokenize_with_tiktoken,
)
def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize *text* with every selected model and build all UI outputs.

    Returns a 6-tuple: (efficiency markdown, interactive tokenization HTML,
    token-id markdown, detailed-analysis markdown, efficiency chart figure,
    token-type distribution chart figure).
    """
    if not text.strip():
        return "Please enter some text to tokenize.", "", "", "", None, None

    # OpenAI models go through tiktoken; everything else through HuggingFace.
    results = {
        model: (
            tokenize_with_tiktoken(text, model)
            if model in ("gpt-4", "gpt-2")
            else tokenize_with_hf(text, model)
        )
        for model in selected_models
    }

    ranking_md, token_html, ids_md = generate_basic_comparison(results)
    details_md = generate_detailed_analysis(results) if show_details else ""

    return (
        ranking_md,
        token_html,
        ids_md,
        details_md,
        create_efficiency_chart(results),
        create_token_distribution_chart(results),
    )
def generate_basic_comparison(results):
    """Build the efficiency-ranking markdown plus the HTML and token-id views.

    Returns a 3-tuple: (ranking markdown, interactive tokenization HTML,
    token-id display markdown).
    """
    if not results:
        return "No results to display.", "", ""

    # Rank tokenizers by token count, fewest (most efficient) first.
    by_count = sorted(results.items(), key=lambda item: item[1]["token_count"])

    lines = ["## 🏆 Efficiency Ranking (Fewer tokens = more efficient)"]
    for rank, (model, result) in enumerate(by_count, start=1):
        if "error" in result:
            lines.append(
                f"{rank}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            lines.append(
                f"{rank}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    return (
        "\n".join(lines),
        generate_interactive_tokenization(results),
        generate_token_ids_display(results),
    )
def generate_interactive_tokenization(results):
    """Generate HTML with working hover highlighting across tokenizers"""
    if not results:
        return "<p>No tokenization results to display.</p>"
    html_parts = []
    # Add styles first
    # One shared <style>/<script> block is emitted before the per-tokenizer
    # sections; the inline on* handlers on each token span call these helpers.
    html_parts.append("""
<div id="tokenizer-container">
<style>
.tokenizer-section {
margin-bottom: 20px;
border: 1px solid #e0e0e0;
border-radius: 8px;
padding: 15px;
background: white;
}
.tokenizer-header {
font-weight: bold;
font-size: 18px;
margin-bottom: 10px;
color: #2c3e50;
}
.token-display {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
line-height: 1.8;
word-wrap: break-word;
}
.token {
display: inline-block;
margin: 2px;
padding: 4px 8px;
border-radius: 4px;
border: 1px solid;
cursor: pointer;
transition: all 0.2s ease;
position: relative;
font-size: 14px;
user-select: none;
}
.token:hover {
transform: scale(1.05);
z-index: 10;
box-shadow: 0 2px 8px rgba(0,0,0,0.2);
}
.token.highlighted {
background: #ff6b6b !important;
border-color: #e55353 !important;
color: white !important;
box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
transform: scale(1.1) !important;
z-index: 100 !important;
}
.token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
.token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
.token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
.token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
.token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
.token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
.token-subword {
background: #fff8e1 !important;
border-color: #ffc107 !important;
border-style: dashed !important;
}
.token-stats {
display: inline-block;
margin-left: 10px;
padding: 2px 6px;
background: #f8f9fa;
border-radius: 3px;
font-size: 12px;
color: #666;
}
.highlight-info {
position: fixed;
top: 10px;
right: 10px;
background: #333;
color: white;
padding: 8px 12px;
border-radius: 4px;
font-size: 12px;
display: none;
z-index: 1000;
}
</style>
<div class="highlight-info" id="highlight-info"></div>
<script>
function highlightTokens(targetText) {
// Clear all highlights
document.querySelectorAll('.token').forEach(function(token) {
token.classList.remove('highlighted');
});
// Highlight matching tokens
let count = 0;
document.querySelectorAll('.token').forEach(function(token) {
if (token.getAttribute('data-text') === targetText) {
token.classList.add('highlighted');
count++;
}
});
// Show info
const info = document.getElementById('highlight-info');
if (info) {
const displayText = targetText === ' ' ? '(space)' : targetText;
info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
info.style.display = 'block';
}
}
function clearHighlights() {
document.querySelectorAll('.token').forEach(function(token) {
token.classList.remove('highlighted');
});
const info = document.getElementById('highlight-info');
if (info) {
info.style.display = 'none';
}
}
</script>
""")
    # Generate tokenizer sections with inline event handlers
    for model, result in results.items():
        # Failed tokenizers get an error card instead of a token display.
        if "error" in result:
            html_parts.append(f"""
<div class="tokenizer-section">
<div class="tokenizer-header">{result["model"]} ❌</div>
<div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
</div>
""")
            continue
        # Section header with per-tokenizer stats.
        html_parts.append(f"""
<div class="tokenizer-section">
<div class="tokenizer-header">
{result["model"]}
<span class="token-stats">
{result["token_count"]} tokens |
{result["encoding"]} |
{result["compression_ratio"]:.2f}x compression
</span>
</div>
<div class="token-display">
""")
        # Add tokens with inline event handlers
        subword_count = 0
        for i, token in enumerate(result["tokens"]):
            token_text = token["text"]
            # Whitespace-only tokens are rendered as a visible middle dot.
            # NOTE(review): display_text is computed before the <newline>
            # short-circuit below, so it is dead work for newline tokens.
            display_text = token_text if token_text.strip() else "·"
            # Sentinel emitted by the tokenizer wrappers to show line breaks
            # (see "Added support for showing newlines" in the file header).
            if token_text == "<newline>":
                html_parts.append("<br>")
                continue
            # Determine token class
            token_class = f"token token-{token['type']}"
            if token["is_subword"]:
                token_class += " token-subword"
                subword_count += 1
            # Create unique identifier for this token occurrence
            token_id = f"token_{model}_{i}"
            # Escape text for HTML and JavaScript - be very careful with quotes
            # (backslashes first, then quotes, then control characters).
            escaped_text = (
                token_text.replace("\\", "\\\\")
                .replace("'", "\\'")
                .replace('"', '\\"')
                .replace("\r", "\\r")
                .replace("\n", "\\n")
            )
            escaped_display = (
                display_text.replace('"', "&quot;")
                .replace("'", "&#39;")
                .replace("\r", "\n")
            )
            # Use inline event handlers that work in Gradio
            # NOTE(review): the title attribute interpolates raw token_text;
            # a token containing '"' would break the attribute — consider
            # reusing the data-text escaping here.
            html_parts.append(f"""<span class="{token_class}"
id="{token_id}"
data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
data-id="{token["id"]}"
data-position="{i}"
data-model="{model}"
title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
onmouseover="highlightTokens('{escaped_text}')"
onmouseout="clearHighlights()"
onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
        # Per-section subword summary.
        # NOTE(review): divides by len(result["tokens"]) — raises
        # ZeroDivisionError if a tokenizer returns an empty token list;
        # confirm upstream guarantees at least one token.
        html_parts.append(f"""
</div>
<div style="margin-top: 8px; font-size: 12px; color: #666;">
Subwords: {subword_count}/{len(result["tokens"])}
({subword_count / len(result["tokens"]) * 100:.1f}%)
</div>
</div>
""")
    html_parts.append("</div>")
    return "".join(html_parts)
def generate_token_ids_display(results):
    """Generate a clean display of token IDs for each tokenizer"""
    if not results:
        return "No token IDs to display."

    parts = ["## 🔢 Token IDs by Tokenizer"]
    for model, result in results.items():
        if "error" in result:
            parts.append(f"\n### {result['model']} ❌")
            parts.append(f"Error: {result['error']}")
            continue

        parts.append(f"\n### {result['model']}")
        parts.append(
            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
        )

        ids = [str(tok["id"]) for tok in result["tokens"]]
        # Ten IDs per line inside a fenced code block for readability.
        grouped = [
            " ".join(ids[start : start + 10]) for start in range(0, len(ids), 10)
        ]
        parts.append("```")
        parts.append("\n".join(grouped))
        parts.append("```")

        # Quick duplicate/uniqueness summary.
        parts.append(
            f"**Stats**: {len(ids)} total tokens, {len(set(ids))} unique IDs"
        )

    return "\n".join(parts)
def compare_with_normalization(
    text, selected_models, normalization_method, show_details=False
):
    """Compare tokenizers with optional normalization.

    Tokenizes *text* with every selected model and, when
    ``normalization_method`` is not ``"none"``, also tokenizes the
    normalized form of the text.

    Args:
        text: raw input text to tokenize.
        selected_models: model keys; "gpt-4"/"gpt-2" use tiktoken,
            everything else goes through the HuggingFace wrapper.
        normalization_method: key understood by utils.normalize_text;
            "none" skips the normalized pass.
        show_details: accepted for signature compatibility with the other
            comparison entry points; not used here.

    Returns:
        (original_results, normalized_results, normalized_text) — the
        second dict is empty when normalization is "none".
    """
    # Fixes: removed a leftover debug print of the input/normalized text,
    # and deduplicated the tiktoken-vs-HF branch that was repeated for the
    # original and normalized passes.
    normalized_text = normalize_text(text, normalization_method)
    original_results = {}
    normalized_results = {}
    for model in selected_models:
        # Pick the backend once per model instead of per pass.
        tokenize = (
            tokenize_with_tiktoken
            if model in ("gpt-4", "gpt-2")
            else tokenize_with_hf
        )
        original_results[model] = tokenize(text, model)
        if normalization_method != "none":
            normalized_results[model] = tokenize(normalized_text, model)
    return original_results, normalized_results, normalized_text
def generate_detailed_analysis(results):
    """Produce markdown covering shared tokens, token-type counts, and
    subword ratios across all successfully tokenized models."""
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    lines = ["## 🔍 Detailed Analysis"]

    # Intersection of token texts across every tokenizer that succeeded.
    token_sets = [
        {tok["text"] for tok in res["tokens"]}
        for res in results.values()
        if "error" not in res
    ]
    if token_sets:
        shared = set.intersection(*token_sets)
        lines.append(f"\n### Common Tokens ({len(shared)})")
        if shared:
            # Show at most 15; render a bare space as a visible dot.
            shown = [
                "`·`" if tok == " " else f"`{tok}`" for tok in list(shared)[:15]
            ]
            lines.append(" ".join(shown))
        else:
            lines.append("No common tokens found.")

    # Per-model breakdown of token types (word/number/punctuation/...).
    lines.append("\n### Token Type Distribution")
    for res in results.values():
        if "error" in res:
            continue
        counts = Counter(tok["type"] for tok in res["tokens"])
        summary = ", ".join(f"{kind}: {n}" for kind, n in counts.items())
        lines.append(f"**{res['model']}**: {summary}")

    # Per-model subword counts and percentage.
    lines.append("\n### Subword Analysis")
    for res in results.values():
        if "error" in res:
            continue
        n_sub = sum(1 for tok in res["tokens"] if tok["is_subword"])
        pct = n_sub / len(res["tokens"]) * 100 if res["tokens"] else 0
        lines.append(f"**{res['model']}**: {n_sub} subwords ({pct:.1f}%)")

    return "\n".join(lines)
def create_efficiency_chart(results):
    """Build a bar chart comparing token counts across tokenizers.

    Args:
        results: mapping of model key -> tokenization result dict; entries
            containing an "error" key are skipped.

    Returns:
        A plotly Figure, or None when there is nothing to plot.
    """
    # Fix: dropped the `compression_ratios` list the original collected but
    # never used (dead code — only token counts are plotted).
    if not results:
        return None

    models = []
    token_counts = []
    for result in results.values():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])

    if not models:
        return None

    fig = go.Figure()
    # Single bar trace, one bar per tokenizer, labeled with its count.
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig
def create_token_distribution_chart(results):
    """Stacked bar chart of token-type counts per tokenizer, or None when
    there is no plottable data."""
    if not results:
        return None

    # One row per (tokenizer, token type) pair.
    rows = []
    for result in results.values():
        if "error" in result:
            continue
        type_counts = Counter(tok["type"] for tok in result["tokens"])
        for token_type, count in type_counts.items():
            rows.append(
                {
                    "Tokenizer": result["model"],
                    "Token Type": token_type,
                    "Count": count,
                }
            )

    if not rows:
        return None

    frame = pd.DataFrame(rows)
    return px.bar(
        frame,
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
# Custom CSS for better styling
# Injected into gr.Blocks below: sets the app font and a monospace style
# for token displays (complements the inline CSS emitted by
# generate_interactive_tokenization).
css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.token-display {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
background: #f8f9fa;
padding: 8px;
border-radius: 4px;
font-size: 0.9em;
}
"""
# Create the Gradio interface
# Layout: intro markdown, then a row with (sample picker + text input) on the
# left and (model / normalization controls) on the right, followed by the
# result panels and two charts.
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
# 🔤 Advanced Tokenizer Comparison Tool
Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
**Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
💡 **Try the sample texts** to see how tokenizers handle different challenges like:
- Mixed languages and scripts
- Programming code and JSON
- Long compound words
- Special characters and emojis
- Technical terminology
""")
    with gr.Row():
        with gr.Column(scale=2):
            # Sample texts dropdown
            # Entries are "label: text"; update_text_from_sample strips the
            # label before filling the text box.
            sample_texts = gr.Dropdown(
                choices=[
                    "Custom text (enter below)",
                    "english: The quick brown fox jumps over the lazy dog. It's 1234.56 and costs $789.",
                    "french: Le renard brun rapide saute par-dessus le chien paresseux. C'est 1234,56 et coûte 789€.",
                    "german: Der schnelle braune Fuchs springt über den faulen Hund. Es ist 1234,56 und kostet 789€.",
                    "turkish: Hızlı kahverengi tilki tembel köpeğin üstunden atlar. 1234.56'dır ve 789$ tutar.",
                    "chinese: 快速的棕色狐狸跳过懒狗。它是1234.56,价格为789美元。",
                    "arabic: الثعلب البني السريع يقفز فوق الكلب الكسول. إنه 1234.56 ويكلف 789 دولارًا.",
                    "hindi: तेज भूरी लोमड़ी आलसी कुत्ते पर कूदती है। यह 1234.56 है और 789 डॉलर की कीमत है।",
                    "code: def calculate_sum(a, b):\n    return a + b\n\nresult = calculate_sum(123, 456)",
                    "mixed: English text with numbers 12345 and special chars !@#$%, plus some code: x = f(y)",
                    "numbers: The price is $123.45 (20% off) = $98.76 savings 1 12 123 1234 12345 123456 1234567 12345678 123456789",
                    "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
                    "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
                    "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
                    "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
                    "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية😀 👍 🚀 🌍 🎉 💡 🔥 🎵 🏆 🌈",
                    "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
                    'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
                    "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
                ],
                value="Custom text (enter below)",
                label="Choose a sample text or enter your own",
                interactive=True,
            )
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here or select a sample above...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
        with gr.Column(scale=1):
            with gr.Tabs():
                with gr.TabItem("Models"):
                    # Model keys; "gpt-4"/"gpt-2" are routed to tiktoken,
                    # the rest to the HuggingFace tokenizer wrapper.
                    model_selector = gr.CheckboxGroup(
                        choices=[
                            "gpt-4",
                            "gpt-2",
                            "llama-2",
                            "llama-3",
                            "gemma-2",
                            "qwen3",
                            "qwen2.5",
                            "bert",
                            "bloom",
                            "aya-expanse",
                            "comma",
                            "tokenmonster",
                            "byt5",
                        ],
                        value=["gpt-4", "llama-3", "gpt-2"],
                        label="Select tokenizers to compare",
                    )
                    show_details = gr.Checkbox(
                        label="Show detailed analysis", value=False
                    )
                with gr.TabItem("Normalization"):
                    # Choices come from utils.get_normalization_methods();
                    # assumes each entry is a (value, ...) pair — TODO confirm.
                    normalization_method = gr.Dropdown(
                        choices=[method[0] for method in get_normalization_methods()],
                        value="none",
                        label="Normalization Method",
                    )
                    show_normalization = gr.Checkbox(
                        label="Show normalized results", value=False
                    )
    with gr.Row():
        with gr.Column():
            efficiency_output = gr.Markdown(
                label="Efficiency Ranking",
                value="Enter text above to see efficiency comparison...",
            )
    with gr.Row():
        with gr.Column():
            tokenization_display = gr.HTML(
                label="Interactive Tokenization (Hover to highlight across tokenizers)",
                value="<p>Enter text above to see interactive tokenization...</p>",
            )
    with gr.Row():
        with gr.Column():
            # Hidden by default; intended for the normalized view.
            normalized_display = gr.HTML(
                label="Normalized Tokenization",
                value="<p>Enable normalization to see results...</p>",
                visible=False,
            )
    with gr.Row():
        with gr.Column():
            token_ids_output = gr.Markdown(
                label="Token IDs", value="Token IDs will appear here..."
            )
    with gr.Row():
        with gr.Column():
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)
    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")
# Function to update text input when sample is selected
def update_text_from_sample(sample_choice):
if sample_choice == "Custom text (enter below)":
return gr.update() # Don't change the text input
else:
# Extract the text after the colon
sample_text = (
sample_choice.split(": ", 1)[1]
if ": " in sample_choice
else sample_choice
)
return gr.update(value=sample_text)
# Update text input when sample is selected
sample_texts.change(
fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
)
# Main comparison function
def update_comparison_with_norm(text, models, details, norm_method, show_norm):
    """Run the tokenizer comparison, optionally alongside normalized text.

    Args:
        text: input text from the textbox.
        models: selected model keys.
        details: whether to build the detailed analysis.
        norm_method: normalization method key ("none" disables it).
        show_norm: whether the normalized view is enabled.

    Returns:
        The 6-tuple expected by the output components.
    """
    # Bug fix: the original tested `normalization_method == "none"`, which
    # referenced the enclosing-scope Gradio Dropdown *component* (never equal
    # to a string), not the `norm_method` argument — so normalization was
    # gated solely by `show_norm`. Compare the string parameter instead.
    if norm_method == "none" or not show_norm:
        # Plain comparison path — returns the full 6-tuple directly.
        return compare_tokenizers(text, models, details)

    # Normalization path: tokenize both the original and normalized text.
    original_results, normalized_results, normalized_text = (
        compare_with_normalization(text, models, norm_method, details)
    )
    orig_eff, orig_html, orig_ids = generate_basic_comparison(original_results)
    _, norm_html, _ = generate_basic_comparison(normalized_results)

    # Show normalized output first, then the original, in one HTML panel.
    combined_html = (
        f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}"
        f"\n<h2>Original</h2>{orig_html}"
    )
    return (
        orig_eff,
        gr.update(value=combined_html, visible=True),
        orig_ids,
        "",
        None,
        None,
    )
def update_comparison(text, models, details):
    """Thin wrapper passing the inputs straight to compare_tokenizers."""
    return compare_tokenizers(text, models, details)
# Auto-update on changes
# Every input control re-runs the same handler; its 6-tuple return maps 1:1
# onto the six output components listed below.
for component in [
    text_input,
    model_selector,
    show_details,
    normalization_method,
    show_normalization,
]:
    component.change(
        fn=update_comparison_with_norm,
        inputs=[
            text_input,
            model_selector,
            show_details,
            normalization_method,
            show_normalization,
        ],
        outputs=[
            efficiency_output,
            tokenization_display,
            token_ids_output,
            detailed_output,
            efficiency_chart,
            distribution_chart,
        ],
    )
gr.Markdown("""
---
### About the Models
- **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
- **LLaMA-2/3**: Meta's models using SentencePiece (Llama-3 uses BPE)
- **Gemma-2**: Google's model with SentencePiece (though HuggingFace uses BPE)
- **Qwen3/2.5**: Alibaba's models with BPE
- **BERT/DistilBERT**: Google's models with WordPiece
- **BLOOM**: BigScience's multilingual model with BPE
- **Aya Expanse**: Cohere's multilingual model with SentencePiece
- **Comma (Common Pile)**: Common Pile's model with BPE
- **Byt5**: Google's byte-level model
### Features
- **Efficiency Ranking**: Compare token counts across models
- **Subword Analysis**: See how models handle subwords
- **Token Types**: Classification of word/number/punctuation tokens
- **Visual Charts**: Interactive plots for comparison
- **Detailed Analysis**: Common tokens and distribution stats
""")
if __name__ == "__main__":
demo.launch()