"""Advanced tokenizer comparison tool — a Gradio app for comparing LLM tokenizers."""
from collections import Counter

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import tokenize_with_hf, tokenize_with_tiktoken
def compare_tokenizers(text, selected_models, show_details=False):
    """Tokenize *text* with each selected model and build every UI output.

    Args:
        text: The input string to tokenize.
        selected_models: Model keys chosen in the UI checkbox group.
        show_details: When True, also build the detailed-analysis markdown.

    Returns:
        A 6-tuple of (efficiency markdown, tokenization HTML, token-ID
        markdown, detailed-analysis markdown, efficiency chart,
        token-type distribution chart).
    """
    if not text.strip():
        return "Please enter some text to tokenize.", "", "", "", None, None

    results = {}
    for model in selected_models:
        # OpenAI models go through tiktoken; everything else through HF tokenizers.
        if model in ("gpt-4", "gpt-2"):
            results[model] = tokenize_with_tiktoken(text, model)
        else:
            results[model] = tokenize_with_hf(text, model)

    # Generate outputs
    efficiency_output, tokenization_html, token_ids_output = (
        generate_basic_comparison(results)
    )
    detailed_output = generate_detailed_analysis(results) if show_details else ""
    efficiency_chart = create_efficiency_chart(results)
    token_distribution_chart = create_token_distribution_chart(results)

    return (
        efficiency_output,
        tokenization_html,
        token_ids_output,
        detailed_output,
        efficiency_chart,
        token_distribution_chart,
    )
def generate_basic_comparison(results):
    """Build the efficiency-ranking markdown plus the HTML/ID displays.

    Args:
        results: Mapping of model key -> tokenization result dict; a result
            that failed contains an ``"error"`` entry instead of counts.

    Returns:
        A 3-tuple of (ranking markdown, interactive tokenization HTML,
        token-ID markdown).
    """
    if not results:
        return "No results to display.", "", ""

    # Efficiency ranking. Errored results have no "token_count" key, so use
    # .get() with +inf to push them to the bottom instead of raising KeyError.
    sorted_models = sorted(
        results.items(),
        key=lambda item: item[1].get("token_count", float("inf")),
    )

    ranking_output = []
    ranking_output.append("## 🏆 Efficiency Ranking (Fewer tokens = more efficient)")
    for rank, (model, result) in enumerate(sorted_models, start=1):
        if "error" in result:
            ranking_output.append(
                f"{rank}. **{result['model']}**: ❌ Error - {result['error']}"
            )
        else:
            ranking_output.append(
                f"{rank}. **{result['model']}**: {result['token_count']} tokens "
                f"({result['compression_ratio']:.2f}x compression)"
            )

    # Generate interactive tokenization display
    tokenization_html = generate_interactive_tokenization(results)
    # Generate token ID tables
    token_ids_display = generate_token_ids_display(results)

    return "\n".join(ranking_output), tokenization_html, token_ids_display
def _escape_html(text):
    """Escape characters that would break HTML attributes or content.

    Ampersand must be replaced first so already-produced entities are not
    double-escaped.
    """
    return (
        text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
        .replace("'", "&#x27;")
    )


def generate_interactive_tokenization(results):
    """Generate HTML with hover highlighting across tokenizers.

    Args:
        results: Mapping of model key -> tokenization result dict. Failed
            results carry an ``"error"`` entry and are rendered as an error
            card instead of a token strip.

    Returns:
        One HTML string containing shared CSS/JS plus one container per
        tokenizer; hovering a token highlights identical token text in
        every tokenizer's strip.
    """
    if not results:
        return "<p>No tokenization results to display.</p>"

    html_parts = []
    # Shared stylesheet and highlight script, emitted once before the cards.
    html_parts.append("""
    <style>
    .tokenizer-container {
        margin-bottom: 20px;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        padding: 15px;
        background: white;
    }
    .tokenizer-header {
        font-weight: bold;
        font-size: 18px;
        margin-bottom: 10px;
        color: #2c3e50;
    }
    .token-display {
        font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
        line-height: 1.8;
        word-wrap: break-word;
    }
    .token {
        display: inline-block;
        margin: 2px;
        padding: 4px 8px;
        border-radius: 4px;
        border: 1px solid;
        cursor: pointer;
        transition: all 0.2s ease;
        position: relative;
        font-size: 14px;
    }
    .token:hover {
        transform: scale(1.1);
        z-index: 10;
        box-shadow: 0 2px 8px rgba(0,0,0,0.2);
    }
    .token.highlighted {
        background: #ff6b6b !important;
        border-color: #e55353 !important;
        color: white !important;
        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
    }
    .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
    .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
    .token-punctuation { background: #ffebee; border-color: #f44336; color: #c62828; }
    .token-whitespace { background: #f5f5f5; border-color: #9e9e9e; color: #616161; }
    .token-special { background: #fff3e0; border-color: #ff9800; color: #ef6c00; }
    .token-mixed { background: #e3f2fd; border-color: #2196f3; color: #1565c0; }
    .token-subword {
        background: #fff8e1 !important;
        border-color: #ffc107 !important;
        border-style: dashed !important;
    }
    .token-stats {
        display: inline-block;
        margin-left: 10px;
        padding: 2px 6px;
        background: #f8f9fa;
        border-radius: 3px;
        font-size: 12px;
        color: #666;
    }
    </style>
    <script>
    function highlightToken(text, allTokenizers) {
        // Remove existing highlights
        document.querySelectorAll('.token').forEach(token => {
            token.classList.remove('highlighted');
        });
        // Highlight tokens with same text across all tokenizers
        document.querySelectorAll('.token').forEach(token => {
            if (token.dataset.text === text) {
                token.classList.add('highlighted');
            }
        });
    }
    function clearHighlights() {
        document.querySelectorAll('.token').forEach(token => {
            token.classList.remove('highlighted');
        });
    }
    </script>
    """)

    for model, result in results.items():
        if "error" in result:
            html_parts.append(f"""
            <div class="tokenizer-container">
                <div class="tokenizer-header">{result["model"]} ❌</div>
                <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
            </div>
            """)
            continue

        html_parts.append(f"""
        <div class="tokenizer-container">
            <div class="tokenizer-header">
                {result["model"]}
                <span class="token-stats">
                    {result["token_count"]} tokens |
                    {result["encoding"]} |
                    {result["compression_ratio"]:.2f}x compression
                </span>
            </div>
            <div class="token-display">
        """)

        # Add tokens with hover functionality
        subword_count = 0
        for i, token in enumerate(result["tokens"]):
            token_text = token["text"]
            display_text = (
                token_text if token_text.strip() else "·"
            )  # Show space as dot

            # Determine token class
            token_class = f"token token-{token['type']}"
            if token["is_subword"]:
                token_class += " token-subword"
                subword_count += 1

            # Escape for HTML — the escaped form is used everywhere the text
            # lands inside an attribute (data-text, title, onmouseover arg)
            # so quotes in a token cannot break out of the attribute.
            escaped_text = _escape_html(token_text)
            escaped_display = _escape_html(display_text)

            html_parts.append(f"""
            <span class="{token_class}"
                  data-text="{escaped_text}"
                  data-id="{token["id"]}"
                  data-position="{i}"
                  title="Text: '{escaped_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
                  onmouseover="highlightToken('{escaped_text}', true)"
                  onmouseout="clearHighlights()">
                {escaped_display}
            </span>
            """)

        total_tokens = len(result["tokens"])
        # Guard: an empty token list must not divide by zero.
        subword_pct = (subword_count / total_tokens * 100) if total_tokens else 0.0
        html_parts.append(f"""
            </div>
            <div style="margin-top: 8px; font-size: 12px; color: #666;">
                Subwords: {subword_count}/{total_tokens}
                ({subword_pct:.1f}%)
            </div>
        </div>
        """)

    return "".join(html_parts)
def generate_token_ids_display(results):
    """Generate a clean markdown display of token IDs for each tokenizer.

    Args:
        results: Mapping of model key -> tokenization result dict; errored
            results (containing ``"error"``) get a short error section.

    Returns:
        A markdown string with one section per tokenizer: vocab/encoding
        info, the IDs in fenced code blocks (10 per line), and basic stats.
    """
    if not results:
        return "No token IDs to display."

    output = []
    output.append("## 🔢 Token IDs by Tokenizer")
    for model, result in results.items():
        if "error" in result:
            output.append(f"\n### {result['model']} ❌")
            output.append(f"Error: {result['error']}")
            continue

        output.append(f"\n### {result['model']}")
        output.append(
            f"**Vocab Size**: {result['vocab_size']:,} | **Encoding**: {result['encoding']}"
        )

        # Display token IDs in a readable format
        token_ids = [str(token["id"]) for token in result["tokens"]]

        # Group IDs for better readability (10 per line)
        lines = []
        for i in range(0, len(token_ids), 10):
            lines.append(" ".join(token_ids[i : i + 10]))

        output.append("```")
        output.append("\n".join(lines))
        output.append("```")

        # Add some statistics
        unique_ids = len(set(token_ids))
        output.append(
            f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
        )

        # Show ID ranges
        id_values = [token["id"] for token in result["tokens"]]
        if id_values:
            output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")

    return "\n".join(output)
def generate_detailed_analysis(results):
    """Build the detailed-analysis markdown comparing several tokenizers.

    Covers tokens common to every successful tokenizer, token-type
    distribution per tokenizer, and per-tokenizer subword ratios.

    Args:
        results: Mapping of model key -> tokenization result dict; results
            containing ``"error"`` are skipped.

    Returns:
        A markdown string, or an explanatory message when fewer than two
        tokenizers were supplied.
    """
    if not results or len(results) < 2:
        return "Need at least 2 tokenizers for detailed analysis."

    output = []
    output.append("## 🔍 Detailed Analysis")

    # Find common tokens
    all_token_sets = []
    for model, result in results.items():
        if "error" not in result:
            token_texts = {token["text"] for token in result["tokens"]}
            all_token_sets.append(token_texts)

    if all_token_sets:
        common_tokens = set.intersection(*all_token_sets)
        output.append(f"\n### Common Tokens ({len(common_tokens)})")
        if common_tokens:
            # Show a bare space as a visible dot; cap the list at 15 entries.
            common_display = [
                f"`{token}`" if token != " " else "`·`"
                for token in list(common_tokens)[:15]
            ]
            output.append(" ".join(common_display))
        else:
            output.append("No common tokens found.")

    # Token type distribution
    output.append("\n### Token Type Distribution")
    for model, result in results.items():
        if "error" not in result:
            type_counts = Counter(token["type"] for token in result["tokens"])
            type_display = [f"{type_}: {count}" for type_, count in type_counts.items()]
            output.append(f"**{result['model']}**: {', '.join(type_display)}")

    # Subword analysis
    output.append("\n### Subword Analysis")
    for model, result in results.items():
        if "error" not in result:
            subwords = [token for token in result["tokens"] if token["is_subword"]]
            # Guard against empty token lists to avoid ZeroDivisionError.
            subword_ratio = (
                len(subwords) / len(result["tokens"]) * 100 if result["tokens"] else 0
            )
            output.append(
                f"**{result['model']}**: {len(subwords)} subwords ({subword_ratio:.1f}%)"
            )

    return "\n".join(output)
def create_efficiency_chart(results):
    """Build a bar chart of token counts per tokenizer (lower = better).

    Args:
        results: Mapping of model key -> tokenization result dict; errored
            results are skipped.

    Returns:
        A ``plotly.graph_objects.Figure``, or ``None`` when there is
        nothing to plot.
    """
    if not results:
        return None

    models = []
    token_counts = []
    compression_ratios = []
    for model, result in results.items():
        if "error" not in result:
            models.append(result["model"])
            token_counts.append(result["token_count"])
            compression_ratios.append(result["compression_ratio"])

    if not models:
        return None

    fig = go.Figure()
    # Add token count bars
    fig.add_trace(
        go.Bar(
            x=models,
            y=token_counts,
            name="Token Count",
            marker_color="lightblue",
            text=token_counts,
            textposition="auto",
        )
    )
    fig.update_layout(
        title="Token Count Comparison (Lower = More Efficient)",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
        template="plotly_white",
    )
    return fig
def create_token_distribution_chart(results):
    """Build a stacked bar chart of token-type counts per tokenizer.

    Args:
        results: Mapping of model key -> tokenization result dict; errored
            results are skipped.

    Returns:
        A ``plotly.express`` bar figure, or ``None`` when there is nothing
        to plot.
    """
    if not results:
        return None

    all_data = []
    for model, result in results.items():
        if "error" not in result:
            type_counts = Counter(token["type"] for token in result["tokens"])
            for token_type, count in type_counts.items():
                all_data.append(
                    {
                        "Tokenizer": result["model"],
                        "Token Type": token_type,
                        "Count": count,
                    }
                )

    if not all_data:
        return None

    df = pd.DataFrame(all_data)
    fig = px.bar(
        df,
        x="Tokenizer",
        y="Count",
        color="Token Type",
        title="Token Type Distribution by Tokenizer",
        template="plotly_white",
    )
    return fig
# Custom CSS applied to the whole Gradio app (fonts + token display styling).
css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.token-display {
    font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
    background: #f8f9fa;
    padding: 8px;
    border-radius: 4px;
    font-size: 0.9em;
}
"""
# Create the Gradio interface
with gr.Blocks(
    title="🔤 Advanced Tokenizer Comparison", theme=gr.themes.Soft(), css=css
) as demo:
    gr.Markdown("""
    # 🔤 Advanced Tokenizer Comparison Tool

    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.

    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
    """)

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Enter your text here...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
        with gr.Column(scale=1):
            model_selector = gr.CheckboxGroup(
                choices=[
                    "gpt-4",
                    "gpt-2",
                    "llama-2",
                    "llama-3",
                    "gemma-2",
                    "qwen3",
                    "qwen2.5",
                    "bert",
                    "bloom",
                    "aya-expanse",
                    "comma",
                    "roberta",
                    "distilbert",
                    "tokenmonster",
                    "byt5",
                ],
                value=["gpt-4", "llama-3", "gpt-2"],
                label="Select tokenizers to compare",
            )
            show_details = gr.Checkbox(label="Show detailed analysis", value=False)

    with gr.Row():
        with gr.Column():
            efficiency_output = gr.Markdown(
                label="Efficiency Ranking",
                value="Enter text above to see efficiency comparison...",
            )

    with gr.Row():
        with gr.Column():
            tokenization_display = gr.HTML(
                label="Interactive Tokenization (Hover to highlight across tokenizers)",
                value="<p>Enter text above to see interactive tokenization...</p>",
            )

    with gr.Row():
        with gr.Column():
            token_ids_output = gr.Markdown(
                label="Token IDs", value="Token IDs will appear here..."
            )

    with gr.Row():
        with gr.Column():
            detailed_output = gr.Markdown(label="Detailed Analysis", visible=False)

    with gr.Row():
        with gr.Column():
            efficiency_chart = gr.Plot(label="Efficiency Comparison")
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")

    # Show/hide the detailed-analysis panel when the checkbox changes.
    def toggle_details(show_details):
        return gr.update(visible=show_details)

    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)

    # Auto-update all outputs whenever any input changes. compare_tokenizers
    # already has the exact (text, models, details) signature, so it is wired
    # directly — no pass-through wrapper needed.
    for component in [text_input, model_selector, show_details]:
        component.change(
            fn=compare_tokenizers,
            inputs=[text_input, model_selector, show_details],
            outputs=[
                efficiency_output,
                tokenization_display,
                token_ids_output,
                detailed_output,
                efficiency_chart,
                distribution_chart,
            ],
        )

    gr.Markdown("""
    ---
    ### About the Models
    - **GPT-4/GPT-2**: OpenAI's tokenizers using BPE (Byte-Pair Encoding)
    - **LLaMA-2/3**: Meta's models using SentencePiece
    - **Gemma-2**: Google's model with SentencePiece
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT/DistilBERT**: Google's models with WordPiece
    - **RoBERTa**: Facebook's model with BPE
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma (Common Pile)**: Common Pile's model with BPE

    ### Features
    - **Efficiency Ranking**: Compare token counts across models
    - **Subword Analysis**: See how models handle subwords
    - **Token Types**: Classification of word/number/punctuation tokens
    - **Visual Charts**: Interactive plots for comparison
    - **Detailed Analysis**: Common tokens and distribution stats
    """)
if __name__ == "__main__":
    # Launch the app exactly once; the source had demo.launch() repeated
    # three times, which would block on the first call and never reach
    # (or redundantly re-invoke) the rest.
    demo.launch()