Gül Sena Altıntaş committed
Commit 452a924 · Parent(s): c02e89e

Added sample texts
app.py
CHANGED
@@ -68,14 +68,17 @@ def generate_basic_comparison(results):
 
 
 def generate_interactive_tokenization(results):
-    """Generate HTML with hover highlighting across tokenizers"""
+    """Generate HTML with working hover highlighting across tokenizers"""
     if not results:
         return "<p>No tokenization results to display.</p>"
 
     html_parts = []
+
+    # Add styles first
     html_parts.append("""
+    <div id="tokenizer-container">
     <style>
-    .tokenizer-
+    .tokenizer-section {
         margin-bottom: 20px;
         border: 1px solid #e0e0e0;
         border-radius: 8px;
@@ -103,9 +106,10 @@ def generate_interactive_tokenization(results):
         transition: all 0.2s ease;
         position: relative;
         font-size: 14px;
+        user-select: none;
     }
     .token:hover {
-        transform: scale(1.
+        transform: scale(1.05);
         z-index: 10;
         box-shadow: 0 2px 8px rgba(0,0,0,0.2);
     }
@@ -113,7 +117,9 @@ def generate_interactive_tokenization(results):
         background: #ff6b6b !important;
         border-color: #e55353 !important;
         color: white !important;
-        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
+        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
+        transform: scale(1.1) !important;
+        z-index: 100 !important;
     }
     .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
     .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
@@ -135,35 +141,64 @@ def generate_interactive_tokenization(results):
         font-size: 12px;
         color: #666;
     }
+    .highlight-info {
+        position: fixed;
+        top: 10px;
+        right: 10px;
+        background: #333;
+        color: white;
+        padding: 8px 12px;
+        border-radius: 4px;
+        font-size: 12px;
+        display: none;
+        z-index: 1000;
+    }
     </style>
 
+    <div class="highlight-info" id="highlight-info"></div>
+
     <script>
-    function
-        //
-        document.querySelectorAll('.token').forEach(token
+    function highlightTokens(targetText) {
+        // Clear all highlights
+        document.querySelectorAll('.token').forEach(function(token) {
             token.classList.remove('highlighted');
         });
 
-        // Highlight tokens
-
-
+        // Highlight matching tokens
+        let count = 0;
+        document.querySelectorAll('.token').forEach(function(token) {
+            if (token.getAttribute('data-text') === targetText) {
                 token.classList.add('highlighted');
+                count++;
             }
         });
+
+        // Show info
+        const info = document.getElementById('highlight-info');
+        if (info) {
+            const displayText = targetText === ' ' ? '(space)' : targetText;
+            info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
+            info.style.display = 'block';
+        }
     }
 
     function clearHighlights() {
-        document.querySelectorAll('.token').forEach(token
+        document.querySelectorAll('.token').forEach(function(token) {
             token.classList.remove('highlighted');
         });
+        const info = document.getElementById('highlight-info');
+        if (info) {
+            info.style.display = 'none';
+        }
     }
     </script>
     """)
 
+    # Generate tokenizer sections with inline event handlers
     for model, result in results.items():
        if "error" in result:
             html_parts.append(f"""
-            <div class="tokenizer-
+            <div class="tokenizer-section">
             <div class="tokenizer-header">{result["model"]} ❌</div>
             <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
             </div>
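The new highlighting is driven entirely by matching `data-text` attributes, so it can be exercised outside Gradio. A minimal sketch, assuming a throwaway `hover_demo.html` written from Python (not part of app.py), with two tokens that share the same text:

```python
# Standalone page using the same data-text matching as the script above:
# hovering a token highlights every token whose data-text is identical.
page = """<!doctype html>
<style>
.token { border: 1px solid #ccc; margin: 2px; padding: 2px; }
.highlighted { background: #ff6b6b; color: #fff; }
</style>
<script>
function highlightTokens(t) {
  document.querySelectorAll('.token').forEach(function(tok) {
    tok.classList.toggle('highlighted', tok.getAttribute('data-text') === t);
  });
}
function clearHighlights() {
  document.querySelectorAll('.token').forEach(function(tok) {
    tok.classList.remove('highlighted');
  });
}
</script>
<span class="token" data-text="Hello" onmouseover="highlightTokens('Hello')" onmouseout="clearHighlights()">Hello</span>
<span class="token" data-text="Hello" onmouseover="highlightTokens('Hello')" onmouseout="clearHighlights()">Hello</span>
<span class="token" data-text="world" onmouseover="highlightTokens('world')" onmouseout="clearHighlights()">world</span>
"""

with open("hover_demo.html", "w", encoding="utf-8") as f:
    f.write(page)
```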
@@ -171,7 +206,7 @@ def generate_interactive_tokenization(results):
             continue
 
         html_parts.append(f"""
-        <div class="tokenizer-
+        <div class="tokenizer-section">
         <div class="tokenizer-header">
             {result["model"]}
             <span class="token-stats">
@@ -183,13 +218,11 @@ def generate_interactive_tokenization(results):
         <div class="token-display">
         """)
 
-        # Add tokens with
+        # Add tokens with inline event handlers
         subword_count = 0
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
-            display_text = (
-                token_text if token_text.strip() else "·"
-            )  # Show space as dot
+            display_text = token_text if token_text.strip() else "·"
 
             # Determine token class
             token_class = f"token token-{token['type']}"
@@ -197,21 +230,31 @@ def generate_interactive_tokenization(results):
                 token_class += " token-subword"
                 subword_count += 1
 
-            #
-
+            # Create unique identifier for this token occurrence
+            token_id = f"token_{model}_{i}"
+
+            # Escape text for HTML and JavaScript - be very careful with quotes
+            escaped_text = (
+                token_text.replace("\\", "\\\\")
+                .replace("'", "\\'")
+                .replace('"', '\\"')
+                .replace("\n", "\\n")
+                .replace("\r", "\\r")
+            )
+
             escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
 
-
-
-
+            # Use inline event handlers that definitely work in Gradio
+            html_parts.append(f"""<span class="{token_class}"
+            id="{token_id}"
+            data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
             data-id="{token["id"]}"
             data-position="{i}"
+            data-model="{model}"
             title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
-            onmouseover="
-            onmouseout="clearHighlights()"
-
-            </span>
-        """)
+            onmouseover="highlightTokens('{escaped_text}')"
+            onmouseout="clearHighlights()"
+            onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
 
         html_parts.append(f"""
         </div>
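The `escaped_text` built above has to survive two parsers at once: the HTML attribute parser and the JavaScript string literal inside the inline handler. A minimal sketch of that round-trip; `escape_for_js_attr` mirrors the diff's manual chain, while `escape_via_json` is a hypothetical standard-library alternative, not something app.py uses:

```python
import html
import json

def escape_for_js_attr(token_text: str) -> str:
    # Same order as the diff: backslashes first, then quotes and newlines,
    # so the result can sit inside onmouseover="highlightTokens('...')".
    return (
        token_text.replace("\\", "\\\\")
        .replace("'", "\\'")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )

def escape_via_json(token_text: str) -> str:
    # json.dumps yields a valid JS string literal; html.escape then makes
    # it safe to embed in a double-quoted HTML attribute.
    return html.escape(json.dumps(token_text), quote=True)

for sample in ["it's", 'say "hi"', "line\nbreak"]:
    print(escape_for_js_attr(sample), "|", escape_via_json(sample))
```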
@@ -222,6 +265,7 @@ def generate_interactive_tokenization(results):
         </div>
         """)
 
+    html_parts.append("</div>")
     return "".join(html_parts)
 
 
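For a quick smoke test of the finished function: the loops above only read `result["model"]`, `result["error"]`, and per-token `text`/`id`/`type`/`is_subword`. A hypothetical minimal fixture (the real dicts come from the app's tokenizer wrappers, and the ids below are made up):

```python
# Assumes this runs next to app.py; the fixture fields mirror what
# generate_interactive_tokenization actually reads.
from app import generate_interactive_tokenization

results = {
    "gpt-4": {
        "model": "gpt-4",
        "tokens": [
            {"text": "Hello", "id": 9906, "type": "word", "is_subword": False},
            {"text": " world", "id": 1917, "type": "word", "is_subword": False},
            {"text": "!", "id": 999, "type": "punctuation", "is_subword": False},
        ],
    },
    "broken-model": {"model": "broken-model", "error": "tokenizer not found"},
}

html_out = generate_interactive_tokenization(results)
print("tokenizer-section" in html_out, "broken-model" in html_out)  # True True
```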
@@ -420,13 +464,44 @@ with gr.Blocks(
     Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
 
     **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
+
+    💡 **Try the sample texts** to see how tokenizers handle different challenges like:
+    - Mixed languages and scripts
+    - Programming code and JSON
+    - Long compound words
+    - Special characters and emojis
+    - Technical terminology
     """)
 
     with gr.Row():
         with gr.Column(scale=2):
+            # Sample texts dropdown
+            sample_texts = gr.Dropdown(
+                choices=[
+                    "Custom text (enter below)",
+                    "Basic English: Hello world! How are you doing today?",
+                    "Programming code: def tokenize_text(input_str): return input_str.split()",
+                    "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
+                    "Numbers & symbols: The price is $123.45 (20% off) = $98.76 savings!",
+                    "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
+                    "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
+                    "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
+                    "Poetry: Roses are red, violets are blue, tokenizers split words, in ways quite new!",
+                    "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
+                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية",
+                    "Repetitive text: Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.",
+                    "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
+                    'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
+                    "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
+                ],
+                value="Custom text (enter below)",
+                label="Choose a sample text or enter your own",
+                interactive=True,
+            )
+
             text_input = gr.Textbox(
                 label="Text to tokenize",
-                placeholder="Enter your text here...",
+                placeholder="Enter your text here or select a sample above...",
                 lines=4,
                 value="Hello world! This is a test with some subwords and punctuation.",
             )
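The hunk above only defines the widget; the event wiring lands in a later hunk. As a sanity check that the pattern works in isolation, a minimal sketch of the same dropdown-to-textbox flow (a standalone toy app, not the Space's code):

```python
import gradio as gr

with gr.Blocks() as toy:
    choice = gr.Dropdown(
        choices=["Custom text (enter below)", "Greeting: Hello world!"],
        value="Custom text (enter below)",
        label="Sample",
    )
    box = gr.Textbox(label="Text", lines=2)

    def fill(sample):
        # Keep the user's text when the placeholder entry is selected.
        if sample == "Custom text (enter below)":
            return gr.update()
        return gr.update(value=sample.split(": ", 1)[1])

    choice.change(fn=fill, inputs=choice, outputs=box)

if __name__ == "__main__":
    toy.launch()
```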
@@ -445,8 +520,6 @@ with gr.Blocks(
                     "bloom",
                     "aya-expanse",
                     "comma",
-                    "roberta",
-                    "distilbert",
                     "tokenmonster",
                     "byt5",
                 ],
@@ -486,11 +559,23 @@ with gr.Blocks(
         with gr.Column():
             distribution_chart = gr.Plot(label="Token Type Distribution")
 
-    #
-    def
-
+    # Function to update text input when sample is selected
+    def update_text_from_sample(sample_choice):
+        if sample_choice == "Custom text (enter below)":
+            return gr.update()  # Don't change the text input
+        else:
+            # Extract the text after the colon
+            sample_text = (
+                sample_choice.split(": ", 1)[1]
+                if ": " in sample_choice
+                else sample_choice
+            )
+            return gr.update(value=sample_text)
 
-
+    # Update text input when sample is selected
+    sample_texts.change(
+        fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
+    )
 
     # Main comparison function
     def update_comparison(text, models, details):
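One subtlety in `update_text_from_sample`: it splits on the first `": "` only, so colons inside the sample itself survive. A quick check with a hypothetical `extract_sample` helper that mirrors that parsing:

```python
def extract_sample(sample_choice: str) -> str:
    # Mirror update_text_from_sample: split on the first ": " only.
    return sample_choice.split(": ", 1)[1] if ": " in sample_choice else sample_choice

print(extract_sample("Basic English: Hello world! How are you doing today?"))
# -> Hello world! How are you doing today?
print(extract_sample('JSON data: {"name": "John Doe", "age": 30}'))
# -> {"name": "John Doe", "age": 30}
```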
@@ -523,10 +608,10 @@ with gr.Blocks(
     - **Gemma-2**: Google's model with SentencePiece
     - **Qwen3/2.5**: Alibaba's models with BPE
     - **BERT/DistilBERT**: Google's models with WordPiece
-    - **RoBERTa**: Facebook's model with BPE
     - **BLOOM**: BigScience's multilingual model with BPE
     - **Aya Expanse**: Cohere's multilingual model with SentencePiece
     - **Comma (Common Pile)**: Common Pile's model with BPE
+    - **Byt5**: Google's byte-level model
 
     ### Features
     - **Efficiency Ranking**: Compare token counts across models
@@ -538,5 +623,3 @@ with gr.Blocks(
 
 if __name__ == "__main__":
     demo.launch()
-    demo.launch()
-    demo.launch()