Gül Sena Altıntaş committed
Commit 452a924 · 1 Parent(s): c02e89e

Added sample texts

Files changed (1)
  1. app.py +120 -37
app.py CHANGED
@@ -68,14 +68,17 @@ def generate_basic_comparison(results):
 
 
 def generate_interactive_tokenization(results):
-    """Generate HTML with hover highlighting across tokenizers"""
+    """Generate HTML with working hover highlighting across tokenizers"""
     if not results:
         return "<p>No tokenization results to display.</p>"
 
     html_parts = []
+
+    # Add styles first
     html_parts.append("""
+    <div id="tokenizer-container">
     <style>
-    .tokenizer-container {
+    .tokenizer-section {
         margin-bottom: 20px;
         border: 1px solid #e0e0e0;
         border-radius: 8px;
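
Judging from the fields this function reads later in the diff (result["model"], result["error"], result["tokens"], and per-token text/id/type/is_subword), the expected input is shaped roughly as in the sketch below; the model names and values here are invented for illustration.

    # Sketch of the assumed input shape for generate_interactive_tokenization();
    # field names come from the accesses visible in this diff, values are invented.
    results = {
        "gpt-4": {
            "model": "gpt-4",
            "tokens": [
                {"text": "Hello", "id": 9906, "type": "word", "is_subword": False},
                {"text": " world", "id": 1917, "type": "word", "is_subword": False},
            ],
        },
        "bert": {"model": "bert", "error": "failed to load tokenizer"},
    }
    html = generate_interactive_tokenization(results)  # one self-contained HTML string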
@@ -103,9 +106,10 @@ def generate_interactive_tokenization(results):
         transition: all 0.2s ease;
         position: relative;
         font-size: 14px;
+        user-select: none;
     }
     .token:hover {
-        transform: scale(1.1);
+        transform: scale(1.05);
         z-index: 10;
         box-shadow: 0 2px 8px rgba(0,0,0,0.2);
     }
@@ -113,7 +117,9 @@ def generate_interactive_tokenization(results):
         background: #ff6b6b !important;
         border-color: #e55353 !important;
         color: white !important;
-        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5);
+        box-shadow: 0 0 10px rgba(255, 107, 107, 0.5) !important;
+        transform: scale(1.1) !important;
+        z-index: 100 !important;
     }
     .token-word { background: #e8f5e8; border-color: #4caf50; color: #2e7d32; }
     .token-number { background: #f3e5f5; border-color: #9c27b0; color: #7b1fa2; }
@@ -135,35 +141,64 @@ def generate_interactive_tokenization(results):
         font-size: 12px;
         color: #666;
     }
+    .highlight-info {
+        position: fixed;
+        top: 10px;
+        right: 10px;
+        background: #333;
+        color: white;
+        padding: 8px 12px;
+        border-radius: 4px;
+        font-size: 12px;
+        display: none;
+        z-index: 1000;
+    }
     </style>
 
+    <div class="highlight-info" id="highlight-info"></div>
+
     <script>
-    function highlightToken(text, allTokenizers) {
-        // Remove existing highlights
-        document.querySelectorAll('.token').forEach(token => {
+    function highlightTokens(targetText) {
+        // Clear all highlights
+        document.querySelectorAll('.token').forEach(function(token) {
             token.classList.remove('highlighted');
         });
 
-        // Highlight tokens with same text across all tokenizers
-        document.querySelectorAll('.token').forEach(token => {
-            if (token.dataset.text === text) {
+        // Highlight matching tokens
+        let count = 0;
+        document.querySelectorAll('.token').forEach(function(token) {
+            if (token.getAttribute('data-text') === targetText) {
                 token.classList.add('highlighted');
+                count++;
             }
         });
+
+        // Show info
+        const info = document.getElementById('highlight-info');
+        if (info) {
+            const displayText = targetText === ' ' ? '(space)' : targetText;
+            info.textContent = '"' + displayText + '" appears in ' + count + ' positions';
+            info.style.display = 'block';
+        }
     }
 
     function clearHighlights() {
-        document.querySelectorAll('.token').forEach(token => {
+        document.querySelectorAll('.token').forEach(function(token) {
             token.classList.remove('highlighted');
         });
+        const info = document.getElementById('highlight-info');
+        if (info) {
+            info.style.display = 'none';
+        }
     }
     </script>
     """)
 
+    # Generate tokenizer sections with inline event handlers
    for model, result in results.items():
        if "error" in result:
            html_parts.append(f"""
-            <div class="tokenizer-container">
+            <div class="tokenizer-section">
                <div class="tokenizer-header">{result["model"]} ❌</div>
                <div style="color: #d32f2f; font-style: italic;">Error: {result["error"]}</div>
            </div>
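
The matching contract between the Python side and this script is plain string equality on the data-text attribute. A sketch of the markup the token loop further down emits for one token (all values here are hypothetical, and the real span also carries an onclick alert handler):

    # Illustrative only: one rendered token span, with invented values.
    span = (
        '<span class="token token-word" id="token_gpt-4_0" '
        'data-text="Hello" data-id="9906" data-position="0" data-model="gpt-4" '
        "title=\"Text: 'Hello' | ID: 9906 | Type: word | Subword: False\" "
        "onmouseover=\"highlightTokens('Hello')\" "
        'onmouseout="clearHighlights()">Hello</span>'
    )
    # Hovering runs highlightTokens('Hello'), which adds .highlighted to every
    # span whose data-text attribute equals 'Hello', across all tokenizer sections.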
@@ -171,7 +206,7 @@ def generate_interactive_tokenization(results):
             continue
 
         html_parts.append(f"""
-        <div class="tokenizer-container">
+        <div class="tokenizer-section">
            <div class="tokenizer-header">
                {result["model"]}
                <span class="token-stats">
@@ -183,13 +218,11 @@ def generate_interactive_tokenization(results):
            <div class="token-display">
        """)
 
-        # Add tokens with hover functionality
+        # Add tokens with inline event handlers
        subword_count = 0
        for i, token in enumerate(result["tokens"]):
            token_text = token["text"]
-            display_text = (
-                token_text if token_text.strip() else "·"
-            )  # Show space as dot
+            display_text = token_text if token_text.strip() else "·"
 
            # Determine token class
            token_class = f"token token-{token['type']}"
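
The simplified display rule renders any whitespace-only token as a visible middle dot; a quick check of its behavior:

    # Whitespace-only tokens would otherwise be invisible in the rendered HTML.
    for tok in ["Hello", " ", "\n", "ing"]:
        print(repr(tok), "->", tok if tok.strip() else "·")
    # 'Hello' -> Hello   ' ' -> ·   '\n' -> ·   'ing' -> ing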
@@ -197,21 +230,31 @@ def generate_interactive_tokenization(results):
                token_class += " token-subword"
                subword_count += 1
 
-            # Escape text for HTML
-            escaped_text = token_text.replace('"', "&quot;").replace("'", "&#39;")
+            # Create unique identifier for this token occurrence
+            token_id = f"token_{model}_{i}"
+
+            # Escape text for HTML and JavaScript - be very careful with quotes
+            escaped_text = (
+                token_text.replace("\\", "\\\\")
+                .replace("'", "\\'")
+                .replace('"', '\\"')
+                .replace("\n", "\\n")
+                .replace("\r", "\\r")
+            )
+
            escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
 
-            html_parts.append(f"""
-            <span class="{token_class}"
-                  data-text="{escaped_text}"
+            # Use inline event handlers that definitely work in Gradio
+            html_parts.append(f"""<span class="{token_class}"
+                  id="{token_id}"
+                  data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
                  data-id="{token["id"]}"
                  data-position="{i}"
+                  data-model="{model}"
                  title="Text: '{token_text}' | ID: {token["id"]} | Type: {token["type"]} | Subword: {token["is_subword"]}"
-                  onmouseover="highlightToken('{escaped_text}', true)"
-                  onmouseout="clearHighlights()">
-                {escaped_display}
-            </span>
-            """)
+                  onmouseover="highlightTokens('{escaped_text}')"
+                  onmouseout="clearHighlights()"
+                  onclick="alert('Token: \\'{escaped_text}\\'\\nID: {token["id"]}\\nModel: {model}')">{escaped_display}</span>""")
 
            html_parts.append(f"""
        </div>
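
The token text lands in two fragile contexts at once: an HTML attribute value and a single-quoted JavaScript string inside an inline handler, which is why the escape chain above handles backslashes, both quote styles, and newlines. A more conventional alternative (a sketch, not what this commit does) is to let html.escape and json.dumps do the work:

    import html
    import json

    token_text = 'he said "hi"\n'                       # example value
    attr_safe = html.escape(token_text, quote=True)     # safe inside data-text="..."
    js_literal = json.dumps(token_text)                 # a valid JS string literal
    handler = f"highlightTokens({html.escape(js_literal, quote=True)})"
    # handler can be embedded as onmouseover="..." without breaking either the
    # HTML attribute quoting or the JavaScript string inside it.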
@@ -222,6 +265,7 @@ def generate_interactive_tokenization(results):
        </div>
        """)
 
+    html_parts.append("</div>")
    return "".join(html_parts)
 
 
@@ -420,13 +464,44 @@ with gr.Blocks(
    Compare how different LLM tokenizers split text into tokens. Analyze efficiency, subwords, and token types.
 
    **Legend**: 🔤 Word | 🔢 Number | ❗ Punctuation | 🔸 Subword | · Space
+
+    💡 **Try the sample texts** to see how tokenizers handle different challenges like:
+    - Mixed languages and scripts
+    - Programming code and JSON
+    - Long compound words
+    - Special characters and emojis
+    - Technical terminology
    """)
 
    with gr.Row():
        with gr.Column(scale=2):
+            # Sample texts dropdown
+            sample_texts = gr.Dropdown(
+                choices=[
+                    "Custom text (enter below)",
+                    "Basic English: Hello world! How are you doing today?",
+                    "Programming code: def tokenize_text(input_str): return input_str.split()",
+                    "Mixed languages: Hello! 你好! こんにちは! Bonjour! Hola! مرحبا!",
+                    "Numbers & symbols: The price is $123.45 (20% off) = $98.76 savings!",
+                    "Subword challenge: antidisestablishmentarianism pseudopseudohypoparathyroidism",
+                    "Special characters: @user123 #AI #NLP https://example.com/api?q=tokenization&limit=100",
+                    "Scientific text: The mitochondria (powerhouse of the cell) produces ATP through oxidative phosphorylation.",
+                    "Poetry: Roses are red, violets are blue, tokenizers split words, in ways quite new!",
+                    "Technical jargon: The RESTful API endpoint /users/{id}/preferences supports GET/POST/PUT/DELETE operations.",
+                    "Emoji & Unicode: I love AI! 🤖✨ The café naïve résumé 北京大学 العربية",
+                    "Repetitive text: Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.",
+                    "Long compound words (German): Donaudampfschifffahrtselektrizitätenhauptbetriebswerkbauunterbeamtengesellschaft",
+                    'JSON data: {"name": "John Doe", "age": 30, "skills": ["Python", "JavaScript", "AI/ML"]}',
+                    "Medical terminology: Pneumonoultramicroscopicsilicovolcanoconiosisdiagnosis requires thorough radiological examination.",
+                ],
+                value="Custom text (enter below)",
+                label="Choose a sample text or enter your own",
+                interactive=True,
+            )
+
            text_input = gr.Textbox(
                label="Text to tokenize",
-                placeholder="Enter your text here...",
+                placeholder="Enter your text here or select a sample above...",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )
@@ -445,8 +520,6 @@
                    "bloom",
                    "aya-expanse",
                    "comma",
-                    "roberta",
-                    "distilbert",
                    "tokenmonster",
                    "byt5",
                ],
@@ -486,11 +559,23 @@
        with gr.Column():
            distribution_chart = gr.Plot(label="Token Type Distribution")
 
-    # Update visibility of detailed analysis
-    def toggle_details(show_details):
-        return gr.update(visible=show_details)
+    # Function to update text input when sample is selected
+    def update_text_from_sample(sample_choice):
+        if sample_choice == "Custom text (enter below)":
+            return gr.update()  # Don't change the text input
+        else:
+            # Extract the text after the colon
+            sample_text = (
+                sample_choice.split(": ", 1)[1]
+                if ": " in sample_choice
+                else sample_choice
+            )
+            return gr.update(value=sample_text)
 
-    show_details.change(fn=toggle_details, inputs=show_details, outputs=detailed_output)
+    # Update text input when sample is selected
+    sample_texts.change(
+        fn=update_text_from_sample, inputs=sample_texts, outputs=text_input
+    )
 
    # Main comparison function
    def update_comparison(text, models, details):
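
Since every non-custom choice follows the "Label: text" convention, split(": ", 1) strips only the leading label; maxsplit=1 keeps any later colons (code, URLs, JSON) intact. For example:

    choice = "Programming code: def tokenize_text(input_str): return input_str.split()"
    sample = choice.split(": ", 1)[1] if ": " in choice else choice
    print(sample)  # def tokenize_text(input_str): return input_str.split()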
@@ -523,10 +608,10 @@ with gr.Blocks(
    - **Gemma-2**: Google's model with SentencePiece
    - **Qwen3/2.5**: Alibaba's models with BPE
    - **BERT/DistilBERT**: Google's models with WordPiece
-    - **RoBERTa**: Facebook's model with BPE
    - **BLOOM**: BigScience's multilingual model with BPE
    - **Aya Expanse**: Cohere's multilingual model with SentencePiece
    - **Comma (Common Pile)**: Common Pile's model with BPE
+    - **Byt5**: Google's byte-level model
 
    ### Features
    - **Efficiency Ranking**: Compare token counts across models
@@ -538,5 +623,3 @@
 
 if __name__ == "__main__":
     demo.launch()
-    demo.launch()
-    demo.launch()