alakxender committed on
Commit
2af4cfb
·
1 Parent(s): fee5e46
Files changed (1) hide show
  1. app.py +118 -38
app.py CHANGED
@@ -1,5 +1,9 @@
1
  import gradio as gr
2
  from transformers import AutoTokenizer, T5Tokenizer
 
 
 
 
3
 
4
  # Fixed list of custom tokenizers (left)
5
  TOKENIZER_CUSTOM = {
@@ -21,16 +25,26 @@ SUGGESTED_STOCK_PATHS = [
21
  "microsoft/deberta-v3-base"
22
  ]
23
 
 
 
 
24
  # Load tokenizer with fallback to slow T5
25
  def load_tokenizer(tokenizer_path):
 
 
 
26
  try:
27
- return AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
 
 
28
  except Exception:
29
  if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
30
- return T5Tokenizer.from_pretrained(tokenizer_path)
 
 
31
  raise
32
 
33
- # Tokenize and decode with error handling
34
  def tokenize_display(text, tokenizer_path):
35
  try:
36
  tokenizer = load_tokenizer(tokenizer_path)
@@ -42,36 +56,51 @@ def tokenize_display(text, tokenizer_path):
42
  except Exception as e:
43
  return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
44
 
45
- # Comparison logic
46
- def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def format_block(title, tokenizer_path):
48
  dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
49
  en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
50
 
51
  return f"""\
52
- ### 🔤 {title}
53
 
54
- #### 🈁 Dhivehi Text
55
- `{dv_text}`
56
 
57
- **Tokenized:**
58
- {' '.join(dv_tokens)}
59
 
60
- **Number of tokens:** {len(dv_tokens) if dv_ids else 'N/A'}
61
- **IDs:** {dv_ids or '[ERROR]'}
62
- **Decoded:** `{dv_decoded}`
63
 
64
  ---
65
 
66
- #### 🇬🇧 English Text
67
- `{en_text}`
 
 
68
 
69
- **Tokenized:**
70
- {' '.join(en_tokens)}
71
 
72
- **Number of tokens:** {len(en_tokens) if en_ids else 'N/A'}
73
- **IDs:** {en_ids or '[ERROR]'}
74
- **Decoded:** `{en_decoded}`
75
  """
76
 
77
  try:
@@ -79,52 +108,103 @@ def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
79
  except KeyError:
80
  return "[ERROR] Invalid custom tokenizer selected", ""
81
 
82
- return (
83
- format_block("Custom Tokenizer", custom_path),
84
- format_block("Stock Tokenizer", stock_path)
85
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- # Gradio UI
88
- with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
90
  gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")
91
 
92
  with gr.Row():
93
  dhivehi_text = gr.Textbox(
94
  label="Dhivehi Text",
95
- lines=1,
96
  value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
97
- rtl=True
 
98
  )
99
  english_text = gr.Textbox(
100
  label="English Text",
101
- lines=1,
102
- value="The quick brown fox jumps over the lazy dog"
 
103
  )
104
 
105
  with gr.Row():
106
  tokenizer_a = gr.Dropdown(
107
  label="Select Custom Tokenizer",
108
  choices=list(TOKENIZER_CUSTOM.keys()),
109
- value="T5 Extended"
 
110
  )
111
  tokenizer_b = gr.Dropdown(
112
  label="Enter or Select Stock Tokenizer Path",
113
  choices=SUGGESTED_STOCK_PATHS,
114
  value="google/flan-t5-base",
115
- allow_custom_value=True
 
116
  )
117
 
118
- compare_button = gr.Button("Compare Tokenizers")
119
 
120
  with gr.Row():
121
- output_custom = gr.Markdown(label="Custom Tokenizer Output")
122
- output_stock = gr.Markdown(label="Stock Tokenizer Output")
123
 
 
124
  compare_button.click(
125
- compare_side_by_side,
126
  inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
127
- outputs=[output_custom, output_stock]
 
128
  )
129
 
130
- demo.launch()
 
 
 
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer, T5Tokenizer
3
+ import asyncio
4
+ import threading
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ import time
7
 
8
  # Fixed list of custom tokenizers (left)
9
  TOKENIZER_CUSTOM = {
 
25
  "microsoft/deberta-v3-base"
26
  ]
27
 
28
# Cache of loaded tokenizers, keyed by model path, so repeated comparisons
# do not re-download or re-initialize the same tokenizer.
tokenizer_cache = {}

# Load tokenizer with fallback to slow T5
def load_tokenizer(tokenizer_path):
    """Load and cache a tokenizer for ``tokenizer_path``.

    Tries the fast ``AutoTokenizer`` first. If that raises and the path looks
    like a T5-family model (contains "t5" or "mt5", case-insensitive), falls
    back to the slow ``T5Tokenizer``; any other failure is re-raised.

    Returns:
        The tokenizer instance (stored in ``tokenizer_cache`` on success).

    Raises:
        Exception: whatever ``AutoTokenizer.from_pretrained`` raised, when the
            path is not T5-family and the fast load failed.
    """
    if tokenizer_path in tokenizer_cache:
        return tokenizer_cache[tokenizer_path]

    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
    except Exception:
        # Fast tokenizer loading can fail for some SentencePiece models; only
        # T5-family paths get the slow fallback, everything else re-raises.
        if "t5" not in tokenizer_path.lower() and "mt5" not in tokenizer_path.lower():
            raise
        tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)

    # Single cache-insertion point (was duplicated in both branches).
    tokenizer_cache[tokenizer_path] = tokenizer
    return tokenizer
46
 
47
+ # Tokenize and decode with enhanced visualization
48
  def tokenize_display(text, tokenizer_path):
49
  try:
50
  tokenizer = load_tokenizer(tokenizer_path)
 
56
  except Exception as e:
57
  return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
58
 
59
def create_token_visualization(tokens, ids):
    """Render tokens as colored markdown blocks: ``emoji `token` (id)`` per token."""
    if not tokens or not ids:
        return "❌ No tokens to display"

    # Fixed palette, cycled over the token sequence.
    palette = ["🟦", "🟩", "🟨", "🟪", "🟧", "🟫"]

    def render(index, token, token_id):
        # Normalize sentencepiece/special markers for readability.
        label = token.replace('▁', '_').replace('</s>', '[END]').replace('<s>', '[START]')
        return f"{palette[index % len(palette)]} `{label}` ({token_id})"

    return " ".join(
        render(idx, tok, tid)
        for idx, (tok, tid) in enumerate(zip(tokens, ids))
    )
75
+
76
+ # Async comparison with progress updates
77
+ def compare_side_by_side_with_progress(dv_text, en_text, custom_label, stock_path, progress=gr.Progress()):
78
  def format_block(title, tokenizer_path):
79
  dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
80
  en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
81
 
82
  return f"""\
83
+ ## 🔤 {title}
84
 
85
+ ### 🈁 Dhivehi: `{dv_text}`
 
86
 
87
+ **🎯 Tokens:** {len(dv_tokens) if dv_ids else 'N/A'} tokens
88
+ {create_token_visualization(dv_tokens, dv_ids)}
89
 
90
+ **🔢 Token IDs:** `{dv_ids if dv_ids else '[ERROR]'}`
91
+ **🔄 Decoded:** `{dv_decoded}`
 
92
 
93
  ---
94
 
95
+ ### 🇬🇧 English: `{en_text}`
96
+
97
+ **🎯 Tokens:** {len(en_tokens) if en_ids else 'N/A'} tokens
98
+ {create_token_visualization(en_tokens, en_ids)}
99
 
100
+ **🔢 Token IDs:** `{en_ids if en_ids else '[ERROR]'}`
101
+ **🔄 Decoded:** `{en_decoded}`
102
 
103
+ ---
 
 
104
  """
105
 
106
  try:
 
108
  except KeyError:
109
  return "[ERROR] Invalid custom tokenizer selected", ""
110
 
111
+ # Show loading progress
112
+ progress(0.1, desc="Loading custom tokenizer...")
113
+
114
+ # Load custom tokenizer
115
+ try:
116
+ custom_result = format_block("Custom Tokenizer", custom_path)
117
+ progress(0.5, desc="Custom tokenizer loaded. Loading stock tokenizer...")
118
+ except Exception as e:
119
+ custom_result = f"[ERROR] Failed to load custom tokenizer: {str(e)}"
120
+ progress(0.5, desc="Custom tokenizer failed. Loading stock tokenizer...")
121
+
122
+ # Load stock tokenizer
123
+ try:
124
+ stock_result = format_block("Stock Tokenizer", stock_path)
125
+ progress(1.0, desc="Complete!")
126
+ except Exception as e:
127
+ stock_result = f"[ERROR] Failed to load stock tokenizer: {str(e)}"
128
+ progress(1.0, desc="Complete with errors!")
129
+
130
+ return custom_result, stock_result
131
+
132
# Non-blocking comparison function
# NOTE(review): this generator is not wired to any UI event in this file —
# the compare button calls compare_side_by_side_with_progress directly, so
# this function appears to be dead code; confirm before removing.
def compare_tokenizers_async(dv_text, en_text, custom_label, stock_path):
    """Yield a loading placeholder for both output panes, then the real results.

    Runs compare_side_by_side_with_progress on a worker thread so the first
    yield (the loading message) can reach the UI before the (possibly slow)
    tokenizer downloads finish. Yields 2-tuples of markdown strings:
    (custom pane, stock pane).
    """
    # Return immediate loading message
    loading_msg = """
## ⏳ Loading Tokenizer...

🚀 **Status:** Downloading and initializing tokenizer...

*This may take a moment for first-time downloads*
"""

    # Use ThreadPoolExecutor for non-blocking execution
    # NOTE(review): if future.result() below times out, exiting this `with`
    # block still waits for the running task (executor shutdown waits by
    # default), so the 120s timeout does not actually unblock the caller —
    # confirm this is acceptable.
    with ThreadPoolExecutor(max_workers=2) as executor:
        future = executor.submit(compare_side_by_side_with_progress, dv_text, en_text, custom_label, stock_path)

        # Return loading state first
        yield loading_msg, loading_msg

        # Then return actual results
        try:
            custom_result, stock_result = future.result(timeout=120)  # 2 minute timeout
            yield custom_result, stock_result
        except Exception as e:
            # TimeoutError and any tokenizer-loading failure both land here;
            # both panes show the same error markdown.
            error_msg = f"## ❌ Error\n\n**Failed to load tokenizers:** {str(e)}"
            yield error_msg, error_msg
157
+
158
# Gradio UI: two input texts, two tokenizer selectors, side-by-side markdown output.
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    # Input row: Dhivehi (right-to-left Thaana script) and English sample texts.
    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=2,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True,  # Dhivehi is written right-to-left
            placeholder="Enter Dhivehi text here..."
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=2,
            value="The quick brown fox jumps over the lazy dog",
            placeholder="Enter English text here..."
        )

    # Selector row: fixed custom tokenizers (left) vs. any HF hub path (right).
    with gr.Row():
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended",
            info="Pre-trained Dhivehi tokenizers"
        )
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True,  # allow arbitrary hub paths, not just suggestions
            info="Standard HuggingFace tokenizers"
        )

    compare_button = gr.Button("🔄 Compare Tokenizers", variant="primary", size="lg")

    # Output row: one markdown pane per tokenizer.
    # NOTE(review): gr.Markdown with a `height` argument — verify the installed
    # Gradio version supports it (older versions reject unknown kwargs).
    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output", height=400)
        output_stock = gr.Markdown(label="Stock Tokenizer Output", height=400)

    # NOTE(review): despite compare_tokenizers_async existing above, the button
    # is wired to the blocking compare_side_by_side_with_progress, which
    # reports progress via gr.Progress instead of yielding a loading state.
    compare_button.click(
        compare_side_by_side_with_progress,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
        show_progress=True
    )


# Launch only when run as a script (keeps the module importable).
if __name__ == "__main__":
    demo.launch()