Spaces:
Running
Running
Commit
·
2af4cfb
1
Parent(s):
fee5e46
app.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import AutoTokenizer, T5Tokenizer
|
|
|
|
|
|
|
|
|
3 |
|
4 |
# Fixed list of custom tokenizers (left)
|
5 |
TOKENIZER_CUSTOM = {
|
@@ -21,16 +25,26 @@ SUGGESTED_STOCK_PATHS = [
|
|
21 |
"microsoft/deberta-v3-base"
|
22 |
]
|
23 |
|
|
|
|
|
|
|
24 |
# Load tokenizer with fallback to slow T5
|
25 |
def load_tokenizer(tokenizer_path):
|
|
|
|
|
|
|
26 |
try:
|
27 |
-
|
|
|
|
|
28 |
except Exception:
|
29 |
if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
|
30 |
-
|
|
|
|
|
31 |
raise
|
32 |
|
33 |
-
# Tokenize and decode with
|
34 |
def tokenize_display(text, tokenizer_path):
|
35 |
try:
|
36 |
tokenizer = load_tokenizer(tokenizer_path)
|
@@ -42,36 +56,51 @@ def tokenize_display(text, tokenizer_path):
|
|
42 |
except Exception as e:
|
43 |
return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
|
44 |
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
def format_block(title, tokenizer_path):
|
48 |
dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
|
49 |
en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
|
50 |
|
51 |
return f"""\
|
52 |
-
|
53 |
|
54 |
-
|
55 |
-
`{dv_text}`
|
56 |
|
57 |
-
|
58 |
-
{
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
**Decoded:** `{dv_decoded}`
|
63 |
|
64 |
---
|
65 |
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
{
|
71 |
|
72 |
-
|
73 |
-
**IDs:** {en_ids or '[ERROR]'}
|
74 |
-
**Decoded:** `{en_decoded}`
|
75 |
"""
|
76 |
|
77 |
try:
|
@@ -79,52 +108,103 @@ def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
|
|
79 |
except KeyError:
|
80 |
return "[ERROR] Invalid custom tokenizer selected", ""
|
81 |
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
|
90 |
gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")
|
91 |
|
92 |
with gr.Row():
|
93 |
dhivehi_text = gr.Textbox(
|
94 |
label="Dhivehi Text",
|
95 |
-
lines=
|
96 |
value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
|
97 |
-
rtl=True
|
|
|
98 |
)
|
99 |
english_text = gr.Textbox(
|
100 |
label="English Text",
|
101 |
-
lines=
|
102 |
-
value="The quick brown fox jumps over the lazy dog"
|
|
|
103 |
)
|
104 |
|
105 |
with gr.Row():
|
106 |
tokenizer_a = gr.Dropdown(
|
107 |
label="Select Custom Tokenizer",
|
108 |
choices=list(TOKENIZER_CUSTOM.keys()),
|
109 |
-
value="T5 Extended"
|
|
|
110 |
)
|
111 |
tokenizer_b = gr.Dropdown(
|
112 |
label="Enter or Select Stock Tokenizer Path",
|
113 |
choices=SUGGESTED_STOCK_PATHS,
|
114 |
value="google/flan-t5-base",
|
115 |
-
allow_custom_value=True
|
|
|
116 |
)
|
117 |
|
118 |
-
compare_button = gr.Button("Compare Tokenizers")
|
119 |
|
120 |
with gr.Row():
|
121 |
-
output_custom = gr.Markdown(label="Custom Tokenizer Output")
|
122 |
-
output_stock = gr.Markdown(label="Stock Tokenizer Output")
|
123 |
|
|
|
124 |
compare_button.click(
|
125 |
-
|
126 |
inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
|
127 |
-
outputs=[output_custom, output_stock]
|
|
|
128 |
)
|
129 |
|
130 |
-
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import AutoTokenizer, T5Tokenizer
|
3 |
+
import asyncio
|
4 |
+
import threading
|
5 |
+
from concurrent.futures import ThreadPoolExecutor
|
6 |
+
import time
|
7 |
|
8 |
# Fixed list of custom tokenizers (left)
|
9 |
TOKENIZER_CUSTOM = {
|
|
|
25 |
"microsoft/deberta-v3-base"
|
26 |
]
|
27 |
|
28 |
+
# Cache of already-constructed tokenizers, keyed by model path, so repeated
# comparisons don't re-download / re-initialize the same tokenizer.
tokenizer_cache = {}


def load_tokenizer(tokenizer_path):
    """Return a tokenizer for *tokenizer_path*, reusing cached instances.

    The fast ``AutoTokenizer`` is tried first. If that fails and the path
    looks like a T5/mT5 model, fall back to the slow sentencepiece-based
    ``T5Tokenizer``; any other failure is re-raised to the caller.
    Successful loads (either path) are stored in ``tokenizer_cache``.
    """
    cached = tokenizer_cache.get(tokenizer_path)
    if cached is not None:
        return cached

    try:
        loaded = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
    except Exception:
        lowered = tokenizer_path.lower()
        if "t5" not in lowered and "mt5" not in lowered:
            raise  # not a T5 variant — propagate the original error
        loaded = T5Tokenizer.from_pretrained(tokenizer_path)

    tokenizer_cache[tokenizer_path] = loaded
    return loaded
|
46 |
|
47 |
+
# Tokenize and decode with enhanced visualization
|
48 |
def tokenize_display(text, tokenizer_path):
|
49 |
try:
|
50 |
tokenizer = load_tokenizer(tokenizer_path)
|
|
|
56 |
except Exception as e:
|
57 |
return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
|
58 |
|
59 |
+
def create_token_visualization(tokens, ids):
    """Create a visual representation of tokens with colors and spacing"""
    if not tokens or not ids:
        return "❌ No tokens to display"

    # Palette is cycled so long sequences repeat the same six colors.
    palette = ["🟦", "🟩", "🟨", "🟪", "🟧", "🟫"]

    def render(index, token, token_id):
        # Normalize sentencepiece / special markers for readability.
        label = (
            token.replace('▁', '_')
                 .replace('</s>', '[END]')
                 .replace('<s>', '[START]')
        )
        return f"{palette[index % len(palette)]} `{label}` ({token_id})"

    return " ".join(
        render(i, tok, tid) for i, (tok, tid) in enumerate(zip(tokens, ids))
    )
|
75 |
+
|
76 |
+
# Async comparison with progress updates
|
77 |
+
def compare_side_by_side_with_progress(dv_text, en_text, custom_label, stock_path, progress=gr.Progress()):
|
78 |
def format_block(title, tokenizer_path):
|
79 |
dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
|
80 |
en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
|
81 |
|
82 |
return f"""\
|
83 |
+
## 🔤 {title}
|
84 |
|
85 |
+
### 🈁 Dhivehi: `{dv_text}`
|
|
|
86 |
|
87 |
+
**🎯 Tokens:** {len(dv_tokens) if dv_ids else 'N/A'} tokens
|
88 |
+
{create_token_visualization(dv_tokens, dv_ids)}
|
89 |
|
90 |
+
**🔢 Token IDs:** `{dv_ids if dv_ids else '[ERROR]'}`
|
91 |
+
**🔄 Decoded:** `{dv_decoded}`
|
|
|
92 |
|
93 |
---
|
94 |
|
95 |
+
### 🇬🇧 English: `{en_text}`
|
96 |
+
|
97 |
+
**🎯 Tokens:** {len(en_tokens) if en_ids else 'N/A'} tokens
|
98 |
+
{create_token_visualization(en_tokens, en_ids)}
|
99 |
|
100 |
+
**🔢 Token IDs:** `{en_ids if en_ids else '[ERROR]'}`
|
101 |
+
**🔄 Decoded:** `{en_decoded}`
|
102 |
|
103 |
+
---
|
|
|
|
|
104 |
"""
|
105 |
|
106 |
try:
|
|
|
108 |
except KeyError:
|
109 |
return "[ERROR] Invalid custom tokenizer selected", ""
|
110 |
|
111 |
+
# Show loading progress
|
112 |
+
progress(0.1, desc="Loading custom tokenizer...")
|
113 |
+
|
114 |
+
# Load custom tokenizer
|
115 |
+
try:
|
116 |
+
custom_result = format_block("Custom Tokenizer", custom_path)
|
117 |
+
progress(0.5, desc="Custom tokenizer loaded. Loading stock tokenizer...")
|
118 |
+
except Exception as e:
|
119 |
+
custom_result = f"[ERROR] Failed to load custom tokenizer: {str(e)}"
|
120 |
+
progress(0.5, desc="Custom tokenizer failed. Loading stock tokenizer...")
|
121 |
+
|
122 |
+
# Load stock tokenizer
|
123 |
+
try:
|
124 |
+
stock_result = format_block("Stock Tokenizer", stock_path)
|
125 |
+
progress(1.0, desc="Complete!")
|
126 |
+
except Exception as e:
|
127 |
+
stock_result = f"[ERROR] Failed to load stock tokenizer: {str(e)}"
|
128 |
+
progress(1.0, desc="Complete with errors!")
|
129 |
+
|
130 |
+
return custom_result, stock_result
|
131 |
+
|
132 |
+
# Non-blocking comparison function
def compare_tokenizers_async(dv_text, en_text, custom_label, stock_path):
    """Yield a loading placeholder, then the real comparison results.

    Generator intended for Gradio streaming output: the first yield shows an
    immediate "loading" message in both output panes; the second replaces it
    with the actual comparison markdown (or an error message on failure).

    Bug fixed: the original wrapped the executor in a ``with`` block, whose
    ``__exit__`` waits for the worker thread to finish — so after the 120 s
    ``future.result`` timeout fired, the generator still blocked until the
    tokenizer load completed. The executor is now shut down without waiting
    (and pending work cancelled) so the timeout actually takes effect.
    """
    loading_msg = """
## ⏳ Loading Tokenizer...

🚀 **Status:** Downloading and initializing tokenizer...

*This may take a moment for first-time downloads*
"""

    executor = ThreadPoolExecutor(max_workers=2)
    try:
        future = executor.submit(
            compare_side_by_side_with_progress,
            dv_text, en_text, custom_label, stock_path,
        )

        # Return loading state first
        yield loading_msg, loading_msg

        # Then return actual results
        try:
            custom_result, stock_result = future.result(timeout=120)  # 2 minute timeout
            yield custom_result, stock_result
        except Exception as e:
            error_msg = f"## ❌ Error\n\n**Failed to load tokenizers:** {str(e)}"
            yield error_msg, error_msg
    finally:
        # Don't block on a still-running load; abandon it instead of joining.
        executor.shutdown(wait=False, cancel_futures=True)
|
157 |
+
|
158 |
+
# Gradio UI with better UX
# Two side-by-side text inputs (Dhivehi RTL / English), two tokenizer
# selectors (fixed custom list on the left, free-form HF path on the right),
# and two markdown panes showing each tokenizer's output.
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=2,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True,  # Thaana script is written right-to-left
            placeholder="Enter Dhivehi text here..."
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=2,
            value="The quick brown fox jumps over the lazy dog",
            placeholder="Enter English text here..."
        )

    with gr.Row():
        # Left dropdown: closed set of project tokenizers (keys of TOKENIZER_CUSTOM).
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended",
            info="Pre-trained Dhivehi tokenizers"
        )
        # Right dropdown: suggested HF paths, but any hub path may be typed in.
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True,
            info="Standard HuggingFace tokenizers"
        )

    compare_button = gr.Button("🔄 Compare Tokenizers", variant="primary", size="lg")

    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output", height=400)
        output_stock = gr.Markdown(label="Stock Tokenizer Output", height=400)

    # NOTE(review): this wires the *blocking* progress-reporting function,
    # not compare_tokenizers_async — the generator above is currently unused.
    # Gradio's Progress integration handles the status updates here.
    compare_button.click(
        compare_side_by_side_with_progress,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
        show_progress=True
    )


if __name__ == "__main__":
    demo.launch()
|