import gradio as gr
from transformers import AutoTokenizer, T5Tokenizer
from concurrent.futures import ThreadPoolExecutor
# Fixed list of custom tokenizers (left)
TOKENIZER_CUSTOM = {
"T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
"RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
"Google mT5": "google/mt5-base",
"Google mT5 Extended": "alakxender/mt5-dhivehi-tokenizer-extended",
"DeBERTa Extended": "alakxender/deberta-dhivehi-tokenizer-extended",
"XLM-RoBERTa Extended": "alakxender/xlmr-dhivehi-tokenizer-extended",
"Bert Extended": "alakxender/bert-dhivehi-tokenizer-extended",
"Bert Extended Fast": "alakxender/bert-fast-dhivehi-tokenizer-extended"
}
# Suggested stock model paths for the right input
SUGGESTED_STOCK_PATHS = [
"google/flan-t5-base",
"t5-small",
"t5-base",
"t5-large",
"google/mt5-base",
"microsoft/trocr-base-handwritten",
"microsoft/trocr-base-printed",
"microsoft/deberta-v3-base"
"xlm-roberta-base",
"naver-clova-ix/donut-base",
"bert-base-multilingual-cased"
]
# Cache for loaded tokenizers to avoid reloading
tokenizer_cache = {}
# Load tokenizer with fallback to slow T5
def load_tokenizer(tokenizer_path):
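    """Return a tokenizer for `tokenizer_path`, caching loaded instances.

    Falls back to the slow T5Tokenizer when AutoTokenizer fails, which
    can happen when no fast-tokenizer conversion is available."""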
if tokenizer_path in tokenizer_cache:
return tokenizer_cache[tokenizer_path]
try:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenizer_cache[tokenizer_path] = tokenizer
return tokenizer
    except Exception:
        # "mt5" paths also contain "t5", so a single check covers T5 and mT5;
        # fall back to the slow SentencePiece-based tokenizer for those.
        if "t5" in tokenizer_path.lower():
            tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
            tokenizer_cache[tokenizer_path] = tokenizer
            return tokenizer
        raise
# Tokenize and decode with enhanced visualization
def tokenize_display(text, tokenizer_path):
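    """Tokenize `text` with the tokenizer at `tokenizer_path`.

    Returns (tokens, ids, decoded); on failure the token list carries an
    error marker so the UI can show the problem instead of crashing."""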
try:
tokenizer = load_tokenizer(tokenizer_path)
        encoding = tokenizer(text, add_special_tokens=True)
tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
ids = encoding.input_ids
decoded = tokenizer.decode(ids, skip_special_tokens=False)
return tokens, ids, decoded
except Exception as e:
return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
def create_token_visualization(tokens, ids):
"""Create a visual representation of tokens with colors and spacing"""
if not tokens or not ids:
return "❌ No tokens to display"
# Create colored token blocks
token_blocks = []
colors = ["🟦", "🟩", "🟨", "🟪", "🟧", "🟫"]
for i, (token, token_id) in enumerate(zip(tokens, ids)):
color = colors[i % len(colors)]
# Clean token display (remove special characters for better readability)
clean_token = token.replace('▁', '_').replace('</s>', '[END]').replace('<s>', '[START]')
token_blocks.append(f"{color} `{clean_token}` ({token_id})")
return " ".join(token_blocks)
# Async comparison with progress updates
def compare_side_by_side_with_progress(dv_text, en_text, custom_label, stock_path, progress=gr.Progress()):
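    """Render side-by-side markdown for the selected custom tokenizer and
    the stock tokenizer path, reporting load status via gr.Progress."""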
def format_block(title, tokenizer_path):
dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
return f"""\
## 🔤 {title}
### 🈁 Dhivehi: `{dv_text}`
**🎯 Tokens:** {len(dv_tokens) if dv_ids else 'N/A'} tokens
{create_token_visualization(dv_tokens, dv_ids)}
**🔢 Token IDs:** `{dv_ids if dv_ids else '[ERROR]'}`
**🔄 Decoded:** `{dv_decoded}`
---
### 🇬🇧 English: `{en_text}`
**🎯 Tokens:** {len(en_tokens) if en_ids else 'N/A'} tokens
{create_token_visualization(en_tokens, en_ids)}
**🔢 Token IDs:** `{en_ids if en_ids else '[ERROR]'}`
**🔄 Decoded:** `{en_decoded}`
---
"""
try:
custom_path = TOKENIZER_CUSTOM[custom_label]
except KeyError:
return "[ERROR] Invalid custom tokenizer selected", ""
# Show loading progress
progress(0.1, desc="Loading custom tokenizer...")
# Load custom tokenizer
try:
custom_result = format_block("Custom Tokenizer", custom_path)
progress(0.5, desc="Custom tokenizer loaded. Loading stock tokenizer...")
except Exception as e:
custom_result = f"[ERROR] Failed to load custom tokenizer: {str(e)}"
progress(0.5, desc="Custom tokenizer failed. Loading stock tokenizer...")
# Load stock tokenizer
try:
stock_result = format_block("Stock Tokenizer", stock_path)
progress(1.0, desc="Complete!")
except Exception as e:
stock_result = f"[ERROR] Failed to load stock tokenizer: {str(e)}"
progress(1.0, desc="Complete with errors!")
return custom_result, stock_result
# Non-blocking comparison function
def compare_tokenizers_async(dv_text, en_text, custom_label, stock_path):
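    """Generator variant of the comparison: yields a loading message
    immediately, then the final results once the worker thread finishes.
    Not wired to the UI by default; see the sketch after this function."""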
# Return immediate loading message
    loading_msg = """
## ⏳ Loading Tokenizers...

🚀 **Status:** Downloading and initializing tokenizers...

*This may take a moment for first-time downloads*
"""
# Use ThreadPoolExecutor for non-blocking execution
with ThreadPoolExecutor(max_workers=2) as executor:
future = executor.submit(compare_side_by_side_with_progress, dv_text, en_text, custom_label, stock_path)
# Return loading state first
yield loading_msg, loading_msg
# Then return actual results
try:
custom_result, stock_result = future.result(timeout=120) # 2 minute timeout
yield custom_result, stock_result
except Exception as e:
error_msg = f"## ❌ Error\n\n**Failed to load tokenizers:** {str(e)}"
yield error_msg, error_msg
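# Alternative wiring (sketch, not enabled): stream the loading message by
# pointing the click handler at the generator above. Generator handlers
# need queuing enabled (demo.queue() below already covers that):
#   compare_button.click(compare_tokenizers_async,
#                        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
#                        outputs=[output_custom, output_stock])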
# Gradio UI with better UX
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:
gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")
with gr.Row():
dhivehi_text = gr.Textbox(
label="Dhivehi Text",
lines=2,
value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
rtl=True,
placeholder="Enter Dhivehi text here..."
)
english_text = gr.Textbox(
label="English Text",
lines=2,
value="The quick brown fox jumps over the lazy dog",
placeholder="Enter English text here..."
)
with gr.Row():
tokenizer_a = gr.Dropdown(
label="Select Custom Tokenizer",
choices=list(TOKENIZER_CUSTOM.keys()),
value="T5 Extended",
info="Pre-trained Dhivehi tokenizers (or paste a path)"
)
tokenizer_b = gr.Dropdown(
label="Enter or Select Stock Tokenizer Path",
choices=SUGGESTED_STOCK_PATHS,
value="google/flan-t5-base",
allow_custom_value=True,
info="Standard HuggingFace tokenizers (or paste a path)"
)
compare_button = gr.Button("🔄 Compare Tokenizers", variant="primary", size="lg")
with gr.Row():
output_custom = gr.Markdown(label="Custom Tokenizer Output", height=400)
output_stock = gr.Markdown(label="Stock Tokenizer Output", height=400)
    # The click handler calls the progress-reporting comparison directly;
    # gr.Progress shows in-place status updates while tokenizers download.
compare_button.click(
compare_side_by_side_with_progress,
inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
outputs=[output_custom, output_stock],
show_progress=True
)
if __name__ == "__main__":
    demo.queue()  # enable the request queue so gr.Progress updates can stream
    demo.launch()