Spaces:
Running
Running
Commit
·
2af4cfb
1
Parent(s):
fee5e46
app.py
CHANGED
@@ -1,5 +1,9 @@
|
|
1 |
import gradio as gr
|
2 |
from transformers import AutoTokenizer, T5Tokenizer
|
|
|
|
|
|
|
|
|
3 |
|
4 |
# Fixed list of custom tokenizers (left)
|
5 |
TOKENIZER_CUSTOM = {
|
@@ -21,16 +25,26 @@ SUGGESTED_STOCK_PATHS = [
|
|
21 |
"microsoft/deberta-v3-base"
|
22 |
]
|
23 |
|
|
|
|
|
|
|
24 |
# Load tokenizer with fallback to slow T5
|
25 |
def load_tokenizer(tokenizer_path):
|
|
|
|
|
|
|
26 |
try:
|
27 |
-
|
|
|
|
|
28 |
except Exception:
|
29 |
if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
|
30 |
-
|
|
|
|
|
31 |
raise
|
32 |
|
33 |
-
# Tokenize and decode with
|
34 |
def tokenize_display(text, tokenizer_path):
|
35 |
try:
|
36 |
tokenizer = load_tokenizer(tokenizer_path)
|
@@ -42,36 +56,51 @@ def tokenize_display(text, tokenizer_path):
|
|
42 |
except Exception as e:
|
43 |
return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
|
44 |
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
def format_block(title, tokenizer_path):
|
48 |
dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
|
49 |
en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
|
50 |
|
51 |
return f"""\
|
52 |
-
|
53 |
|
54 |
-
|
55 |
-
`{dv_text}`
|
56 |
|
57 |
-
|
58 |
-
{
|
59 |
|
60 |
-
|
61 |
-
|
62 |
-
**Decoded:** `{dv_decoded}`
|
63 |
|
64 |
---
|
65 |
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
|
69 |
-
|
70 |
-
{
|
71 |
|
72 |
-
|
73 |
-
**IDs:** {en_ids or '[ERROR]'}
|
74 |
-
**Decoded:** `{en_decoded}`
|
75 |
"""
|
76 |
|
77 |
try:
|
@@ -79,52 +108,103 @@ def compare_side_by_side(dv_text, en_text, custom_label, stock_path):
|
|
79 |
except KeyError:
|
80 |
return "[ERROR] Invalid custom tokenizer selected", ""
|
81 |
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
|
87 |
-
|
88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
89 |
gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
|
90 |
gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")
|
91 |
|
92 |
with gr.Row():
|
93 |
dhivehi_text = gr.Textbox(
|
94 |
label="Dhivehi Text",
|
95 |
-
lines=
|
96 |
value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
|
97 |
-
rtl=True
|
|
|
98 |
)
|
99 |
english_text = gr.Textbox(
|
100 |
label="English Text",
|
101 |
-
lines=
|
102 |
-
value="The quick brown fox jumps over the lazy dog"
|
|
|
103 |
)
|
104 |
|
105 |
with gr.Row():
|
106 |
tokenizer_a = gr.Dropdown(
|
107 |
label="Select Custom Tokenizer",
|
108 |
choices=list(TOKENIZER_CUSTOM.keys()),
|
109 |
-
value="T5 Extended"
|
|
|
110 |
)
|
111 |
tokenizer_b = gr.Dropdown(
|
112 |
label="Enter or Select Stock Tokenizer Path",
|
113 |
choices=SUGGESTED_STOCK_PATHS,
|
114 |
value="google/flan-t5-base",
|
115 |
-
allow_custom_value=True
|
|
|
116 |
)
|
117 |
|
118 |
-
compare_button = gr.Button("Compare Tokenizers")
|
119 |
|
120 |
with gr.Row():
|
121 |
-
output_custom = gr.Markdown(label="Custom Tokenizer Output")
|
122 |
-
output_stock = gr.Markdown(label="Stock Tokenizer Output")
|
123 |
|
|
|
124 |
compare_button.click(
|
125 |
-
|
126 |
inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
|
127 |
-
outputs=[output_custom, output_stock]
|
|
|
128 |
)
|
129 |
|
130 |
-
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
from transformers import AutoTokenizer, T5Tokenizer
|
3 |
+
import asyncio
|
4 |
+
import threading
|
5 |
+
from concurrent.futures import ThreadPoolExecutor
|
6 |
+
import time
|
7 |
|
8 |
# Fixed list of custom tokenizers (left)
|
9 |
TOKENIZER_CUSTOM = {
|
|
|
25 |
"microsoft/deberta-v3-base"
|
26 |
]
|
27 |
|
28 |
+
# Cache of already-constructed tokenizers, keyed by model path, so repeated
# comparisons don't re-download / re-initialize the same tokenizer.
tokenizer_cache = {}


def load_tokenizer(tokenizer_path):
    """Return a tokenizer for *tokenizer_path*, reusing cached instances.

    The fast ``AutoTokenizer`` is tried first. If that fails and the path
    looks like a T5/mT5 model, fall back to the slow sentencepiece-based
    ``T5Tokenizer``; any other failure is re-raised to the caller.
    Successful loads (either path) are stored in ``tokenizer_cache``.
    """
    cached = tokenizer_cache.get(tokenizer_path)
    if cached is not None:
        return cached

    try:
        loaded = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
    except Exception:
        lowered = tokenizer_path.lower()
        if "t5" not in lowered and "mt5" not in lowered:
            raise  # not a T5 variant — propagate the original error
        loaded = T5Tokenizer.from_pretrained(tokenizer_path)

    tokenizer_cache[tokenizer_path] = loaded
    return loaded
|
46 |
|
47 |
+
# Tokenize and decode with enhanced visualization
|
48 |
def tokenize_display(text, tokenizer_path):
|
49 |
try:
|
50 |
tokenizer = load_tokenizer(tokenizer_path)
|
|
|
56 |
except Exception as e:
|
57 |
return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"
|
58 |
|
59 |
+
def create_token_visualization(tokens, ids):
    """Create a visual representation of tokens with colors and spacing"""
    if not tokens or not ids:
        return "❌ No tokens to display"

    # Palette is cycled so long sequences repeat the same six colors.
    palette = ["🟦", "🟩", "🟨", "🟪", "🟧", "🟫"]

    def render(index, token, token_id):
        # Normalize sentencepiece / special markers for readability.
        label = (
            token.replace('▁', '_')
                 .replace('</s>', '[END]')
                 .replace('<s>', '[START]')
        )
        return f"{palette[index % len(palette)]} `{label}` ({token_id})"

    return " ".join(
        render(i, tok, tid) for i, (tok, tid) in enumerate(zip(tokens, ids))
    )
|
75 |
+
|
76 |
+
# Async comparison with progress updates
|
77 |
+
def compare_side_by_side_with_progress(dv_text, en_text, custom_label, stock_path, progress=gr.Progress()):
|
78 |
def format_block(title, tokenizer_path):
|
79 |
dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
|
80 |
en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)
|
81 |
|
82 |
return f"""\
|
83 |
+
## 🔤 {title}
|
84 |
|
85 |
+
### 🈁 Dhivehi: `{dv_text}`
|
|
|
86 |
|
87 |
+
**🎯 Tokens:** {len(dv_tokens) if dv_ids else 'N/A'} tokens
|
88 |
+
{create_token_visualization(dv_tokens, dv_ids)}
|
89 |
|
90 |
+
**🔢 Token IDs:** `{dv_ids if dv_ids else '[ERROR]'}`
|
91 |
+
**🔄 Decoded:** `{dv_decoded}`
|
|
|
92 |
|
93 |
---
|
94 |
|
95 |
+
### 🇬🇧 English: `{en_text}`
|
96 |
+
|
97 |
+
**🎯 Tokens:** {len(en_tokens) if en_ids else 'N/A'} tokens
|
98 |
+
{create_token_visualization(en_tokens, en_ids)}
|
99 |
|
100 |
+
**🔢 Token IDs:** `{en_ids if en_ids else '[ERROR]'}`
|
101 |
+
**🔄 Decoded:** `{en_decoded}`
|
102 |
|
103 |
+
---
|
|
|
|
|
104 |
"""
|
105 |
|
106 |
try:
|
|
|
108 |
except KeyError:
|
109 |
return "[ERROR] Invalid custom tokenizer selected", ""
|
110 |
|
111 |
+
# Show loading progress
|
112 |
+
progress(0.1, desc="Loading custom tokenizer...")
|
113 |
+
|
114 |
+
# Load custom tokenizer
|
115 |
+
try:
|
116 |
+
custom_result = format_block("Custom Tokenizer", custom_path)
|
117 |
+
progress(0.5, desc="Custom tokenizer loaded. Loading stock tokenizer...")
|
118 |
+
except Exception as e:
|
119 |
+
custom_result = f"[ERROR] Failed to load custom tokenizer: {str(e)}"
|
120 |
+
progress(0.5, desc="Custom tokenizer failed. Loading stock tokenizer...")
|
121 |
+
|
122 |
+
# Load stock tokenizer
|
123 |
+
try:
|
124 |
+
stock_result = format_block("Stock Tokenizer", stock_path)
|
125 |
+
progress(1.0, desc="Complete!")
|
126 |
+
except Exception as e:
|
127 |
+
stock_result = f"[ERROR] Failed to load stock tokenizer: {str(e)}"
|
128 |
+
progress(1.0, desc="Complete with errors!")
|
129 |
+
|
130 |
+
return custom_result, stock_result
|
131 |
+
|
132 |
+
# Non-blocking comparison function
def compare_tokenizers_async(dv_text, en_text, custom_label, stock_path):
    """Yield a loading placeholder, then the real comparison results.

    Generator intended for Gradio streaming output: the first yield shows an
    immediate "loading" message in both output panes; the second replaces it
    with the actual comparison markdown (or an error message on failure).

    Bug fixed: the original wrapped the executor in a ``with`` block, whose
    ``__exit__`` waits for the worker thread to finish — so after the 120 s
    ``future.result`` timeout fired, the generator still blocked until the
    tokenizer load completed. The executor is now shut down without waiting
    (and pending work cancelled) so the timeout actually takes effect.
    """
    loading_msg = """
## ⏳ Loading Tokenizer...

🚀 **Status:** Downloading and initializing tokenizer...

*This may take a moment for first-time downloads*
"""

    executor = ThreadPoolExecutor(max_workers=2)
    try:
        future = executor.submit(
            compare_side_by_side_with_progress,
            dv_text, en_text, custom_label, stock_path,
        )

        # Return loading state first
        yield loading_msg, loading_msg

        # Then return actual results
        try:
            custom_result, stock_result = future.result(timeout=120)  # 2 minute timeout
            yield custom_result, stock_result
        except Exception as e:
            error_msg = f"## ❌ Error\n\n**Failed to load tokenizers:** {str(e)}"
            yield error_msg, error_msg
    finally:
        # Don't block on a still-running load; abandon it instead of joining.
        executor.shutdown(wait=False, cancel_futures=True)
|
157 |
+
|
158 |
+
# Gradio UI with better UX
# Two side-by-side text inputs (Dhivehi RTL / English), two tokenizer
# selectors (fixed custom list on the left, free-form HF path on the right),
# and two markdown panes showing each tokenizer's output.
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=2,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True,  # Thaana script is written right-to-left
            placeholder="Enter Dhivehi text here..."
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=2,
            value="The quick brown fox jumps over the lazy dog",
            placeholder="Enter English text here..."
        )

    with gr.Row():
        # Left dropdown: closed set of project tokenizers (keys of TOKENIZER_CUSTOM).
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended",
            info="Pre-trained Dhivehi tokenizers"
        )
        # Right dropdown: suggested HF paths, but any hub path may be typed in.
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True,
            info="Standard HuggingFace tokenizers"
        )

    compare_button = gr.Button("🔄 Compare Tokenizers", variant="primary", size="lg")

    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output", height=400)
        output_stock = gr.Markdown(label="Stock Tokenizer Output", height=400)

    # NOTE(review): this wires the *blocking* progress-reporting function,
    # not compare_tokenizers_async — the generator above is currently unused.
    # Gradio's Progress integration handles the status updates here.
    compare_button.click(
        compare_side_by_side_with_progress,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
        show_progress=True
    )


if __name__ == "__main__":
    demo.launch()
|