import gradio as gr
from transformers import AutoTokenizer, T5Tokenizer
from concurrent.futures import ThreadPoolExecutor

# Fixed list of custom tokenizers (left)
TOKENIZER_CUSTOM = {
    "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
    "Google mT5": "google/mt5-base",
    "Google mT5 Extended": "alakxender/mt5-dhivehi-tokenizer-extended",
    "DeBERTa Extended": "alakxender/deberta-dhivehi-tokenizer-extended",
    "XLM-RoBERTa Extended": "alakxender/xlmr-dhivehi-tokenizer-extended",
    "Bert Extended": "alakxender/bert-dhivehi-tokenizer-extended",
    "Bert Extended Fast": "alakxender/bert-fast-dhivehi-tokenizer-extended"
}

# Suggested stock model paths for the right input
SUGGESTED_STOCK_PATHS = [
    "google/flan-t5-base",
    "t5-small",
    "t5-base",
    "t5-large",
    "google/mt5-base",
    "microsoft/trocr-base-handwritten",
    "microsoft/trocr-base-printed",
    "microsoft/deberta-v3-base"
    "xlm-roberta-base",
    "naver-clova-ix/donut-base",
    "bert-base-multilingual-cased"
]

# Cache for loaded tokenizers to avoid reloading
tokenizer_cache = {}
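# Note: a plain dict is sufficient here; Gradio may call handlers from worker
# threads, and a race on a first load at worst repeats a download (harmless).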

# Load tokenizer with fallback to slow T5
def load_tokenizer(tokenizer_path):
    if tokenizer_path in tokenizer_cache:
        return tokenizer_cache[tokenizer_path]
    
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        tokenizer_cache[tokenizer_path] = tokenizer
        return tokenizer
    except Exception:
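        # Some T5/mT5 repos ship only a SentencePiece model file, which the
        # fast AutoTokenizer path can fail to load; retry with the slow class.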
        if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
            tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
            tokenizer_cache[tokenizer_path] = tokenizer
            return tokenizer
        raise

# Tokenize and decode with enhanced visualization
def tokenize_display(text, tokenizer_path):
    try:
        tokenizer = load_tokenizer(tokenizer_path)
        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
        ids = encoding.input_ids
        decoded = tokenizer.decode(ids, skip_special_tokens=False)
        return tokens, ids, decoded
    except Exception as e:
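        # Surface the failure in-band: callers check for the empty ids list
        # and render the bracketed error message instead of token output.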
        return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"

def create_token_visualization(tokens, ids):
    """Create a visual representation of tokens with colors and spacing"""
    if not tokens or not ids:
        return "❌ No tokens to display"
    
    # Create colored token blocks
    token_blocks = []
    colors = ["🟦", "🟩", "🟨", "🟪", "🟧", "🟫"]
    
    for i, (token, token_id) in enumerate(zip(tokens, ids)):
        color = colors[i % len(colors)]
        # Clean token display (remove special characters for better readability)
        clean_token = token.replace('▁', '_').replace('</s>', '[END]').replace('<s>', '[START]')
        token_blocks.append(f"{color} `{clean_token}` ({token_id})")
    
    return " ".join(token_blocks)

# Async comparison with progress updates
def compare_side_by_side_with_progress(dv_text, en_text, custom_label, stock_path, progress=gr.Progress()):
    def format_block(title, tokenizer_path):
        dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
        en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)

        return f"""\
## 🔤 {title}

### 🈁 Dhivehi: `{dv_text}`

**🎯 Tokens:** {len(dv_tokens) if dv_ids else 'N/A'} tokens  
{create_token_visualization(dv_tokens, dv_ids)}

**🔢 Token IDs:** `{dv_ids if dv_ids else '[ERROR]'}`  
**🔄 Decoded:** `{dv_decoded}`

---

### 🇬🇧 English: `{en_text}`

**🎯 Tokens:** {len(en_tokens) if en_ids else 'N/A'} tokens  
{create_token_visualization(en_tokens, en_ids)}

**🔢 Token IDs:** `{en_ids if en_ids else '[ERROR]'}`  
**🔄 Decoded:** `{en_decoded}`

---
"""

    try:
        custom_path = TOKENIZER_CUSTOM[custom_label]
    except KeyError:
        return "[ERROR] Invalid custom tokenizer selected", ""

    # Show loading progress
    progress(0.1, desc="Loading custom tokenizer...")
    
    # Load custom tokenizer
    try:
        custom_result = format_block("Custom Tokenizer", custom_path)
        progress(0.5, desc="Custom tokenizer loaded. Loading stock tokenizer...")
    except Exception as e:
        custom_result = f"[ERROR] Failed to load custom tokenizer: {str(e)}"
        progress(0.5, desc="Custom tokenizer failed. Loading stock tokenizer...")
    
    # Load stock tokenizer
    try:
        stock_result = format_block("Stock Tokenizer", stock_path)
        progress(1.0, desc="Complete!")
    except Exception as e:
        stock_result = f"[ERROR] Failed to load stock tokenizer: {str(e)}"
        progress(1.0, desc="Complete with errors!")

    return custom_result, stock_result

# Alternative non-blocking variant: a generator that streams a loading message
# first, then the final results (not wired to the button below)
def compare_tokenizers_async(dv_text, en_text, custom_label, stock_path):
    # Return immediate loading message
    loading_msg = """
## ⏳ Loading Tokenizer...

🚀 **Status:** Downloading and initializing tokenizer...

*This may take a moment for first-time downloads*
"""
    
    # Use ThreadPoolExecutor for non-blocking execution
    with ThreadPoolExecutor(max_workers=2) as executor:
        future = executor.submit(compare_side_by_side_with_progress, dv_text, en_text, custom_label, stock_path)
        
        # Return loading state first
        yield loading_msg, loading_msg
        
        # Then return actual results
        try:
            custom_result, stock_result = future.result(timeout=120)  # 2 minute timeout
            yield custom_result, stock_result
        except Exception as e:
            error_msg = f"## ❌ Error\n\n**Failed to load tokenizers:** {str(e)}"
            yield error_msg, error_msg
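# To stream the loading message, the click handler below could target this
# generator instead; Gradio runs generator handlers as streaming updates.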

# Gradio UI with better UX
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=2,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True,
            placeholder="Enter Dhivehi text here..."
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=2,
            value="The quick brown fox jumps over the lazy dog",
            placeholder="Enter English text here..."
        )

    with gr.Row():
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended",
            info="Pre-trained Dhivehi tokenizers (or paste a path)"
        )
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True,
            info="Standard HuggingFace tokenizers (or paste a path)"
        )

    compare_button = gr.Button("🔄 Compare Tokenizers", variant="primary", size="lg")

    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output", height=400)
        output_stock = gr.Markdown(label="Stock Tokenizer Output", height=400)

    # Run the comparison directly; its gr.Progress parameter drives the progress bar
    compare_button.click(
        compare_side_by_side_with_progress,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
        show_progress=True
    )



if __name__ == "__main__":
    demo.launch()