import gradio as gr
from transformers import AutoTokenizer, T5Tokenizer
from concurrent.futures import ThreadPoolExecutor

# Fixed list of custom tokenizers (left)
TOKENIZER_CUSTOM = {
    "T5 Extended": "alakxender/dhivehi-T5-tokenizer-extended",
    "RoBERTa Extended": "alakxender/dhivehi-roberta-tokenizer-extended",
    "Google mT5": "google/mt5-base",
    "Google mT5 Extended": "alakxender/mt5-dhivehi-tokenizer-extended",
    "DeBERTa Extended": "alakxender/deberta-dhivehi-tokenizer-extended",
    "XLM-RoBERTa Extended": "alakxender/xlmr-dhivehi-tokenizer-extended",
    "Bert Extended": "alakxender/bert-dhivehi-tokenizer-extended",
    "Bert Extended Fast": "alakxender/bert-fast-dhivehi-tokenizer-extended"
}

# Suggested stock model paths for the right input
SUGGESTED_STOCK_PATHS = [
    "google/flan-t5-base",
    "t5-small",
    "t5-base",
    "t5-large",
    "google/mt5-base",
    "microsoft/trocr-base-handwritten",
    "microsoft/trocr-base-printed",
    "microsoft/deberta-v3-base"
    "xlm-roberta-base",
    "naver-clova-ix/donut-base",
    "bert-base-multilingual-cased"
]

# Cache for loaded tokenizers to avoid reloading
tokenizer_cache = {}
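# Note: a plain dict is sufficient here; Gradio may call handlers from worker
# threads, and a race on a first load at worst repeats a download (harmless).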

# Load tokenizer with fallback to slow T5
def load_tokenizer(tokenizer_path):
    if tokenizer_path in tokenizer_cache:
        return tokenizer_cache[tokenizer_path]
    
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        tokenizer_cache[tokenizer_path] = tokenizer
        return tokenizer
    except Exception:
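        # Some T5/mT5 repos ship only a SentencePiece model file, which the
        # fast AutoTokenizer path can fail to load; retry with the slow class.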
        if "t5" in tokenizer_path.lower() or "mt5" in tokenizer_path.lower():
            tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
            tokenizer_cache[tokenizer_path] = tokenizer
            return tokenizer
        raise

# Tokenize and decode with enhanced visualization
def tokenize_display(text, tokenizer_path):
    try:
        tokenizer = load_tokenizer(tokenizer_path)
        encoding = tokenizer(text, return_offsets_mapping=False, add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(encoding.input_ids)
        ids = encoding.input_ids
        decoded = tokenizer.decode(ids, skip_special_tokens=False)
        return tokens, ids, decoded
    except Exception as e:
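        # Surface the failure in-band: callers check for the empty ids list
        # and render the bracketed error message instead of token output.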
        return [f"[ERROR] {str(e)}"], [], "[Tokenizer Error]"

def create_token_visualization(tokens, ids):
    """Create a visual representation of tokens with colors and spacing"""
    if not tokens or not ids:
        return "❌ No tokens to display"
    
    # Create colored token blocks
    token_blocks = []
    colors = ["🟦", "🟩", "🟨", "🟪", "🟧", "🟫"]
    
    for i, (token, token_id) in enumerate(zip(tokens, ids)):
        color = colors[i % len(colors)]
        # Clean token display (remove special characters for better readability)
        clean_token = token.replace('▁', '_').replace('</s>', '[END]').replace('<s>', '[START]')
        token_blocks.append(f"{color} `{clean_token}` ({token_id})")
    
    return " ".join(token_blocks)

# Async comparison with progress updates
def compare_side_by_side_with_progress(dv_text, en_text, custom_label, stock_path, progress=gr.Progress()):
    def format_block(title, tokenizer_path):
        dv_tokens, dv_ids, dv_decoded = tokenize_display(dv_text, tokenizer_path)
        en_tokens, en_ids, en_decoded = tokenize_display(en_text, tokenizer_path)

        return f"""\
## 🔤 {title}

### 🈁 Dhivehi: `{dv_text}`

**🎯 Tokens:** {len(dv_tokens) if dv_ids else 'N/A'} tokens  
{create_token_visualization(dv_tokens, dv_ids)}

**🔢 Token IDs:** `{dv_ids if dv_ids else '[ERROR]'}`  
**🔄 Decoded:** `{dv_decoded}`

---

### 🇬🇧 English: `{en_text}`

**🎯 Tokens:** {len(en_tokens) if en_ids else 'N/A'} tokens  
{create_token_visualization(en_tokens, en_ids)}

**🔢 Token IDs:** `{en_ids if en_ids else '[ERROR]'}`  
**🔄 Decoded:** `{en_decoded}`

---
"""

    try:
        custom_path = TOKENIZER_CUSTOM[custom_label]
    except KeyError:
        return "[ERROR] Invalid custom tokenizer selected", ""

    # Show loading progress
    progress(0.1, desc="Loading custom tokenizer...")
    
    # Load custom tokenizer
    try:
        custom_result = format_block("Custom Tokenizer", custom_path)
        progress(0.5, desc="Custom tokenizer loaded. Loading stock tokenizer...")
    except Exception as e:
        custom_result = f"[ERROR] Failed to load custom tokenizer: {str(e)}"
        progress(0.5, desc="Custom tokenizer failed. Loading stock tokenizer...")
    
    # Load stock tokenizer
    try:
        stock_result = format_block("Stock Tokenizer", stock_path)
        progress(1.0, desc="Complete!")
    except Exception as e:
        stock_result = f"[ERROR] Failed to load stock tokenizer: {str(e)}"
        progress(1.0, desc="Complete with errors!")

    return custom_result, stock_result

# Alternative non-blocking variant: a generator that streams a loading message
# first, then the final results (not wired to the button below)
def compare_tokenizers_async(dv_text, en_text, custom_label, stock_path):
    # Return immediate loading message
    loading_msg = """
## ⏳ Loading Tokenizer...

🚀 **Status:** Downloading and initializing tokenizer...

*This may take a moment for first-time downloads*
"""
    
    # Use ThreadPoolExecutor for non-blocking execution
    with ThreadPoolExecutor(max_workers=2) as executor:
        future = executor.submit(compare_side_by_side_with_progress, dv_text, en_text, custom_label, stock_path)
        
        # Return loading state first
        yield loading_msg, loading_msg
        
        # Then return actual results
        try:
            custom_result, stock_result = future.result(timeout=120)  # 2 minute timeout
            yield custom_result, stock_result
        except Exception as e:
            error_msg = f"## ❌ Error\n\n**Failed to load tokenizers:** {str(e)}"
            yield error_msg, error_msg
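# To stream the loading message, the click handler below could target this
# generator instead; Gradio runs generator handlers as streaming updates.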

# Gradio UI with better UX
with gr.Blocks(title="Dhivehi Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🧠 Dhivehi Tokenizer Comparison")
    gr.Markdown("Compare how different tokenizers process Dhivehi and English input text.")

    with gr.Row():
        dhivehi_text = gr.Textbox(
            label="Dhivehi Text",
            lines=2,
            value="އީދުގެ ހަރަކާތްތައް ފެށުމަށް މިރޭ ހުޅުމާލޭގައި އީދު މަޅި ރޯކުރަނީ",
            rtl=True,
            placeholder="Enter Dhivehi text here..."
        )
        english_text = gr.Textbox(
            label="English Text",
            lines=2,
            value="The quick brown fox jumps over the lazy dog",
            placeholder="Enter English text here..."
        )

    with gr.Row():
        tokenizer_a = gr.Dropdown(
            label="Select Custom Tokenizer",
            choices=list(TOKENIZER_CUSTOM.keys()),
            value="T5 Extended",
            info="Pre-trained Dhivehi tokenizers (or paste a path)"
        )
        tokenizer_b = gr.Dropdown(
            label="Enter or Select Stock Tokenizer Path",
            choices=SUGGESTED_STOCK_PATHS,
            value="google/flan-t5-base",
            allow_custom_value=True,
            info="Standard HuggingFace tokenizers (or paste a path)"
        )

    compare_button = gr.Button("🔄 Compare Tokenizers", variant="primary", size="lg")

    with gr.Row():
        output_custom = gr.Markdown(label="Custom Tokenizer Output", height=400)
        output_stock = gr.Markdown(label="Stock Tokenizer Output", height=400)

    # Run the comparison directly; its gr.Progress parameter drives the progress bar
    compare_button.click(
        compare_side_by_side_with_progress,
        inputs=[dhivehi_text, english_text, tokenizer_a, tokenizer_b],
        outputs=[output_custom, output_stock],
        show_progress=True
    )



if __name__ == "__main__":
    demo.launch()