gsaltintas committed on
Commit
6b9ea0b
·
verified ·
1 Parent(s): b6070dc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -0
app.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import tiktoken
3
+ from transformers import AutoTokenizer
4
+ import os
5
+
6
# Short UI label -> Hugging Face Hub repository id passed to
# AutoTokenizer.from_pretrained().
# NOTE(review): the 'qwen3' label currently resolves to a Qwen2.5 repository;
# confirm this is the tokenizer intended for that label.
MODEL_MAP = {
    "llama-2": "meta-llama/Llama-2-7b-hf",
    "llama-3": "meta-llama/Meta-Llama-3-8B",
    "gemma-2": "google/gemma-2-2b",
    "qwen3": "Qwen/Qwen2.5-0.5B",
    "bert": "bert-base-uncased",
}
14
+
15
def tokenize_with_tiktoken(text, model):
    """Tokenize *text* with one of the tiktoken encodings.

    Args:
        text: The string to tokenize.
        model: ``'gpt-4'`` selects the ``cl100k_base`` encoding; any other
            value selects the ``gpt2`` encoding.

    Returns:
        A dict with the keys ``'model'`` (display name), ``'token_count'``,
        ``'tokens'`` (each token id decoded on its own) and ``'token_ids'``
        (a plain list of ints).
    """
    is_gpt4 = model == 'gpt-4'
    enc = tiktoken.get_encoding('cl100k_base' if is_gpt4 else 'gpt2')

    # disallowed_special=() makes text such as "<|endoftext|>" encode as
    # ordinary characters instead of raising ValueError on user input.
    token_ids = enc.encode(text, disallowed_special=())

    # Decode one id at a time so each entry shows exactly one token's text.
    token_texts = [enc.decode([token_id]) for token_id in token_ids]

    return {
        'model': 'GPT-4' if is_gpt4 else 'GPT-2',
        'token_count': len(token_ids),
        'tokens': token_texts,
        # encode() returns a Python list, which has no .tolist() method.
        'token_ids': list(token_ids),
    }
27
+
28
# Tokenizers already loaded in this process, keyed by Hub repository id.
_TOKENIZER_CACHE = {}


def _load_tokenizer(model_name):
    """Return the tokenizer for *model_name*, loading it only on first use."""
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.getenv('HF_TOKEN'))
        _TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer


def tokenize_with_hf(text, model):
    """Tokenize *text* with a Hugging Face tokenizer.

    Args:
        text: The string to tokenize.
        model: A key of ``MODEL_MAP``; unknown keys fall back to ``'gpt2'``.

    Returns:
        A dict with the keys ``'model'`` (the upper-cased *model*),
        ``'token_count'``, ``'tokens'`` and ``'token_ids'``. If loading or
        tokenizing fails, no exception is raised: the dict has a count of 0,
        an empty id list, and a single ``"Error: ..."`` entry in ``'tokens'``.
    """
    display_name = model.upper()

    try:
        # The tokenizer is cached because this function runs on every UI
        # change event, and reloading it each time is needlessly slow.
        tokenizer = _load_tokenizer(MODEL_MAP.get(model, 'gpt2'))
        token_ids = tokenizer.encode(text)
        token_texts = [
            tokenizer.decode([token_id], skip_special_tokens=False)
            for token_id in token_ids
        ]
    except Exception as e:
        # Deliberate best effort: surface the failure in the results panel
        # rather than breaking the whole comparison.
        return {
            'model': display_name,
            'token_count': 0,
            'tokens': [f"Error: {str(e)}"],
            'token_ids': [],
        }

    return {
        'model': display_name,
        'token_count': len(token_ids),
        'tokens': token_texts,
        'token_ids': token_ids,
    }
49
+
50
# How many tokens / token ids are shown per model before truncating.
_MAX_TOKENS_SHOWN = 20
_MAX_IDS_SHOWN = 10


def _format_result(result):
    """Render one tokenizer result dict as a Markdown section."""
    tokens = result['tokens']
    token_ids = result['token_ids']

    # Whitespace-only tokens are shown as "·" so they stay visible.
    tokens_display = ' | '.join(
        f'"{token}"' if token.strip() else '"·"'
        for token in tokens[:_MAX_TOKENS_SHOWN]
    )
    hidden = len(tokens) - _MAX_TOKENS_SHOWN
    if hidden > 0:
        tokens_display += f" ... (+{hidden} more)"

    ids_suffix = '...' if len(token_ids) > _MAX_IDS_SHOWN else ''

    return f"""
**{result['model']}**
- Token Count: **{result['token_count']}**
- Tokens: {tokens_display}
- Token IDs: {token_ids[:_MAX_IDS_SHOWN]}{ids_suffix}
"""


def compare_tokenizers(text, selected_models):
    """Tokenize *text* with every selected model and return a Markdown report.

    Args:
        text: The text to tokenize.
        selected_models: Iterable of model labels. ``'gpt-4'`` and ``'gpt-2'``
            are handled by ``tokenize_with_tiktoken``; every other label is
            passed to ``tokenize_with_hf``.

    Returns:
        One Markdown section per model, separated by horizontal rules, or a
        short prompt when the text is empty/whitespace or no model is selected.
    """
    if not text or not text.strip():
        return "Please enter some text to tokenize."
    if not selected_models:
        # Without this guard the join below yields "" and blanks the output.
        return "Please select at least one tokenizer to compare."

    sections = []
    for model in selected_models:
        if model in ('gpt-4', 'gpt-2'):
            result = tokenize_with_tiktoken(text, model)
        else:
            result = tokenize_with_hf(text, model)
        sections.append(_format_result(result))

    return "\n\n---\n".join(sections)
75
+
76
# Build the Gradio UI: a text box and a model picker feed compare_tokenizers,
# whose Markdown report is rendered below them.
with gr.Blocks(title="🔤 Tokenizer Comparison Tool", theme=gr.themes.Soft()) as demo:

    gr.Markdown("""
    # 🔤 Tokenizer Comparison Tool

    Compare how different LLM tokenizers split text into tokens. See the differences between GPT, LLaMA, Gemma, and other models.
    """)

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to tokenize",
                placeholder="Hello world! This is a test with some subwords and punctuation.",
                lines=4,
                value="Hello world! This is a test with some subwords and punctuation.",
            )

        with gr.Column(scale=1):
            model_selector = gr.CheckboxGroup(
                choices=['gpt-4', 'gpt-2', 'llama-2', 'llama-3', 'gemma-2', 'qwen3', 'bert'],
                value=['gpt-4', 'llama-3', 'gpt-2'],
                label="Select tokenizers to compare",
            )

    output = gr.Markdown(
        label="Tokenization Results",
        value="Enter text above to see tokenization results...",
    )

    # Re-run the comparison whenever the text or the model selection changes.
    for _component in (text_input, model_selector):
        _component.change(
            fn=compare_tokenizers,
            inputs=[text_input, model_selector],
            outputs=output,
        )

    gr.Markdown("""
    ### Legend:
    - **Token Count**: Number of tokens the model uses
    - **Tokens**: The actual text pieces (subwords)
    - **Token IDs**: Numerical IDs in the vocabulary
    - **"·"**: Represents spaces/whitespace

    ### Models:
    - **GPT-4/GPT-2**: OpenAI tokenizers (tiktoken)
    - **LLaMA**: Meta's models (SentencePiece)
    - **Gemma**: Google's models
    - **Qwen**: Alibaba's models
    - **BERT**: Google's BERT tokenizer
    """)

# Start the server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()