Gül Sena Altıntaş committed · Commit d9779a0 · Parent(s): f58b113

Added support for showing newlines

- TODO: add toggle button to include newlines in the tokenization
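For the toggle in the TODO, one possible wiring in Gradio is sketched below. This is only a sketch: the component and function names (`show_newlines`, `render_tokens`, `token_display`) are illustrative placeholders, not identifiers from app.py, and the render function is a stub where the real app would run its tokenizers.

```python
# Hypothetical sketch of the TODO: a checkbox that re-renders the output
# with or without newline markers. Names are illustrative, not from app.py.
import gradio as gr

def render_tokens(text: str, show_newlines: bool) -> str:
    # Placeholder: the real app would tokenize and build token spans here.
    return text.replace("\n", "<br>" if show_newlines else " ")

with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Text", lines=4)
    show_newlines = gr.Checkbox(label="Show newlines", value=True)
    token_display = gr.HTML()

    # Re-render whenever the text or the toggle changes.
    text_input.change(render_tokens, [text_input, show_newlines], token_display)
    show_newlines.change(render_tokens, [text_input, show_newlines], token_display)

# demo.launch()
```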
README.md CHANGED

@@ -11,3 +11,6 @@ license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+- [x] next up i want to add some sample texts that are interesting
+- [x] normalization of the tokenization
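The "normalization" item is what the commented-out `backend_tokenizer.normalizer` call in utils.py below refers to. A small sketch of how a fast tokenizer's normalizer can be inspected, using `bert-base-uncased` purely as an example model (some tokenizers have no normalizer, in which case the attribute is None):

```python
from transformers import AutoTokenizer

# Example model only; any "fast" tokenizer exposes backend_tokenizer.normalizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Shows the text after normalization but before pre-tokenization/merging.
print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
# -> "hello how are u?" for an uncased BERT-style normalizer
```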
app.py CHANGED

@@ -228,6 +228,9 @@ def generate_interactive_tokenization(results):
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
             display_text = token_text if token_text.strip() else "·"
+            if token_text == "<newline>":
+                html_parts.append("<br>")
+                continue
 
             # Determine token class
             token_class = f"token token-{token['type']}"
@@ -243,13 +246,17 @@ def generate_interactive_tokenization(results):
                 token_text.replace("\\", "\\\\")
                 .replace("'", "\\'")
                 .replace('"', '\\"')
-                .replace("\n", "\\n")
                 .replace("\r", "\\r")
+                .replace("\n", "\\n")
             )
 
-            escaped_display =
+            escaped_display = (
+                display_text.replace('"', "&quot;")
+                .replace("'", "&#39;")
+                .replace("\r", "\n")
+            )
 
-            # Use inline event handlers that
+            # Use inline event handlers that work in Gradio
             html_parts.append(f"""<span class="{token_class}"
                 id="{token_id}"
                 data-text="{token_text.replace('"', '&quot;').replace("'", '&#39;')}"
@@ -312,11 +319,6 @@ def generate_token_ids_display(results):
             f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
         )
 
-        # Show ID ranges
-        id_values = [token["id"] for token in result["tokens"]]
-        if id_values:
-            output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")
-
     return "\n".join(output)
 
 
@@ -663,7 +665,6 @@ with gr.Blocks(
             norm_eff, norm_html, norm_ids = generate_basic_comparison(
                 normalized_results
             )
-            print(normalized_text)
 
             # Combine or show separately
             combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
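Taken together, the app.py changes render the synthetic `<newline>` tokens as line breaks and escape token text before embedding it in the `data-text` attribute. Below is a minimal standalone sketch of that idea, using the standard-library `html.escape` instead of the app's hand-rolled entity replacements; the helper name and the sample tokens are illustrative, and the real app additionally wires ids and inline event handlers onto each span.

```python
# Standalone sketch of the rendering change: "<newline>" sentinels become <br>,
# token text is escaped for safe use in an HTML attribute.
import html

def tokens_to_html(tokens):
    parts = []
    for token in tokens:
        text = token["text"]
        if text == "<newline>":
            parts.append("<br>")  # line break instead of a token chip
            continue
        display = text if text.strip() else "·"   # make whitespace-only tokens visible
        safe_attr = html.escape(text, quote=True)  # escapes &, <, >, ", '
        parts.append(
            f'<span class="token token-{token["type"]}" data-text="{safe_attr}">'
            f"{html.escape(display)}</span>"
        )
    return "".join(parts)

print(tokens_to_html([
    {"text": "Hello", "type": "word"},
    {"text": "<newline>", "type": "special"},
    {"text": " world", "type": "word"},
]))
```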
utils.py CHANGED

@@ -1,7 +1,7 @@
 import os
 import re
-import unicodedata
 import traceback
+import unicodedata
 
 import tiktoken
 from transformers import AutoTokenizer
@@ -12,16 +12,17 @@ from mappings import MODEL_MAP, TOKENIZER_INFO
 class TokenMonsterTokenizer:
     def __init__(self, name):
         import tokenmonster
+
         self.name = name
         self.vocab = tokenmonster.load(name.split("/")[-1])
-
+
     def __call__(self, text, **kwargs):
         ids = list(self.vocab.tokenize(text))
         return {"input_ids": ids}
-
+
     def convert_ids_to_tokens(self, ids):
         return [self.vocab.decode(id_) for id_ in ids]
-
+
 
 def get_token_type(token_text):
     if re.match(r"^\s+$", token_text):
@@ -73,27 +74,37 @@ def is_subword(token_text, model, is_first):
 def tokenize_with_tiktoken(text, model):
     encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
     enc = tiktoken.get_encoding(encoding)
-    tokens = enc.encode(text)
 
     token_data = []
     current_pos = 0
+    for text_ in text.split("\n"):
+        tokens = enc.encode(text_ + "\n")
 
-    for i, token_id in enumerate(tokens):
-        token_text = enc.decode([token_id])
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
+        for i, token_id in enumerate(tokens):
+            token_text = enc.decode([token_id])
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
 
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
+            current_pos += len(token_text)
         token_data.append(
             {
-                "text": token_text,
-                "id": int(token_id),
-                "type": token_type,
-                "is_subword": subword,
-                "bytes": len(token_text.encode("utf-8")),
-                "position": i,
+                "text": "<newline>",
+                "id": 0,
+                "type": "special",
+                "is_subword": False,
+                "position": len(token_data),
             }
         )
-        current_pos += len(token_text)
 
     return {
         "model": TOKENIZER_INFO[model]["name"],
@@ -117,37 +128,50 @@ def tokenize_with_hf(text, model):
             "tokens": [],
             "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
         }
-
+
     if "tokenmonster" in model_name:
         tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
     else:
         tokenizer = AutoTokenizer.from_pretrained(
-            model_name, token=hf_token
-        )
+            model_name, token=hf_token, trust_remote_code=True
+        )
     token_data = []
-    encoding = tokenizer(
-        text,
-        return_offsets_mapping=False,
-        return_tensors=None,
-        add_special_tokens=False,
-    )
-
-    token_ids = encoding["input_ids"]
-    tokens = tokenizer.convert_ids_to_tokens(token_ids)
-    # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
-
-    for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
+    for text_ in text.split("\n"):
+        text_ = text_ + "\n"
+
+        encoding = tokenizer(
+            text_,
+            return_offsets_mapping=False,
+            return_tensors=None,
+            add_special_tokens=False,
+        )
 
+        token_ids = encoding["input_ids"]
+        tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        # print(model_name, text, "\n", tokens, token_ids)
+        # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
+
+        for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
+
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": token_id,  # int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
    token_data.append(
        {
-            "text": token_text,
-            "id": token_id,
-            "type": token_type,
-            "is_subword": subword,
-            "bytes": len(token_text.encode("utf-8")),
-            "position": i,
+            "text": "<newline>",
+            "id": 0,
+            "type": "special",
+            "is_subword": False,
+            "position": len(token_data),
        }
    )
 
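The utils.py changes apply the same pattern to both tokenizer backends: split the input on `"\n"`, tokenize each line separately, and append a synthetic `<newline>` entry that the renderer turns into a line break. A trimmed-down sketch of that pattern for the tiktoken path follows; the function name and the reduced dict fields are illustrative, not the exact shape used in utils.py.

```python
# Minimal sketch of the per-line tokenization pattern from this commit.
import tiktoken

def tokenize_lines(text: str, encoding_name: str = "gpt2"):
    enc = tiktoken.get_encoding(encoding_name)
    token_data = []
    for line in text.split("\n"):
        # Encode each line on its own, re-adding the newline that split() removed.
        for i, token_id in enumerate(enc.encode(line + "\n")):
            token_text = enc.decode([token_id])
            token_data.append({"text": token_text, "id": int(token_id), "position": i})
        # Sentinel the renderer converts to <br>; not a real vocabulary entry.
        token_data.append({"text": "<newline>", "id": 0, "position": len(token_data)})
    return token_data

for entry in tokenize_lines("Hello world\nSecond line"):
    print(entry)
```

As in the commit, every segment gets a trailing `"\n"` appended before encoding, so token counts can differ slightly from encoding the whole string at once; the sentinel's id of 0 is just a placeholder that overlaps with a real vocabulary id.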