Gül Sena Altıntaş committed
Commit d9779a0 · 1 Parent(s): f58b113

Added support for showing newlines


- TODO: add a toggle button to include newlines in the tokenization (rough sketch below)
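A rough sketch of what that toggle could look like, assuming a Gradio `gr.Checkbox` wired into the tokenize handler (component names here are illustrative, not the app's actual ones):

```python
import gradio as gr

with gr.Blocks() as demo:
    text_in = gr.Textbox(label="Text")
    show_newlines = gr.Checkbox(label="Show newlines", value=True)
    out = gr.HTML()

    def tokenize(text, keep_newlines):
        # Placeholder: the real app would run its tokenization pipeline here
        # and only insert "<newline>" sentinels when keep_newlines is True.
        return text.replace("\n", "<br>") if keep_newlines else text

    text_in.change(tokenize, [text_in, show_newlines], out)

demo.launch()
```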

Files changed (3)
  1. README.md +3 -0
  2. app.py +10 -9
  3. utils.py +63 -39
README.md CHANGED
@@ -11,3 +11,6 @@ license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+ - [x] next up i want to add some sample texts that are interesting
+ - [x] normalization of the tokenization
app.py CHANGED
@@ -228,6 +228,9 @@ def generate_interactive_tokenization(results):
     for i, token in enumerate(result["tokens"]):
         token_text = token["text"]
         display_text = token_text if token_text.strip() else "·"
+        if token_text == "<newline>":
+            html_parts.append("<br>")
+            continue
 
         # Determine token class
         token_class = f"token token-{token['type']}"
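The new early-`continue` branch turns the `<newline>` sentinel emitted by `utils.py` into a literal line break in the rendered HTML. A minimal standalone sketch of the idea (the token list and CSS class are illustrative, not the app's actual data):

```python
# Hypothetical token stream: "<newline>" is the sentinel utils.py inserts
# between lines; everything else is a regular token.
tokens = [{"text": "Hello"}, {"text": "<newline>"}, {"text": "world"}]

html_parts = []
for token in tokens:
    if token["text"] == "<newline>":
        html_parts.append("<br>")  # sentinel becomes a real line break
        continue
    html_parts.append(f'<span class="token">{token["text"]}</span>')

print("".join(html_parts))
# -> <span class="token">Hello</span><br><span class="token">world</span>
```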
@@ -243,13 +246,17 @@ def generate_interactive_tokenization(results):
             token_text.replace("\\", "\\\\")
             .replace("'", "\\'")
             .replace('"', '\\"')
-            .replace("\n", "\\n")
             .replace("\r", "\\r")
+            .replace("\n", "\\n")
         )
 
-        escaped_display = display_text.replace('"', "&quot;").replace("'", "&#39;")
+        escaped_display = (
+            display_text.replace('"', "&quot;")
+            .replace("'", "&#39;")
+            .replace("\r", "\n")
+        )
 
-        # Use inline event handlers that definitely work in Gradio
+        # Use inline event handlers that work in Gradio
         html_parts.append(f"""<span class="{token_class}"
             id="{token_id}"
             data-text="{token_text.replace('"', "&quot;").replace("'", "&#39;")}"
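For reference, the manual quote escaping used for `escaped_display` and the `data-text` attribute matches the standard library's `html.escape` on inputs that contain no `&`, `<`, or `>` (a standalone check, not code from the app):

```python
import html

token_text = 'say "hi" and \'bye\''

# Manual escaping as in app.py
manual = token_text.replace('"', "&quot;").replace("'", "&#39;")

# html.escape covers the same quotes (plus & < >), writing ' as &#x27;
stdlib = html.escape(token_text, quote=True).replace("&#x27;", "&#39;")

assert manual == stdlib  # holds for inputs without & < >
print(manual)
```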
@@ -312,11 +319,6 @@ def generate_token_ids_display(results):
         f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
     )
 
-    # Show ID ranges
-    id_values = [token["id"] for token in result["tokens"]]
-    if id_values:
-        output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")
-
     return "\n".join(output)
 
 
@@ -663,7 +665,6 @@ with gr.Blocks(
         norm_eff, norm_html, norm_ids = generate_basic_comparison(
             normalized_results
         )
-        print(normalized_text)
 
         # Combine or show separately
         combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
 
utils.py CHANGED
@@ -1,7 +1,7 @@
 import os
 import re
-import unicodedata
 import traceback
+import unicodedata
 
 import tiktoken
 from transformers import AutoTokenizer
@@ -12,16 +12,17 @@ from mappings import MODEL_MAP, TOKENIZER_INFO
 class TokenMonsterTokenizer:
     def __init__(self, name):
         import tokenmonster
+
         self.name = name
         self.vocab = tokenmonster.load(name.split("/")[-1])
 
     def __call__(self, text, **kwargs):
         ids = list(self.vocab.tokenize(text))
         return {"input_ids": ids}
 
     def convert_ids_to_tokens(self, ids):
         return [self.vocab.decode(id_) for id_ in ids]
 
 
 def get_token_type(token_text):
     if re.match(r"^\s+$", token_text):
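The wrapper gives TokenMonster vocabularies the same two calls the app makes on HF tokenizers. A minimal usage sketch, assuming the `tokenmonster` package is installed and the vocabulary downloads successfully:

```python
tok = TokenMonsterTokenizer("englishcode-32000-consistent-v1")

ids = tok("Hello world")["input_ids"]    # tokenize -> list of token ids
pieces = tok.convert_ids_to_tokens(ids)  # decode each id back to its text
print(ids, pieces)
```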
@@ -73,27 +74,37 @@ def is_subword(token_text, model, is_first):
 def tokenize_with_tiktoken(text, model):
     encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
     enc = tiktoken.get_encoding(encoding)
-    tokens = enc.encode(text)
 
     token_data = []
     current_pos = 0
+    for text_ in text.split("\n"):
+        tokens = enc.encode(text_ + "\n")
 
-    for i, token_id in enumerate(tokens):
-        token_text = enc.decode([token_id])
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
+        for i, token_id in enumerate(tokens):
+            token_text = enc.decode([token_id])
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
 
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
+            current_pos += len(token_text)
         token_data.append(
             {
-                "text": token_text,
-                "id": int(token_id),
-                "type": token_type,
-                "is_subword": subword,
-                "bytes": len(token_text.encode("utf-8")),
-                "position": i,
+                "text": "<newline>",
+                "id": 0,
+                "type": "special",
+                "is_subword": False,
+                "position": len(token_data),
             }
         )
-    current_pos += len(token_text)
 
     return {
         "model": TOKENIZER_INFO[model]["name"],
@@ -117,37 +128,50 @@ def tokenize_with_hf(text, model):
             "tokens": [],
             "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
         }
 
     if "tokenmonster" in model_name:
         tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
     else:
         tokenizer = AutoTokenizer.from_pretrained(
             model_name, token=hf_token, trust_remote_code=True
         )
     token_data = []
-    encoding = tokenizer(
-        text,
-        return_offsets_mapping=False,
-        return_tensors=None,
-        add_special_tokens=True,
-    )
-    token_ids = encoding["input_ids"]
-    tokens = tokenizer.convert_ids_to_tokens(token_ids)
-    print(model_name, tokens, token_ids)
-    # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
-
-    for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
+    for text_ in text.split("\n"):
+        text_ = text_ + "\n"
+
+        encoding = tokenizer(
+            text_,
+            return_offsets_mapping=False,
+            return_tensors=None,
+            add_special_tokens=False,
+        )
+
+        token_ids = encoding["input_ids"]
+        tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        # print(model_name, text, "\n", tokens, token_ids)
+        # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
 
+        for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
+
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": token_id,  # int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
         token_data.append(
             {
-                "text": token_text,
-                "id": int(token_id),
-                "type": token_type,
-                "is_subword": subword,
-                "bytes": len(token_text.encode("utf-8")),
-                "position": i,
+                "text": "<newline>",
+                "id": 0,
+                "type": "special",
+                "is_subword": False,
+                "position": len(token_data),
             }
         )
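The HF path now follows the same per-line pattern, with `add_special_tokens=False` so BOS/EOS markers are not repeated for every line. A minimal sketch using a public tokenizer (`gpt2` here, which needs no HF token; the app loads `model_name` instead):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
text = "Hello world\nGoodbye"

token_data = []
for line in text.split("\n"):
    # No special tokens per line; they would otherwise repeat at each split.
    ids = tokenizer(line + "\n", add_special_tokens=False)["input_ids"]
    for i, (tid, tok) in enumerate(zip(ids, tokenizer.convert_ids_to_tokens(ids))):
        token_data.append({"text": tok, "id": tid, "position": i})
    token_data.append({"text": "<newline>", "id": 0, "type": "special"})

print([t["text"] for t in token_data])
```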