Gül Sena Altıntaş committed · Commit d9779a0 · Parent(s): f58b113

Added support for showing newlines

- TODO: add toggle button to include newlines in the tokenization
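For the toggle in the TODO, one possible wiring in Gradio is sketched below. This is only a sketch: the component and function names (`show_newlines`, `render_tokens`, `token_display`) are illustrative placeholders, not identifiers from app.py, and the render function is a stub where the real app would run its tokenizers.

```python
# Hypothetical sketch of the TODO: a checkbox that re-renders the output
# with or without newline markers. Names are illustrative, not from app.py.
import gradio as gr

def render_tokens(text: str, show_newlines: bool) -> str:
    # Placeholder: the real app would tokenize and build token spans here.
    return text.replace("\n", "<br>" if show_newlines else " ")

with gr.Blocks() as demo:
    text_input = gr.Textbox(label="Text", lines=4)
    show_newlines = gr.Checkbox(label="Show newlines", value=True)
    token_display = gr.HTML()

    # Re-render whenever the text or the toggle changes.
    text_input.change(render_tokens, [text_input, show_newlines], token_display)
    show_newlines.change(render_tokens, [text_input, show_newlines], token_display)

# demo.launch()
```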
README.md CHANGED

@@ -11,3 +11,6 @@ license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+- [x] next up i want to add some sample texts that are interesting
+- [x] normalization of the tokenization
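The "normalization" item is what the commented-out `backend_tokenizer.normalizer` call in utils.py below refers to. A small sketch of how a fast tokenizer's normalizer can be inspected, using `bert-base-uncased` purely as an example model (some tokenizers have no normalizer, in which case the attribute is None):

```python
from transformers import AutoTokenizer

# Example model only; any "fast" tokenizer exposes backend_tokenizer.normalizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Shows the text after normalization but before pre-tokenization/merging.
print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
# -> "hello how are u?" for an uncased BERT-style normalizer
```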
app.py CHANGED

@@ -228,6 +228,9 @@ def generate_interactive_tokenization(results):
         for i, token in enumerate(result["tokens"]):
             token_text = token["text"]
             display_text = token_text if token_text.strip() else "·"
+            if token_text == "<newline>":
+                html_parts.append("<br>")
+                continue
 
             # Determine token class
             token_class = f"token token-{token['type']}"
@@ -243,13 +246,17 @@ def generate_interactive_tokenization(results):
                 token_text.replace("\\", "\\\\")
                 .replace("'", "\\'")
                 .replace('"', '\\"')
-                .replace("\n", "\\n")
                 .replace("\r", "\\r")
+                .replace("\n", "\\n")
             )
 
-            escaped_display =
+            escaped_display = (
+                display_text.replace('"', "&quot;")
+                .replace("'", "&#39;")
+                .replace("\r", "\n")
+            )
 
-            # Use inline event handlers that
+            # Use inline event handlers that work in Gradio
             html_parts.append(f"""<span class="{token_class}"
                 id="{token_id}"
                 data-text="{token_text.replace('"', '&quot;').replace("'", '&#39;')}"
@@ -312,11 +319,6 @@ def generate_token_ids_display(results):
             f"**Stats**: {len(token_ids)} total tokens, {unique_ids} unique IDs"
         )
 
-        # Show ID ranges
-        id_values = [token["id"] for token in result["tokens"]]
-        if id_values:
-            output.append(f"**ID Range**: {min(id_values)} - {max(id_values)}")
-
     return "\n".join(output)
 
 
@@ -663,7 +665,6 @@ with gr.Blocks(
             norm_eff, norm_html, norm_ids = generate_basic_comparison(
                 normalized_results
             )
-            print(normalized_text)
 
             # Combine or show separately
             combined_html = f"<h3>Normalized Text: {normalized_text}</h3>{norm_html}\n<h2>Original</h2>{orig_html}"
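Taken together, the app.py changes render the synthetic `<newline>` tokens as line breaks and escape token text before embedding it in the `data-text` attribute. Below is a minimal standalone sketch of that idea, using the standard-library `html.escape` instead of the app's hand-rolled entity replacements; the helper name and the sample tokens are illustrative, and the real app additionally wires ids and inline event handlers onto each span.

```python
# Standalone sketch of the rendering change: "<newline>" sentinels become <br>,
# token text is escaped for safe use in an HTML attribute.
import html

def tokens_to_html(tokens):
    parts = []
    for token in tokens:
        text = token["text"]
        if text == "<newline>":
            parts.append("<br>")  # line break instead of a token chip
            continue
        display = text if text.strip() else "·"   # make whitespace-only tokens visible
        safe_attr = html.escape(text, quote=True)  # escapes &, <, >, ", '
        parts.append(
            f'<span class="token token-{token["type"]}" data-text="{safe_attr}">'
            f"{html.escape(display)}</span>"
        )
    return "".join(parts)

print(tokens_to_html([
    {"text": "Hello", "type": "word"},
    {"text": "<newline>", "type": "special"},
    {"text": " world", "type": "word"},
]))
```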
utils.py CHANGED

@@ -1,7 +1,7 @@
 import os
 import re
-import unicodedata
 import traceback
+import unicodedata
 
 import tiktoken
 from transformers import AutoTokenizer
@@ -12,16 +12,17 @@ from mappings import MODEL_MAP, TOKENIZER_INFO
 class TokenMonsterTokenizer:
     def __init__(self, name):
         import tokenmonster
+
         self.name = name
         self.vocab = tokenmonster.load(name.split("/")[-1])
-
+
     def __call__(self, text, **kwargs):
         ids = list(self.vocab.tokenize(text))
         return {"input_ids": ids}
-
+
     def convert_ids_to_tokens(self, ids):
         return [self.vocab.decode(id_) for id_ in ids]
-
+
 
 def get_token_type(token_text):
     if re.match(r"^\s+$", token_text):
@@ -73,27 +74,37 @@ def is_subword(token_text, model, is_first):
 def tokenize_with_tiktoken(text, model):
     encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
     enc = tiktoken.get_encoding(encoding)
-    tokens = enc.encode(text)
 
     token_data = []
     current_pos = 0
+    for text_ in text.split("\n"):
+        tokens = enc.encode(text_ + "\n")
 
-    for i, token_id in enumerate(tokens):
-        token_text = enc.decode([token_id])
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
+        for i, token_id in enumerate(tokens):
+            token_text = enc.decode([token_id])
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
 
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
+            current_pos += len(token_text)
         token_data.append(
             {
-                "text": token_text,
-                "id": int(token_id),
-                "type": token_type,
-                "is_subword": subword,
-                "bytes": len(token_text.encode("utf-8")),
-                "position": i,
+                "text": "<newline>",
+                "id": 0,
+                "type": "special",
+                "is_subword": False,
+                "position": len(token_data),
             }
         )
-        current_pos += len(token_text)
 
     return {
         "model": TOKENIZER_INFO[model]["name"],
@@ -117,37 +128,50 @@ def tokenize_with_hf(text, model):
             "tokens": [],
             "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
         }
-
+
     if "tokenmonster" in model_name:
         tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
     else:
         tokenizer = AutoTokenizer.from_pretrained(
-            model_name, token=hf_token
-        )
+            model_name, token=hf_token, trust_remote_code=True
+        )
     token_data = []
-    encoding = tokenizer(
-        text,
-        return_offsets_mapping=False,
-        return_tensors=None,
-        add_special_tokens=False,
-    )
-
-    token_ids = encoding["input_ids"]
-    tokens = tokenizer.convert_ids_to_tokens(token_ids)
-    # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
-
-    for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
-        token_type = get_token_type(token_text)
-        subword = is_subword(token_text, model, i == 0)
+    for text_ in text.split("\n"):
+        text_ = text_ + "\n"
+
+        encoding = tokenizer(
+            text_,
+            return_offsets_mapping=False,
+            return_tensors=None,
+            add_special_tokens=False,
+        )
 
+        token_ids = encoding["input_ids"]
+        tokens = tokenizer.convert_ids_to_tokens(token_ids)
+        # print(model_name, text, "\n", tokens, token_ids)
+        # print(tokenizer.backend_tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
+
+        for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
+            token_type = get_token_type(token_text)
+            subword = is_subword(token_text, model, i == 0)
+
+            token_data.append(
+                {
+                    "text": token_text,
+                    "id": token_id,  # int(token_id),
+                    "type": token_type,
+                    "is_subword": subword,
+                    "bytes": len(token_text.encode("utf-8")),
+                    "position": i,
+                }
+            )
    token_data.append(
        {
-            "text": token_text,
-            "id": token_id,
-            "type": token_type,
-            "is_subword": subword,
-            "bytes": len(token_text.encode("utf-8")),
-            "position": i,
+            "text": "<newline>",
+            "id": 0,
+            "type": "special",
+            "is_subword": False,
+            "position": len(token_data),
        }
    )
 
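The utils.py changes apply the same pattern to both tokenizer backends: split the input on `"\n"`, tokenize each line separately, and append a synthetic `<newline>` entry that the renderer turns into a line break. A trimmed-down sketch of that pattern for the tiktoken path follows; the function name and the reduced dict fields are illustrative, not the exact shape used in utils.py.

```python
# Minimal sketch of the per-line tokenization pattern from this commit.
import tiktoken

def tokenize_lines(text: str, encoding_name: str = "gpt2"):
    enc = tiktoken.get_encoding(encoding_name)
    token_data = []
    for line in text.split("\n"):
        # Encode each line on its own, re-adding the newline that split() removed.
        for i, token_id in enumerate(enc.encode(line + "\n")):
            token_text = enc.decode([token_id])
            token_data.append({"text": token_text, "id": int(token_id), "position": i})
        # Sentinel the renderer converts to <br>; not a real vocabulary entry.
        token_data.append({"text": "<newline>", "id": 0, "position": len(token_data)})
    return token_data

for entry in tokenize_lines("Hello world\nSecond line"):
    print(entry)
```

As in the commit, every segment gets a trailing `"\n"` appended before encoding, so token counts can differ slightly from encoding the whole string at once; the sentinel's id of 0 is just a placeholder that overlaps with a real vocabulary id.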