Gül Sena Altıntaş committed
Commit 199862a · 1 Parent(s): d9779a0
Small improvement for visualization
app.py CHANGED
@@ -81,14 +81,23 @@ def generate_interactive_tokenization(results):
 
     # Add styles first
     html_parts.append("""
-    <div id="tokenizer-container">
+    <div id="tokenizer-container" class="tokenizer-container">
     <style>
+    .tokenizer-container {
+        display: flex;
+        flex-wrap: wrap;
+        justify-content: space-between;
+        gap: 20px;
+    }
     .tokenizer-section {
         margin-bottom: 20px;
         border: 1px solid #e0e0e0;
         border-radius: 8px;
         padding: 15px;
         background: white;
+        flex-wrap: wrap;
+        display: inline-block;
+        justify-content: space-between;
     }
     .tokenizer-header {
         font-weight: bold;
@@ -157,6 +166,9 @@ def generate_interactive_tokenization(results):
         font-size: 12px;
         display: none;
         z-index: 1000;
+        flex-wrap: wrap;
+        display: inline-block;
+        justify-content: space-between;
     }
     </style>
 
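For context, a hedged sketch of how the new tokenizer-container flex wrapper is presumably consumed by generate_interactive_tokenization: the container now uses display: flex with flex-wrap: wrap and a gap, so the per-tokenizer sections sit side by side and wrap onto new rows instead of stacking vertically. The loop body and section markup below are assumptions for illustration, not part of this commit.

# Sketch only: the rest of generate_interactive_tokenization is not shown in this diff.
def generate_interactive_tokenization_sketch(results):
    html_parts = []
    # Flex container introduced by this commit; each tokenizer section becomes a flex item.
    html_parts.append("""
<div id="tokenizer-container" class="tokenizer-container">
<style>
.tokenizer-container { display: flex; flex-wrap: wrap; justify-content: space-between; gap: 20px; }
.tokenizer-section { border: 1px solid #e0e0e0; border-radius: 8px; padding: 15px; background: white; }
</style>
""")
    # Hypothetical: one .tokenizer-section per tokenizer result.
    for result in results:
        html_parts.append(
            '<div class="tokenizer-section">'
            f'<div class="tokenizer-header">{result["model"]}</div>'
            "</div>"
        )
    html_parts.append("</div>")
    return "".join(html_parts)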
utils.py CHANGED
@@ -8,6 +8,8 @@ from transformers import AutoTokenizer
 
 from mappings import MODEL_MAP, TOKENIZER_INFO
 
+TOKENIZER_CACHE = {}
+
 
 class TokenMonsterTokenizer:
     def __init__(self, name):
@@ -116,25 +118,33 @@ def tokenize_with_tiktoken(text, model):
     }
 
 
+def get_hf_tokenizer(model):
+    model_name = MODEL_MAP.get(model, "gpt2")
+    if model_name in TOKENIZER_CACHE:
+        return TOKENIZER_CACHE[model_name]
+    # Get token from environment
+    hf_token = os.getenv("HF_TOKEN")
+    if not hf_token:
+        return {
+            "model": TOKENIZER_INFO[model]["name"],
+            "token_count": 0,
+            "tokens": [],
+            "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
+        }
+
+    if "tokenmonster" in model_name:
+        tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
+    else:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_name, token=hf_token, trust_remote_code=True
+        )
+    TOKENIZER_CACHE[model_name] = tokenizer
+    return tokenizer
+
+
 def tokenize_with_hf(text, model):
     try:
-
-        # Get token from environment
-        hf_token = os.getenv("HF_TOKEN")
-        if not hf_token:
-            return {
-                "model": TOKENIZER_INFO[model]["name"],
-                "token_count": 0,
-                "tokens": [],
-                "error": "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
-            }
-
-        if "tokenmonster" in model_name:
-            tokenizer = TokenMonsterTokenizer("englishcode-32000-consistent-v1")
-        else:
-            tokenizer = AutoTokenizer.from_pretrained(
-                model_name, token=hf_token, trust_remote_code=True
-            )
+        tokenizer = get_hf_tokenizer(model)
         token_data = []
         for text_ in text.split("\n"):
             text_ = text_ + "\n"
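The utils.py change adds a module-level tokenizer cache so repeated requests reuse an already loaded tokenizer instead of calling AutoTokenizer.from_pretrained again. A minimal, self-contained sketch of that caching pattern (the helper name and the "gpt2" example are illustrative, not part of the Space's code):

import os

from transformers import AutoTokenizer

TOKENIZER_CACHE = {}


def get_cached_tokenizer(model_name):
    # Reuse a tokenizer loaded earlier in this process, if any.
    if model_name in TOKENIZER_CACHE:
        return TOKENIZER_CACHE[model_name]
    # Otherwise load it once and remember it for the next call.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, token=os.getenv("HF_TOKEN"), trust_remote_code=True
    )
    TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer


# First call loads from the Hub; the second is a plain dictionary lookup.
tok_a = get_cached_tokenizer("gpt2")
tok_b = get_cached_tokenizer("gpt2")
assert tok_a is tok_b

Note that, as committed, get_hf_tokenizer returns the HF_TOKEN error dict itself rather than raising, so tokenize_with_hf receives a dict instead of a tokenizer object when the token is missing.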