Spaces:

gsaltintas
/

tokenizer-comparison

Running

File size: 6,725 Bytes

import os
import re
import unicodedata

import tiktoken
from transformers import AutoTokenizer

from mappings import MODEL_MAP, TOKENIZER_INFO


def get_token_type(token_text):
    """Classify a token string into a coarse display category.

    Args:
        token_text: The token's text as produced by the tokenizer.

    Returns:
        One of "whitespace", "word", "number", "punctuation", "special"
        or "mixed". The checks run in that order, so a token made only of
        punctuation (e.g. "<>") is "punctuation" even though it is also
        wrapped in angle brackets. The empty string is "mixed".
    """
    # re.fullmatch is used instead of re.match(r"^...$"): "$" also matches
    # just before a trailing newline, which would classify "abc\n" as a
    # word and "123\n" as a number.
    if re.fullmatch(r"\s+", token_text):
        return "whitespace"
    if re.fullmatch(r"[a-zA-Z]+", token_text):
        return "word"
    if re.fullmatch(r"\d+", token_text):
        return "number"
    if re.fullmatch(r"[^\w\s]+", token_text):
        return "punctuation"
    if token_text.startswith("<") and token_text.endswith(">"):
        return "special"
    return "mixed"


def is_subword(token_text, model, is_first):
    """Heuristically decide whether a token continues the previous word.

    Args:
        token_text: The token's text.
        model: Key identifying the tokenizer family; it selects which
            word-boundary convention is applied.
        is_first: True when the token is the first one in the sequence.

    Returns:
        True when the token looks like a word continuation, else False.
        Empty or all-whitespace tokens and tokens wrapped in "<...>" are
        never subwords.
    """
    if not token_text or token_text.isspace():
        return False

    wrapped_in_angles = token_text.startswith("<") and token_text.endswith(">")
    if wrapped_in_angles:
        return False  # special token

    # Models whose tokens carry a "▁" or "Ġ" prefix at word starts.
    prefix_marker_models = {
        "llama-2",
        "llama-3",
        "gemma-2",
        "bloom",
        "aya-expanse",
        "comma",
        "qwen3",
        "qwen2.5",
    }
    # Models whose tokens carry a literal leading space at word starts.
    leading_space_models = {"gpt-4", "gpt-2", "byt5"}

    if model in prefix_marker_models:
        starts_word = token_text.startswith(("▁", "Ġ"))
    elif model == "bert":
        # WordPiece marks the continuation itself, regardless of position.
        return token_text.startswith("##")
    elif model in leading_space_models:
        starts_word = token_text.startswith(" ")
    else:
        # Unknown family: only the first token is treated as a word start.
        starts_word = False

    return not (starts_word or is_first)


def tokenize_with_tiktoken(text, model):
    """Tokenize *text* with a tiktoken encoding and describe every token.

    Args:
        text: The input string to tokenize.
        model: Tokenizer key. "gpt-4" selects the "cl100k_base" encoding;
            any other value selects "gpt2". The key is also used to look
            up display metadata in TOKENIZER_INFO.

    Returns:
        A dict with the keys "model", "token_count", "tokens",
        "compression_ratio" (characters per token, 0 when there are no
        tokens), "encoding" and "vocab_size". Each entry of "tokens" is a
        dict with "text", "id", "type", "is_subword", "bytes" and
        "position".

    Raises:
        KeyError: If *model* has no entry in TOKENIZER_INFO.
    """
    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding)
    # disallowed_special=() makes input such as "<|endoftext|>" encode as
    # ordinary text; with the default, encode() raises ValueError on it.
    tokens = enc.encode(text, disallowed_special=())

    token_data = []
    for i, token_id in enumerate(tokens):
        # Decoding one token at a time is lossy: a token holding only part
        # of a multi-byte character decodes to U+FFFD. That is fine for
        # display, but the byte count must come from the raw token bytes.
        token_text = enc.decode([token_id])
        token_data.append(
            {
                "text": token_text,
                "id": int(token_id),
                "type": get_token_type(token_text),
                "is_subword": is_subword(token_text, model, i == 0),
                "bytes": len(enc.decode_single_token_bytes(token_id)),
                "position": i,
            }
        )

    info = TOKENIZER_INFO[model]
    return {
        "model": info["name"],
        "token_count": len(tokens),
        "tokens": token_data,
        "compression_ratio": len(text) / len(tokens) if tokens else 0,
        "encoding": info["encoding"],
        "vocab_size": info["vocab_size"],
    }


def _hf_error_result(model, message):
    """Build the zero-token result dict shared by every failure path."""
    try:
        display_name = TOKENIZER_INFO[model]["name"]
    except (KeyError, TypeError):
        # Unknown model key: fall back to the key itself so that reporting
        # an error can never raise a second one.
        display_name = model
    return {
        "model": display_name,
        "token_count": 0,
        "tokens": [],
        "compression_ratio": 0,
        "encoding": "Error",
        "vocab_size": 0,
        "error": message,
    }


def tokenize_with_hf(text, model):
    """Tokenize *text* with a Hugging Face tokenizer and describe every token.

    Args:
        text: The input string to tokenize.
        model: Tokenizer key. It is mapped to a Hub repository through
            MODEL_MAP (falling back to "gpt2") and used to look up display
            metadata in TOKENIZER_INFO.

    Returns:
        On success, a dict with the keys "model", "token_count", "tokens",
        "compression_ratio", "encoding" and "vocab_size"; each entry of
        "tokens" has "text", "id", "type", "is_subword", "bytes" and
        "position". On any failure (including a missing HF_TOKEN), a dict
        with the same keys zeroed out plus an "error" message. Failures
        are reported in the result rather than raised.
    """
    # Resolved before the try block so the error handler can always
    # reference it.
    model_name = MODEL_MAP.get(model, "gpt2")

    # Get token from environment
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return _hf_error_result(
            model,
            "HF_TOKEN not found in environment. Please add your HuggingFace token to Space secrets.",
        )

    try:
        print(f"DEBUG: Loading model {model_name} with token")
        # NOTE(review): trust_remote_code=True allows the repository to run
        # its own Python code while loading; keep MODEL_MAP restricted to
        # trusted repositories.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, token=hf_token, trust_remote_code=True
        )
        encoding = tokenizer(
            text,
            return_offsets_mapping=False,
            return_tensors=None,
            add_special_tokens=True,
        )
        token_ids = encoding["input_ids"]
        tokens = tokenizer.convert_ids_to_tokens(token_ids)

        token_data = []
        for i, (token_id, token_text) in enumerate(zip(token_ids, tokens)):
            token_data.append(
                {
                    "text": token_text,
                    "id": int(token_id),
                    "type": get_token_type(token_text),
                    "is_subword": is_subword(token_text, model, i == 0),
                    "bytes": len(token_text.encode("utf-8")),
                    "position": i,
                }
            )

        info = TOKENIZER_INFO[model]
        return {
            "model": info["name"],
            "token_count": len(token_ids),
            "tokens": token_data,
            "compression_ratio": len(text) / len(token_ids) if token_ids else 0,
            "encoding": info["encoding"],
            "vocab_size": info["vocab_size"],
        }
    except Exception as e:
        # Deliberately broad: this is the boundary where any loading or
        # tokenization failure is turned into an error result.
        error_msg = str(e)
        print(f"DEBUG: Error: {error_msg}")

        # Provide helpful error messages
        if "gated repo" in error_msg.lower():
            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
        elif "401" in error_msg:
            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
        elif "not found" in error_msg.lower():
            error_msg = (
                f"Model {model_name} not found. It may have been moved or renamed."
            )

        return _hf_error_result(model, error_msg)


def normalize_text(text, method):
    """Return *text* transformed by the named normalization *method*.

    Recognized methods are "none", "lowercase", the Unicode forms "nfc",
    "nfd", "nfkc" and "nfkd", "strip_accents", "strip_punctuation" and
    "whitespace_normalize". "none" and any unrecognized method return the
    text unchanged.
    """
    if method in ("nfc", "nfd", "nfkc", "nfkd"):
        # The method key is the lowercase spelling of the Unicode form.
        return unicodedata.normalize(method.upper(), text)

    if method == "lowercase":
        return text.lower()

    if method == "strip_accents":
        # Decompose first so accents become separate combining marks
        # (category "Mn"), then drop those marks.
        decomposed = unicodedata.normalize("NFD", text)
        kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
        return "".join(kept)

    if method == "strip_punctuation":
        return re.sub(r"[^\w\s]", "", text)

    if method == "whitespace_normalize":
        words = text.split()
        return " ".join(words)

    return text


def get_normalization_methods():
    """List the supported normalization methods as (key, label) pairs.

    The pairs come back in a fixed order, starting with "none".
    """
    labels_by_method = {
        "none": "No normalization",
        "lowercase": "Lowercase",
        "nfc": "Unicode NFC (Canonical)",
        "nfd": "Unicode NFD (Decomposed)",
        "nfkc": "Unicode NFKC (Compatible)",
        "nfkd": "Unicode NFKD (Compatible Decomposed)",
        "strip_accents": "Remove Accents",
        "strip_punctuation": "Remove Punctuation",
        "whitespace_normalize": "Normalize Whitespace",
    }
    # Dicts preserve insertion order, so the pairs keep the order above.
    return list(labels_by_method.items())