import os
import re

import tiktoken
from transformers import AutoTokenizer

from mappings import MODEL_MAP, TOKENIZER_INFO

# Models for which is_subword() looks for the "▁" word-start marker.
_WORD_MARKER_MODELS = ("llama-2", "llama-3", "qwen3")

# Compiled once; used with fullmatch() so the *entire* token must match.
_WHITESPACE_RE = re.compile(r"\s+")
_WORD_RE = re.compile(r"[a-zA-Z]+")
_NUMBER_RE = re.compile(r"\d+")
_PUNCTUATION_RE = re.compile(r"[^\w\s]+")


def get_token_type(token_text):
    """Classify a token's text into a coarse category.

    Args:
        token_text: Decoded text of a single token.

    Returns:
        One of "whitespace", "word", "number", "punctuation", "special"
        or "mixed". The checks run in that order and the first hit wins,
        so e.g. "<>" is "punctuation" rather than "special".
    """
    # fullmatch() instead of match(r"^...$"): "$" also matches just before
    # a trailing newline, which would classify "abc\n" as a plain word.
    if _WHITESPACE_RE.fullmatch(token_text):
        return "whitespace"
    if _WORD_RE.fullmatch(token_text):
        return "word"
    if _NUMBER_RE.fullmatch(token_text):
        return "number"
    if _PUNCTUATION_RE.fullmatch(token_text):
        return "punctuation"
    if token_text.startswith("<") and token_text.endswith(">"):
        return "special"
    return "mixed"


def is_subword(token_text, model, is_first):
    """Guess whether a token continues the previous word.

    Args:
        token_text: Decoded text of a single token.
        model: Model key selecting which word-boundary convention applies.
        is_first: True when the token is the first one in the sequence.

    Returns:
        True when the token looks like a word continuation.
    """
    if model in _WORD_MARKER_MODELS:
        # NOTE(review): callers pass text produced by tokenizer.decode();
        # if decode() drops the "▁" marker this flags every non-first
        # token as a subword. Confirm against the real tokenizers.
        return not token_text.startswith("▁") and not is_first
    if model == "bert":
        return token_text.startswith("##")
    # BPE models: a leading space marks the start of a new word.
    return not token_text.startswith(" ") and not is_first and len(token_text) > 0


def _token_entry(token_text, token_id, model, position, byte_length=None):
    """Build the per-token record shared by both tokenizer backends."""
    if byte_length is None:
        byte_length = len(token_text.encode("utf-8"))
    return {
        "text": token_text,
        "id": int(token_id),
        "type": get_token_type(token_text),
        "is_subword": is_subword(token_text, model, position == 0),
        "bytes": byte_length,
        "position": position,
    }


def _success_result(text, tokens, token_data, model):
    """Build the success payload shared by both tokenizer backends."""
    info = TOKENIZER_INFO[model]
    return {
        "model": info["name"],
        "token_count": len(tokens),
        "tokens": token_data,
        # Characters per token; 0 when the text produced no tokens.
        "compression_ratio": len(text) / len(tokens) if tokens else 0,
        "encoding": info["encoding"],
        "vocab_size": info["vocab_size"],
    }


def _display_name(model):
    """Return the display name for *model*, or the key itself if unknown."""
    try:
        return TOKENIZER_INFO[model]["name"]
    except (KeyError, TypeError):
        return str(model)


def _error_result(model, message):
    """Build the error payload; it has the same keys as a success payload."""
    return {
        "model": _display_name(model),
        "token_count": 0,
        "tokens": [],
        "compression_ratio": 0,
        "encoding": "Error",
        "vocab_size": 0,
        "error": message,
    }


def tokenize_with_tiktoken(text, model):
    """Tokenize *text* with a tiktoken encoding.

    Args:
        text: The string to tokenize.
        model: Model key. "gpt-4" selects the "cl100k_base" encoding; any
            other key selects "gpt2". Must also be a key of TOKENIZER_INFO.

    Returns:
        A dict with "model", "token_count", "tokens" (one record per
        token), "compression_ratio", "encoding" and "vocab_size".

    Raises:
        KeyError: If *model* is not present in TOKENIZER_INFO.
    """
    encoding = "cl100k_base" if model == "gpt-4" else "gpt2"
    enc = tiktoken.get_encoding(encoding)
    tokens = enc.encode(text)

    token_data = []
    for position, token_id in enumerate(tokens):
        # Use the raw bytes for the byte count: a token may hold only part
        # of a multi-byte UTF-8 character, in which case the decoded text
        # is U+FFFD and its re-encoded length would be wrong.
        raw = enc.decode_single_token_bytes(token_id)
        token_text = raw.decode("utf-8", errors="replace")
        token_data.append(
            _token_entry(token_text, token_id, model, position, len(raw))
        )

    return _success_result(text, tokens, token_data, model)


def tokenize_with_hf(text, model):
    """Tokenize *text* with a HuggingFace tokenizer.

    The repository id is looked up in MODEL_MAP (falling back to "gpt2")
    and the access token is read from the HF_TOKEN environment variable.
    Failures are reported in the returned dict instead of being raised.

    Args:
        text: The string to tokenize.
        model: Model key used for MODEL_MAP and TOKENIZER_INFO lookups.

    Returns:
        On success, a dict with "model", "token_count", "tokens",
        "compression_ratio", "encoding" and "vocab_size". On failure, the
        same keys with empty/zero values, "encoding" set to "Error", and
        an additional "error" message.
    """
    # Resolved outside the try block so the error handler can always use it.
    model_name = MODEL_MAP.get(model, "gpt2")

    try:
        # Get token from environment
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            return _error_result(
                model,
                "HF_TOKEN not found in environment. "
                "Please add your HuggingFace token to Space secrets.",
            )

        print(f"DEBUG: Loading model {model_name} with token")
        # SECURITY: trust_remote_code=True lets the model repository run
        # arbitrary Python on this machine. Only keep it for repositories
        # you trust.
        tokenizer = AutoTokenizer.from_pretrained(
            model_name, token=hf_token, trust_remote_code=True
        )
        tokens = tokenizer.encode(text)

        token_data = [
            _token_entry(
                tokenizer.decode([token_id], skip_special_tokens=False),
                token_id,
                model,
                position,
            )
            for position, token_id in enumerate(tokens)
        ]

        return _success_result(text, tokens, token_data, model)
    except Exception as e:
        # Boundary handler: turn any failure into an error payload.
        error_msg = str(e)

        # Provide helpful error messages
        if "gated repo" in error_msg.lower():
            error_msg = f"Model is gated. Request access at https://huggingface.co/{model_name} and ensure HF_TOKEN is set."
        elif "401" in error_msg:
            error_msg = "Authentication failed. Check your HF_TOKEN in Space secrets."
        elif "not found" in error_msg.lower():
            error_msg = (
                f"Model {model_name} not found. It may have been moved or renamed."
            )

        return _error_result(model, error_msg)