diff --git "a/index.html" "b/index.html" --- "a/index.html" +++ "b/index.html" @@ -10,7 +10,7 @@ tokenizer._tokenizer.pre_tokenizer = pre_tokenizers.Sequence( # Added step: split by R2L digits pre_tokenizers.Split(pattern = Regex(r"\d{1,3}(?=(\d{3})*\b)"), behavior="isolated", invert = False), - # Existing steps + # Below: Existing steps from Llama 3's tokenizer pre_tokenizers.Split(pattern=Regex(r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"), behavior="isolated", invert=False), pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=False) @@ -23,4 +23,4 @@ print(tokenizer.tokenize("42069")) # [42, 069] title={From Digits to Decisions: How Tokenization Impacts Arithmetic in LLMs}, author={Garreth Lee, Guilherme Penedo, Leandro von Werra and Thomas Wolf}, url={https://huggingface.co/spaces/huggingface/number-tokenization-blog}, -} \ No newline at end of file +} \ No newline at end of file