diff --git "a/index.html" "b/index.html"
--- "a/index.html"
+++ "b/index.html"
@@ -10,7 +10,7 @@ tokenizer._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
# Added step: split by R2L digits
pre_tokenizers.Split(pattern = Regex(r"\d{1,3}(?=(\d{3})*\b)"),
behavior="isolated", invert = False),
- # Existing steps
+ # Below: Existing steps from Llama 3's tokenizer
pre_tokenizers.Split(pattern=Regex(r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"),
behavior="isolated", invert=False),
pre_tokenizers.ByteLevel(add_prefix_space=False, trim_offsets=True, use_regex=False)
@@ -23,4 +23,4 @@ print(tokenizer.tokenize("42069")) # [42, 069]
title={From Digits to Decisions: How Tokenization Impacts Arithmetic in LLMs},
author={Garreth Lee, Guilherme Penedo, Leandro von Werra and Thomas Wolf},
url={https://huggingface.co/spaces/huggingface/number-tokenization-blog},
-}