import re
import unicodedata

from transformers import LlamaTokenizerFast

tokenizer = LlamaTokenizerFast.from_pretrained("local_tokenizer")


def fasttext_preprocess_func(content: str) -> str:
    """Fasttext preprocess function.

    Args:
        content (str): Content to process.

    Returns:
        str: Processed normalized content.
    """
    # 1. Collapse runs of three or more newlines into a single blank line
    content = re.sub(r'\n{3,}', '\n\n', content)

    # 2. Lowercase the content
    content = content.lower()

    # 3. Remove diacritics (NFKD decomposition, then drop combining marks)
    content = ''.join(
        c for c in unicodedata.normalize('NFKD', content)
        if unicodedata.category(c) != 'Mn')

    # 4. Word segmentation: decode each subword token id separately and
    #    re-join the pieces with spaces
    token_ids = tokenizer.encode(content, add_special_tokens=False)
    single_text_list = []
    for token_id in token_ids:
        curr_text = tokenizer.decode([token_id])
        single_text_list.append(curr_text)
    content = ' '.join(single_text_list)

    # 5. Keep escape chars: \n, \t, \r -> \\n, \\t, \\r,
    #    so they are saved as literal "\n", "\t", "\r" in the txt file.
    content = re.sub(r'\n', '\\\\n', content)
    content = re.sub(r'\r', '\\\\r', content)
    content = re.sub(r'\t', '\\\\t', content)
    content = re.sub(r' +', ' ', content)
    content = content.strip()

    return content
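
# A minimal usage sketch (assumption: the sample sentence and the print call
# below are illustrative and not part of the original pipeline). Each processed
# document ends up as a single line of space-separated subword pieces, which is
# the one-sample-per-line plain-text format fasttext training expects.
if __name__ == "__main__":
    sample = "Résumé review:\n\nGreat   fit\tfor the role.\r\nCall back."
    processed = fasttext_preprocess_func(sample)
    print(processed)
    # Diacritics are stripped ("resume"), text is lowercased and subword-
    # segmented, and \n/\r/\t survive as the literal two-character sequences
    # "\n", "\r", "\t" so the result stays on one output line.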