import re
import unicodedata

from transformers import LlamaTokenizerFast

# Tokenizer loaded from a local directory; it is used below to re-segment
# text into LLaMA tokens before joining them with spaces for fastText.
tokenizer = LlamaTokenizerFast.from_pretrained("local_tokenizer")


def fasttext_preprocess_func(content: str) -> str:
    """Preprocess text for fastText.

    Args:
        content (str): Content to process.

    Returns:
        str: Processed, normalized content.
    """
    # Collapse runs of three or more newlines into a single blank line.
    content = re.sub(r'\n{3,}', '\n\n', content)

    # Lowercase everything.
    content = content.lower()

    # Strip diacritics: decompose with NFKD, then drop combining marks (Mn).
    content = ''.join(
        c for c in unicodedata.normalize('NFKD', content)
        if unicodedata.category(c) != 'Mn')

    # Re-segment the text with the LLaMA tokenizer and rejoin the decoded
    # tokens with single spaces, so fastText sees token-level "words".
    token_ids = tokenizer.encode(content, add_special_tokens=False)
    single_text_list = []
    for token_id in token_ids:
        curr_text = tokenizer.decode([token_id])
        single_text_list.append(curr_text)

    content = ' '.join(single_text_list)

    # Escape remaining control whitespace as the literal sequences "\n",
    # "\r", "\t", then squeeze repeated spaces and trim the ends.
    content = re.sub(r'\n', '\\\\n', content)
    content = re.sub(r'\r', '\\\\r', content)
    content = re.sub(r'\t', '\\\\t', content)
    content = re.sub(r' +', ' ', content)
    content = content.strip()

    return content
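

# Illustrative usage only (not part of the original source): the sample string
# is a placeholder, and running this requires the "local_tokenizer" directory
# loaded above. The returned string is the normalized form that would be fed
# to a fastText classifier.
if __name__ == "__main__":
    sample = "Héllo,\tWorld!\n\n\n\nThis   is an   éxample."
    print(fasttext_preprocess_func(sample))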