cognisafe-backend / app /services /text_cleaner.py
zyriean's picture
add app
d68e65a verified
# app/services/text_cleaner.py
from symspellpy.symspellpy import SymSpell
from pathlib import Path
# Initialize only once when this module is imported, so every caller shares
# the same (expensive to build) SymSpell index.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load dictionary
# Prefer the copy that sits next to this module so the import works no matter
# which directory the process was started from. Fall back to the original
# cwd-relative location so existing deployments keep working.
_DICT_FILENAME = "frequency_dictionary_en_82_765.txt"
dict_path = Path(__file__).resolve().parent / _DICT_FILENAME
if not dict_path.is_file():
    dict_path = Path.cwd() / "app" / "services" / _DICT_FILENAME

# load_dictionary reports failure through its return value rather than an
# exception; surface it here instead of running with an empty dictionary.
if not sym_spell.load_dictionary(dict_path, term_index=0, count_index=1):
    raise FileNotFoundError(
        f"SymSpell frequency dictionary could not be loaded from {dict_path}"
    )
# Leetspeak normalizer
# The translation table is constant, so build it once at import time instead
# of on every call.
# Mapping: 0->o, 1->i, 4->a, !->l, 3->e, $->s, @->a, 5->s, #->h, +->t
_LEET_TABLE = str.maketrans("014!3$@5#+", "oialesasht")


def leetspeak_normalizer(text: str) -> str:
    """Replace common leetspeak characters in *text* with letters.

    Every occurrence of a mapped character is substituted, regardless of
    context: genuine digits and punctuation (e.g. a trailing ``!``) are
    translated too. Characters outside the mapping are left unchanged.

    Args:
        text: The raw input string.

    Returns:
        A new string of the same length with mapped characters replaced.
    """
    return text.translate(_LEET_TABLE)
# Combined cleaning function
def clean_text(text: str) -> str:
    """Normalize leetspeak in *text*, then spell-correct the result.

    The input is first passed through ``leetspeak_normalizer`` and the
    normalized string is handed to the module-level ``sym_spell`` instance's
    ``lookup_compound`` with a maximum edit distance of 2.

    Args:
        text: The raw input string.

    Returns:
        The ``term`` of the first suggestion, or the leetspeak-normalized
        text when no suggestions come back.
    """
    deleeted = leetspeak_normalizer(text)
    candidates = sym_spell.lookup_compound(deleeted, max_edit_distance=2)

    # Nothing suggested: fall back to the normalized (uncorrected) text.
    if not candidates:
        return deleeted

    return candidates[0].term