File size: 840 Bytes
d68e65a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# app/services/text_cleaner.py
import warnings
from pathlib import Path

from symspellpy.symspellpy import SymSpell

# Initialize only once when this module is imported
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load dictionary
# The original location is resolved against the current working directory,
# which only works when the process is started from the project root. Keep
# that location as the first choice, and fall back to a dictionary file
# sitting next to this module when it is not found there.
_DICT_FILENAME = "frequency_dictionary_en_82_765.txt"
dict_path = Path.cwd() / "app" / "services" / _DICT_FILENAME
if not dict_path.is_file():
    dict_path = Path(__file__).resolve().parent / _DICT_FILENAME

# load_dictionary signals a missing file through its return value instead of
# raising, so report the failure rather than continuing silently.
if not sym_spell.load_dictionary(dict_path, term_index=0, count_index=1):
    warnings.warn(
        f"SymSpell dictionary could not be loaded from {dict_path}",
        RuntimeWarning,
    )

# Leetspeak normalizer
def leetspeak_normalizer(text: str) -> str:
    """Return *text* with leetspeak characters replaced by plain letters.

    The characters ``0 1 4 ! 3 $ @ 5 # +`` are replaced by
    ``o i a l e s a s h t`` respectively. All other characters are
    returned unchanged.
    """
    substitutions = {
        "0": "o",
        "1": "i",
        "4": "a",
        "!": "l",
        "3": "e",
        "$": "s",
        "@": "a",
        "5": "s",
        "#": "h",
        "+": "t",
    }
    translation_table = str.maketrans(substitutions)
    return text.translate(translation_table)

# Combined cleaning function
def clean_text(text: str) -> str:
    """Normalize leetspeak in *text*, then apply compound spelling correction.

    The text is first passed through ``leetspeak_normalizer``. The result is
    handed to ``sym_spell.lookup_compound`` with a maximum edit distance of 2,
    and the term of the first suggestion is returned. If the lookup yields no
    suggestions, the leetspeak-normalized text is returned instead.
    """
    deleeted = leetspeak_normalizer(text)
    candidates = sym_spell.lookup_compound(deleeted, max_edit_distance=2)
    if not candidates:
        return deleeted
    return candidates[0].term