import re

from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer

arabic_stopwords = set(stopwords.words("arabic"))
stemmer = ISRIStemmer()

# Map hamza/alef variants to bare forms, taa marbuta to haa, and drop tatweel.
char_map = str.maketrans(
    {"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
)

diacritics_pattern = re.compile(r"[\u064B-\u0652]")  # harakat: fathatan through sukun
punctuation_pattern = re.compile(r"[^\w\s]")
whitespace_pattern = re.compile(r"\s+")
repeated_char_pattern = re.compile(r"(.)\1+")


def normalize_arabic(text: str) -> str:
    """Normalize Arabic character variants."""
    return text.translate(char_map)


def remove_diacritics(text: str) -> str:
    """Remove Arabic diacritics."""
    return diacritics_pattern.sub("", text)


def remove_punctuation(text: str) -> str:
    """Replace punctuation marks with spaces."""
    return punctuation_pattern.sub(" ", text)


def reduce_repeated_characters(text: str) -> str:
    """Collapse runs of a repeated character to a single occurrence."""
    return repeated_char_pattern.sub(r"\1", text)


def remove_stopwords(tokens: list[str]) -> list[str]:
    """Remove Arabic stopwords from tokens."""
    return [word for word in tokens if word not in arabic_stopwords]


def stem_tokens(tokens: list[str]) -> list[str]:
    """Apply ISRI stemming to tokens."""
    return [stemmer.stem(token) for token in tokens]


def preprocess_for_classification(text: str) -> str:
    """Preprocess text for classification: normalize, clean, tokenize, stem."""
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()
    text = normalize_arabic(text)
    text = remove_diacritics(text)
    text = remove_punctuation(text)
    text = reduce_repeated_characters(text)
    text = whitespace_pattern.sub(" ", text).strip()
    text = re.sub(r"\d+", "", text)
    tokens = text.split()
    tokens = remove_stopwords(tokens)
    tokens = stem_tokens(tokens)
    return " ".join(tokens)


def preprocess_for_summarization(text: str) -> str:
    """Light preprocessing for summarization: remove diacritics and numbers."""
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()
    text = remove_diacritics(text)
    text = whitespace_pattern.sub(" ", text).strip()
    return re.sub(r"\d+", "", text)


class ArabicPreprocessor:
    """Arabic text preprocessor with analysis capabilities."""

    def __init__(self):
        self.arabic_stopwords = arabic_stopwords
        self.stemmer = stemmer
        self.char_map = char_map

    def preprocess_for_classification(self, text: str) -> str:
        """Preprocess text for classification."""
        return preprocess_for_classification(text)

    def preprocess_for_summarization(self, text: str) -> str:
        """Preprocess text for summarization."""
        return preprocess_for_summarization(text)

    def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
        """Return the intermediate result of each preprocessing step for analysis."""
        steps = {
            "original": text,
            "stripped_lowered": text.strip().lower(),
        }
        current = steps["stripped_lowered"]
        if task_type == "classification":
            current = normalize_arabic(current)
            steps["normalized"] = current
            current = remove_diacritics(current)
            steps["diacritics_removed"] = current
            current = remove_punctuation(current)
            steps["punctuation_removed"] = current
            current = reduce_repeated_characters(current)
            steps["repeated_chars_reduced"] = current
            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current
            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current
            tokens = current.split()
            steps["tokenized"] = tokens
            tokens_no_stop = remove_stopwords(tokens)
            steps["stopwords_removed"] = tokens_no_stop
            stemmed_tokens = stem_tokens(tokens_no_stop)
            steps["stemmed"] = stemmed_tokens
            steps["final"] = " ".join(stemmed_tokens)
        elif task_type == "summarization":
            current = remove_diacritics(current)
            steps["diacritics_removed"] = current
            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current
            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current
            steps["final"] = current
        return steps

    def analyze_text(self, text: str) -> dict:
        """Analyze text characteristics and statistics."""
        sentences = re.split(r"[.!؟\n]+", text)
        sentences = [s.strip() for s in sentences if s.strip()]
        tokens = text.split()
        arabic_chars = len(re.findall(r"[\u0600-\u06FF]", text))
        return {
            "character_count": len(text),
            "word_count": len(tokens),
            "sentence_count": len(sentences),
            "arabic_character_count": arabic_chars,
            "arabic_character_ratio": arabic_chars / len(text) if text else 0,
            "average_word_length": sum(len(word) for word in tokens) / len(tokens) if tokens else 0,
            "average_sentence_length": len(tokens) / len(sentences) if sentences else 0,
            "has_diacritics": bool(diacritics_pattern.search(text)),
            "punctuation_count": len(punctuation_pattern.findall(text)),
        }
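

# --- Usage sketch (illustrative addition, not part of the original module) ---
# A minimal demonstration of both pipelines and the step-by-step trace. The
# sample sentence below is an arbitrary, hypothetical example. This assumes
# the NLTK Arabic stopword list is installed locally; if it is not, run
# `nltk.download("stopwords")` once beforehand.
if __name__ == "__main__":
    sample = "الطلابُ يذاكرونَ الدروسَ للامتحاناتِ النهائيةِ!!! 2024"

    preprocessor = ArabicPreprocessor()
    print("classification:", preprocessor.preprocess_for_classification(sample))
    print("summarization:", preprocessor.preprocess_for_summarization(sample))

    # Inspect every intermediate stage of the classification pipeline.
    for step, value in preprocessor.get_preprocessing_steps(sample).items():
        print(f"{step}: {value}")

    # Basic statistics computed on the raw (unprocessed) text.
    print(preprocessor.analyze_text(sample))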