# arabic-summarizer-classifier/preprocessor.py
import re
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
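
# NOTE: the NLTK stopwords corpus must be downloaded once (e.g. via
# nltk.download("stopwords")) before this module is imported, since the
# stopword set below is built at import time.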
arabic_stopwords = set(stopwords.words("arabic"))
stemmer = ISRIStemmer()
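
# Orthographic normalization map: fold alef variants (أ/إ/آ) to bare alef,
# alef maqsura (ى) to ya, ta marbuta (ة) to ha, hamza carriers (ؤ/ئ) to
# waw/ya, and strip the tatweel (ـ) elongation mark.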
char_map = str.maketrans(
{"أ": "ا", "إ": "ا", "آ": "ا", "ى": "ي", "ة": "ه", "ؤ": "و", "ئ": "ي", "ـ": ""}
)
# The core tashkeel marks, fathatan (U+064B) through sukun (U+0652).
diacritics_pattern = re.compile(r"[\u064B-\u0652]")
# Anything that is neither a word character nor whitespace; \w is
# Unicode-aware in Python 3, so Arabic letters are kept.
punctuation_pattern = re.compile(r"[^\w\s]")
whitespace_pattern = re.compile(r"\s+")
# A character followed by one or more repeats of itself (elongation).
repeated_char_pattern = re.compile(r"(.)\1+")
def normalize_arabic(text: str) -> str:
"""Normalize Arabic characters."""
return text.translate(char_map)
def remove_diacritics(text: str) -> str:
"""Remove Arabic diacritics."""
return diacritics_pattern.sub("", text)
def remove_punctuation(text: str) -> str:
"""Remove punctuation marks."""
return punctuation_pattern.sub(" ", text)
def reduce_repeated_characters(text: str) -> str:
    """Collapse runs of the same character to a single occurrence."""
    return repeated_char_pattern.sub(r"\1", text)
def remove_stopwords(tokens: list[str]) -> list[str]:
"""Remove Arabic stopwords from tokens."""
return [word for word in tokens if word not in arabic_stopwords]
def stem_tokens(tokens: list[str]) -> list[str]:
"""Apply ISRI stemming to tokens."""
return [stemmer.stem(token) for token in tokens]
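
# ISRI is an aggressive root-extraction stemmer: distinct surface forms of a
# root collapse to one stem, which suits bag-of-words classification features.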
def preprocess_for_classification(text: str) -> str:
"""Preprocess text for classification: normalize, clean, tokenize, stem."""
text = text.strip().lower()
text = normalize_arabic(text)
text = remove_diacritics(text)
text = remove_punctuation(text)
text = reduce_repeated_characters(text)
text = whitespace_pattern.sub(" ", text).strip()
text = re.sub(r"\d+", "", text)
tokens = text.split()
tokens = remove_stopwords(tokens)
tokens = stem_tokens(tokens)
return " ".join(tokens)
def preprocess_for_summarization(text: str) -> str:
    """Light preprocessing for summarization: remove diacritics and numbers."""
    if not isinstance(text, str):
        return ""
    text = text.strip().lower()
    text = remove_diacritics(text)
    # Drop digits before collapsing whitespace so removals can't leave
    # doubled or trailing spaces in the returned text.
    text = re.sub(r"\d+", "", text)
    return whitespace_pattern.sub(" ", text).strip()
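
# Unlike the classification pipeline, summarization keeps punctuation and
# full word forms, so sentence boundaries and surface wording stay intact.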
class ArabicPreprocessor:
"""Arabic text preprocessor with analysis capabilities."""
    def __init__(self):
        # Expose the module-level resources as attributes for inspection.
        self.arabic_stopwords = arabic_stopwords
        self.stemmer = stemmer
        self.char_map = char_map
def preprocess_for_classification(self, text: str) -> str:
"""Preprocess text for classification."""
return preprocess_for_classification(text)
def preprocess_for_summarization(self, text: str) -> str:
"""Preprocess text for summarization."""
return preprocess_for_summarization(text)
def get_preprocessing_steps(self, text: str, task_type: str = "classification") -> dict:
"""Get detailed preprocessing steps for analysis."""
steps = {
"original": text,
"stripped_lowered": text.strip().lower(),
}
current = text.strip().lower()
        if task_type == "classification":
            current = normalize_arabic(current)
            steps["normalized"] = current
            current = remove_diacritics(current)
            steps["diacritics_removed"] = current
            current = remove_punctuation(current)
            steps["punctuation_removed"] = current
            current = reduce_repeated_characters(current)
            steps["repeated_chars_reduced"] = current
            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current
            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current
            tokens = current.split()
            steps["tokenized"] = tokens
            tokens = remove_stopwords(tokens)
            steps["stopwords_removed"] = tokens
            tokens = stem_tokens(tokens)
            steps["stemmed"] = tokens
            steps["final"] = " ".join(tokens)
        elif task_type == "summarization":
            current = remove_diacritics(current)
            steps["diacritics_removed"] = current
            # Mirror preprocess_for_summarization: digits out before the
            # whitespace collapse.
            current = re.sub(r"\d+", "", current)
            steps["numbers_removed"] = current
            current = whitespace_pattern.sub(" ", current).strip()
            steps["whitespace_normalized"] = current
            steps["final"] = current
return steps
def analyze_text(self, text: str) -> dict:
"""Analyze text characteristics and statistics."""
        # Split on sentence-ending punctuation (incl. the Arabic question
        # mark) or newlines.
        original_sentences = re.split(r"[.!؟\n]+", text)
        original_sentences = [s.strip() for s in original_sentences if s.strip()]
        tokens = text.split()
        # Count characters in the core Arabic Unicode block (U+0600-U+06FF).
        arabic_chars = len(re.findall(r"[\u0600-\u06FF]", text))
return {
"character_count": len(text),
"word_count": len(tokens),
"sentence_count": len(original_sentences),
"arabic_character_count": arabic_chars,
"arabic_character_ratio": arabic_chars / len(text) if len(text) > 0 else 0,
"average_word_length": sum(len(word) for word in tokens) / len(tokens) if tokens else 0,
"average_sentence_length": len(tokens) / len(original_sentences) if original_sentences else 0,
"has_diacritics": bool(re.search(r'[\u064B-\u0652]', text)),
"punctuation_count": len(re.findall(r'[^\w\s]', text))
}
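

if __name__ == "__main__":
    # Minimal smoke test; the sample sentence is illustrative only, and the
    # NLTK stopwords corpus must already be downloaded (see note above).
    sample = "الطَّقْسُ جميييل جداً اليوم!! درجة الحرارة 25 مئوية."
    pre = ArabicPreprocessor()
    print("classification:", pre.preprocess_for_classification(sample))
    print("summarization :", pre.preprocess_for_summarization(sample))
    print("analysis      :", pre.analyze_text(sample))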