import re
from typing import Any, Dict, List

import joblib
import numpy as np

from preprocessor import preprocess_for_summarization


class ArabicSummarizer:
    """Extractive Arabic text summarizer using TF-IDF sentence scoring.

    Each sentence is scored by summing the weights the loaded vectorizer
    assigns to it; the highest-scoring sentences form the summary and are
    emitted in their original document order.
    """

    # Sentence boundaries: full stop, exclamation mark, Arabic question mark
    # and newline. Runs of consecutive delimiters count as a single boundary.
    _SENTENCE_BOUNDARY = re.compile(r"[.!؟\n]+")

    def __init__(
        self,
        vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib",
    ):
        """Load the fitted vectorizer from ``vectorizer_path``.

        Args:
            vectorizer_path: Path to a joblib-serialized vectorizer. The
                object must provide ``transform(list_of_str)`` (presumably a
                fitted scikit-learn ``TfidfVectorizer`` -- confirm against
                the training code).
        """
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code. Only point this at model files you trust.
        self.vectorizer = joblib.load(vectorizer_path)

    def _split_sentences(self, text: str) -> List[str]:
        """Preprocess ``text`` and return its non-empty, stripped sentences."""
        cleaned_text = preprocess_for_summarization(text)
        stripped = (part.strip() for part in self._SENTENCE_BOUNDARY.split(cleaned_text))
        return [sentence for sentence in stripped if sentence]

    def _score_sentences(self, sentences: List[str]) -> np.ndarray:
        """Return a 1-D array holding the summed vectorizer weights per sentence."""
        tfidf_matrix = self.vectorizer.transform(sentences)
        # Row sums may come back as an (n, 1) np.matrix or as an ndarray
        # depending on the matrix type; asarray + ravel flattens either.
        return np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
        """Summarize ``text`` by selecting its top-scored sentences.

        Args:
            text: Raw input text.
            num_sentences: Maximum number of sentences in the summary.
                Must be at least 1.

        Returns:
            A dict with the keys ``summary``, ``original_sentence_count``,
            ``summary_sentence_count``, ``sentences``, ``selected_indices``,
            ``sentence_scores`` and ``top_sentence_scores``.

            When the text has no more sentences than ``num_sentences``, no
            scoring is performed: ``summary`` is the original, unprocessed
            ``text`` and both score entries are ``None``.

        Raises:
            ValueError: If ``num_sentences`` is less than 1.
        """
        # A non-positive count would make the ``[-num_sentences:]`` slice
        # below select the wrong sentences (``[-0:]`` is the whole array),
        # so reject it explicitly.
        if num_sentences < 1:
            raise ValueError(
                f"num_sentences must be at least 1, got {num_sentences}"
            )

        sentences = self._split_sentences(text)
        sentence_count = len(sentences)

        if sentence_count <= num_sentences:
            # Nothing to drop: hand the input back unchanged.
            return {
                "summary": text,
                "original_sentence_count": sentence_count,
                "summary_sentence_count": sentence_count,
                "sentences": sentences,
                "selected_indices": list(range(sentence_count)),
                "sentence_scores": None,
                # Present (as None) so both return shapes share the same keys.
                "top_sentence_scores": None,
            }

        sentence_scores = self._score_sentences(sentences)
        # argsort is ascending, so the tail holds the best-scoring sentences;
        # sorting the chosen indices restores document order.
        selected = sorted(int(i) for i in np.argsort(sentence_scores)[-num_sentences:])
        top_sentences = [sentences[i] for i in selected]

        return {
            "summary": " ".join(top_sentences),
            "original_sentence_count": sentence_count,
            "summary_sentence_count": len(top_sentences),
            "sentences": sentences,
            "selected_indices": selected,
            "sentence_scores": sentence_scores.tolist(),
            "top_sentence_scores": [float(sentence_scores[i]) for i in selected],
        }

    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
        """Return per-sentence scores and rankings for ``text``.

        Args:
            text: Raw input text.

        Returns:
            ``{"error": "No sentences found in text"}`` when no sentence can
            be extracted. Otherwise a dict with:

            * ``sentences``: one entry per sentence holding ``index``,
              ``sentence``, ``score`` and ``rank`` (1 = highest score; ties
              are ordered as ``numpy.argsort`` returns them),
            * ``total_sentences``: number of sentences,
            * ``score_statistics``: ``mean``, ``std``, ``min`` and ``max``
              of the scores.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return {"error": "No sentences found in text"}

        sentence_scores = self._score_sentences(sentences)

        # Sort once, then scatter the 1-based ranks back to sentence
        # positions instead of re-sorting and searching for every sentence.
        descending_order = np.argsort(sentence_scores)[::-1]
        ranks = np.empty(len(sentences), dtype=int)
        ranks[descending_order] = np.arange(1, len(sentences) + 1)

        sentence_analysis = [
            {
                "index": position,
                "sentence": sentence,
                "score": float(score),
                "rank": int(rank),
            }
            for position, (sentence, score, rank) in enumerate(
                zip(sentences, sentence_scores, ranks)
            )
        ]

        return {
            "sentences": sentence_analysis,
            "total_sentences": len(sentences),
            "score_statistics": {
                "mean": float(np.mean(sentence_scores)),
                "std": float(np.std(sentence_scores)),
                "min": float(np.min(sentence_scores)),
                "max": float(np.max(sentence_scores)),
            },
        }