# mabosaimi's picture
# Fkhrayef (#1)
# 5fc9256 verified
import re
import numpy as np
import joblib
from typing import Dict, Any
from preprocessor import preprocess_for_summarization
class ArabicSummarizer:
    """Extractive Arabic text summarizer using TF-IDF scoring.

    Every sentence is scored by the sum of the weights the loaded vectorizer
    assigns to it; the highest-scoring sentences are kept, in their original
    order, as the summary.
    """

    # Sentence boundaries: full stop, exclamation mark, Arabic question mark
    # and newline. A run of boundary characters counts as one boundary.
    _SENTENCE_BOUNDARY = re.compile(r"[.!؟\n]+")

    def __init__(
        self,
        vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib",
    ):
        """Load the vectorizer stored at *vectorizer_path*.

        Args:
            vectorizer_path: Path to a joblib-serialized vectorizer exposing
                a ``transform(list_of_str)`` method.
        """
        # NOTE(review): joblib.load unpickles the file, which can execute
        # arbitrary code -- only point this at trusted model files.
        self.vectorizer = joblib.load(vectorizer_path)

    def _split_sentences(self, text: str) -> list:
        """Preprocess *text* and return its stripped, non-empty sentences."""
        cleaned_text = preprocess_for_summarization(text)
        stripped = (part.strip() for part in self._SENTENCE_BOUNDARY.split(cleaned_text))
        return [part for part in stripped if part]

    def _score_sentences(self, sentences: list) -> np.ndarray:
        """Return a 1-D array holding the summed vectorizer weights per sentence."""
        tfidf_matrix = self.vectorizer.transform(sentences)
        # asarray + ravel flattens the row sums whether they come back as an
        # np.matrix (scipy sparse matrices) or as a plain ndarray.
        return np.asarray(tfidf_matrix.sum(axis=1)).ravel()

    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
        """Summarize text by selecting top-scored sentences.

        Args:
            text: Raw input text.
            num_sentences: Maximum number of sentences to keep. Values below
                zero are treated as zero.

        Returns:
            A dict with the keys ``summary``, ``original_sentence_count``,
            ``summary_sentence_count``, ``sentences``, ``selected_indices``,
            ``sentence_scores`` and ``top_sentence_scores``. When the text has
            no more sentences than requested, the original *text* is returned
            unchanged as the summary and both score entries are ``None``
            because no scoring is performed.
        """
        sentences = self._split_sentences(text)
        # Guard against non-positive requests: a slice like [-0:] would
        # otherwise select every sentence instead of none.
        num_sentences = max(num_sentences, 0)

        if len(sentences) <= num_sentences:
            return {
                "summary": text,
                "original_sentence_count": len(sentences),
                "summary_sentence_count": len(sentences),
                "sentences": sentences,
                "selected_indices": list(range(len(sentences))),
                "sentence_scores": None,
                # Same key set as the scored path below.
                "top_sentence_scores": None,
            }

        sentence_scores = self._score_sentences(sentences)
        # Highest scores first, keep the best `num_sentences`, then restore
        # document order so the summary reads naturally.
        best_first = np.argsort(sentence_scores)[::-1][:num_sentences]
        selected = sorted(int(i) for i in best_first)
        top_sentences = [sentences[i] for i in selected]

        return {
            "summary": " ".join(top_sentences),
            "original_sentence_count": len(sentences),
            "summary_sentence_count": len(top_sentences),
            "sentences": sentences,
            "selected_indices": selected,
            "sentence_scores": sentence_scores.tolist(),
            "top_sentence_scores": [float(sentence_scores[i]) for i in selected],
        }

    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
        """Get detailed analysis of all sentences with scores and rankings.

        Args:
            text: Raw input text.

        Returns:
            ``{"error": ...}`` when no sentence is found. Otherwise a dict
            with ``sentences`` (one entry per sentence holding ``index``,
            ``sentence``, ``score`` and ``rank``, where rank 1 is the highest
            score), ``total_sentences`` and ``score_statistics``
            (``mean``, ``std``, ``min``, ``max``).
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return {"error": "No sentences found in text"}

        sentence_scores = self._score_sentences(sentences)

        # Compute every rank with a single argsort instead of re-sorting and
        # searching the ordering once per sentence.
        best_first = np.argsort(sentence_scores)[::-1]
        ranks = np.empty(len(sentences), dtype=int)
        ranks[best_first] = np.arange(1, len(sentences) + 1)

        sentence_analysis = [
            {
                "index": index,
                "sentence": sentence,
                "score": float(score),
                "rank": int(ranks[index]),
            }
            for index, (sentence, score) in enumerate(zip(sentences, sentence_scores))
        ]

        return {
            "sentences": sentence_analysis,
            "total_sentences": len(sentences),
            "score_statistics": {
                "mean": float(np.mean(sentence_scores)),
                "std": float(np.std(sentence_scores)),
                "min": float(np.min(sentence_scores)),
                "max": float(np.max(sentence_scores)),
            },
        }