File size: 3,163 Bytes
354c6a0 5fc9256 354c6a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import re
import numpy as np
import joblib
from typing import Dict, Any
from preprocessor import preprocess_for_summarization
class ArabicSummarizer:
    """Arabic extractive summarizer using TF-IDF sentence scoring.

    Text is preprocessed, split into sentences on Arabic/Latin terminators,
    and each sentence is scored by the sum of its TF-IDF term weights under
    a pre-fitted vectorizer. The top-scoring sentences are returned in their
    original document order.
    """

    # Sentence boundaries: period, exclamation mark, Arabic question mark, newline.
    _SENTENCE_SPLIT_RE = re.compile(r"[.!؟\n]+")

    def __init__(self, vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib"):
        """Load the pre-fitted TF-IDF vectorizer from *vectorizer_path*.

        Raises whatever ``joblib.load`` raises if the file is missing or corrupt.
        """
        self.vectorizer = joblib.load(vectorizer_path)

    def _split_sentences(self, text: str) -> list:
        """Preprocess *text* and split it into non-empty, stripped sentences."""
        cleaned_text = preprocess_for_summarization(text)
        parts = self._SENTENCE_SPLIT_RE.split(cleaned_text)
        return [s.strip() for s in parts if s.strip()]

    def _score_sentences(self, sentences) -> np.ndarray:
        """Score each sentence as the sum of its TF-IDF term weights.

        Returns a 1-D float array aligned with *sentences* (``.A1`` flattens
        the (n, 1) sparse row-sum matrix to a dense vector).
        """
        tfidf_matrix = self.vectorizer.transform(sentences)
        return tfidf_matrix.sum(axis=1).A1

    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
        """Summarize *text* by selecting its top-scored sentences.

        Args:
            text: Raw Arabic text to summarize.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            Dict with the summary string, sentence counts, the split
            sentences, selected indices (document order), and per-sentence
            scores. If the text has no more than *num_sentences* sentences,
            the original text is returned unchanged and scores are ``None``.
        """
        sentences = self._split_sentences(text)
        if len(sentences) <= num_sentences:
            # Nothing to trim: echo the original text and keep every sentence.
            return {
                "summary": text,
                "original_sentence_count": int(len(sentences)),
                "summary_sentence_count": int(len(sentences)),
                "sentences": sentences,
                "selected_indices": list(range(len(sentences))),
                "sentence_scores": None
            }
        sentence_scores = self._score_sentences(sentences)
        # Indices of the num_sentences highest-scoring sentences, re-sorted
        # into document order so the summary reads coherently.
        selected = sorted(int(i) for i in np.argsort(sentence_scores)[-num_sentences:])
        top_sentences = [sentences[i] for i in selected]
        return {
            "summary": " ".join(top_sentences),
            "original_sentence_count": int(len(sentences)),
            "summary_sentence_count": int(len(top_sentences)),
            "sentences": sentences,
            "selected_indices": selected,
            "sentence_scores": sentence_scores.tolist(),
            "top_sentence_scores": [float(sentence_scores[i]) for i in selected]
        }

    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
        """Get detailed analysis of all sentences with scores and rankings.

        Returns a dict with one entry per sentence (index, text, score,
        1-based rank where rank 1 is the highest score) plus aggregate score
        statistics, or ``{"error": ...}`` when no sentences are found.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return {"error": "No sentences found in text"}
        sentence_scores = self._score_sentences(sentences)
        # Rank table computed once via the inverse permutation of a single
        # descending argsort — previously argsort + list .index() ran inside
        # the loop, making the whole analysis O(n^2).
        order = np.argsort(sentence_scores)[::-1]
        ranks = np.empty(len(order), dtype=int)
        ranks[order] = np.arange(1, len(order) + 1)
        sentence_analysis = [
            {
                "index": int(i),
                "sentence": sentence,
                "score": float(score),
                "rank": int(ranks[i])
            }
            for i, (sentence, score) in enumerate(zip(sentences, sentence_scores))
        ]
        return {
            "sentences": sentence_analysis,
            "total_sentences": int(len(sentences)),
            "score_statistics": {
                "mean": float(np.mean(sentence_scores)),
                "std": float(np.std(sentence_scores)),
                "min": float(np.min(sentence_scores)),
                "max": float(np.max(sentence_scores))
            }
        }
|