import re

import numpy as np
import joblib

from typing import Dict, Any

from preprocessor import preprocess_for_summarization


class ArabicSummarizer:
    """Arabic text summarizer using TF-IDF sentence scoring."""

    def __init__(self, vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib"):
        # Load the fitted TF-IDF vectorizer used to score sentences.
        self.vectorizer = joblib.load(vectorizer_path)

    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
        """Summarize text by selecting the top-scored sentences."""
        cleaned_text = preprocess_for_summarization(text)

        # Split on sentence delimiters (period, exclamation, Arabic question mark,
        # newline) and drop empty fragments.
        sentences = re.split(r"[.!؟\n]+", cleaned_text)
        sentences = [s.strip() for s in sentences if s.strip()]

        # If the text is already short enough, return it unchanged.
        if len(sentences) <= num_sentences:
            return {
                "summary": text,
                "original_sentence_count": int(len(sentences)),
                "summary_sentence_count": int(len(sentences)),
                "sentences": sentences,
                "selected_indices": list(range(len(sentences))),
                "sentence_scores": None
            }

        # Score each sentence as the sum of its TF-IDF term weights.
        tfidf_matrix = self.vectorizer.transform(sentences)
        sentence_scores = tfidf_matrix.sum(axis=1).A1

        # Take the highest-scoring sentences, then restore document order.
        top_indices = np.argsort(sentence_scores)[-num_sentences:][::-1]
        selected_indices = sorted(top_indices)
        top_sentences = [sentences[i] for i in selected_indices]

        return {
            "summary": " ".join(top_sentences),
            "original_sentence_count": int(len(sentences)),
            "summary_sentence_count": int(len(top_sentences)),
            "sentences": sentences,
            "selected_indices": [int(i) for i in selected_indices],
            "sentence_scores": sentence_scores.tolist(),
            "top_sentence_scores": [float(sentence_scores[i]) for i in selected_indices]
        }

    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
        """Get detailed analysis of all sentences with scores and rankings."""
        cleaned_text = preprocess_for_summarization(text)

        sentences = re.split(r"[.!؟\n]+", cleaned_text)
        sentences = [s.strip() for s in sentences if s.strip()]

        if not sentences:
            return {"error": "No sentences found in text"}

        # Score sentences exactly as in summarize().
        tfidf_matrix = self.vectorizer.transform(sentences)
        sentence_scores = tfidf_matrix.sum(axis=1).A1

        # Precompute each sentence's rank (1 = highest score) instead of
        # re-sorting inside the loop.
        rank_of = {int(idx): rank + 1 for rank, idx in enumerate(np.argsort(sentence_scores)[::-1])}

        sentence_analysis = []
        for i, (sentence, score) in enumerate(zip(sentences, sentence_scores)):
            sentence_analysis.append({
                "index": int(i),
                "sentence": sentence,
                "score": float(score),
                "rank": rank_of[i]
            })

        return {
            "sentences": sentence_analysis,
            "total_sentences": int(len(sentences)),
            "score_statistics": {
                "mean": float(np.mean(sentence_scores)),
                "std": float(np.std(sentence_scores)),
                "min": float(np.min(sentence_scores)),
                "max": float(np.max(sentence_scores))
            }
        }
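

# Minimal usage sketch, not part of the module's public API. It assumes the
# vectorizer artifact exists at the default path passed to __init__ and that
# preprocessor.preprocess_for_summarization is importable; the sample text is
# a hypothetical placeholder to be replaced with real Arabic input.
if __name__ == "__main__":
    summarizer = ArabicSummarizer()

    # Placeholder multi-sentence Arabic text (hypothetical example input).
    sample_text = "الجملة الأولى للتجربة. الجملة الثانية للتجربة. الجملة الثالثة للتجربة. الجملة الرابعة للتجربة."

    result = summarizer.summarize(sample_text, num_sentences=2)
    print(result["summary"])
    print(result["selected_indices"])

    analysis = summarizer.get_sentence_analysis(sample_text)
    print(analysis["score_statistics"])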