File size: 3,163 Bytes
354c6a0 5fc9256 354c6a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import re
import numpy as np
import joblib
from typing import Dict, Any
from preprocessor import preprocess_for_summarization
class ArabicSummarizer:
    """Arabic extractive summarizer using TF-IDF sentence scoring.

    Text is preprocessed, split into sentences on Arabic/Latin terminators,
    and each sentence is scored by the sum of its TF-IDF term weights under
    a pre-fitted vectorizer. The top-scoring sentences are returned in their
    original document order.
    """

    # Sentence boundaries: period, exclamation mark, Arabic question mark, newline.
    _SENTENCE_SPLIT_RE = re.compile(r"[.!؟\n]+")

    def __init__(self, vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib"):
        """Load the pre-fitted TF-IDF vectorizer from *vectorizer_path*.

        Raises whatever ``joblib.load`` raises if the file is missing or corrupt.
        """
        self.vectorizer = joblib.load(vectorizer_path)

    def _split_sentences(self, text: str) -> list:
        """Preprocess *text* and split it into non-empty, stripped sentences."""
        cleaned_text = preprocess_for_summarization(text)
        parts = self._SENTENCE_SPLIT_RE.split(cleaned_text)
        return [s.strip() for s in parts if s.strip()]

    def _score_sentences(self, sentences) -> np.ndarray:
        """Score each sentence as the sum of its TF-IDF term weights.

        Returns a 1-D float array aligned with *sentences* (``.A1`` flattens
        the (n, 1) sparse row-sum matrix to a dense vector).
        """
        tfidf_matrix = self.vectorizer.transform(sentences)
        return tfidf_matrix.sum(axis=1).A1

    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
        """Summarize *text* by selecting its top-scored sentences.

        Args:
            text: Raw Arabic text to summarize.
            num_sentences: Maximum number of sentences to keep.

        Returns:
            Dict with the summary string, sentence counts, the split
            sentences, selected indices (document order), and per-sentence
            scores. If the text has no more than *num_sentences* sentences,
            the original text is returned unchanged and scores are ``None``.
        """
        sentences = self._split_sentences(text)
        if len(sentences) <= num_sentences:
            # Nothing to trim: echo the original text and keep every sentence.
            return {
                "summary": text,
                "original_sentence_count": int(len(sentences)),
                "summary_sentence_count": int(len(sentences)),
                "sentences": sentences,
                "selected_indices": list(range(len(sentences))),
                "sentence_scores": None
            }
        sentence_scores = self._score_sentences(sentences)
        # Indices of the num_sentences highest-scoring sentences, re-sorted
        # into document order so the summary reads coherently.
        selected = sorted(int(i) for i in np.argsort(sentence_scores)[-num_sentences:])
        top_sentences = [sentences[i] for i in selected]
        return {
            "summary": " ".join(top_sentences),
            "original_sentence_count": int(len(sentences)),
            "summary_sentence_count": int(len(top_sentences)),
            "sentences": sentences,
            "selected_indices": selected,
            "sentence_scores": sentence_scores.tolist(),
            "top_sentence_scores": [float(sentence_scores[i]) for i in selected]
        }

    def get_sentence_analysis(self, text: str) -> Dict[str, Any]:
        """Get detailed analysis of all sentences with scores and rankings.

        Returns a dict with one entry per sentence (index, text, score,
        1-based rank where rank 1 is the highest score) plus aggregate score
        statistics, or ``{"error": ...}`` when no sentences are found.
        """
        sentences = self._split_sentences(text)
        if not sentences:
            return {"error": "No sentences found in text"}
        sentence_scores = self._score_sentences(sentences)
        # Rank table computed once via the inverse permutation of a single
        # descending argsort — previously argsort + list .index() ran inside
        # the loop, making the whole analysis O(n^2).
        order = np.argsort(sentence_scores)[::-1]
        ranks = np.empty(len(order), dtype=int)
        ranks[order] = np.arange(1, len(order) + 1)
        sentence_analysis = [
            {
                "index": int(i),
                "sentence": sentence,
                "score": float(score),
                "rank": int(ranks[i])
            }
            for i, (sentence, score) in enumerate(zip(sentences, sentence_scores))
        ]
        return {
            "sentences": sentence_analysis,
            "total_sentences": int(len(sentences)),
            "score_statistics": {
                "mean": float(np.mean(sentence_scores)),
                "std": float(np.std(sentence_scores)),
                "min": float(np.min(sentence_scores)),
                "max": float(np.max(sentence_scores))
            }
        }
|