|
import torch |
|
import numpy as np |
|
import re |
|
from typing import Dict, List, Any |
|
from transformers import BertTokenizer, BertModel |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from preprocessor import preprocess_for_summarization |
|
|
|
|
|
class BERTExtractiveSummarizer:
    """Extractive summarizer for Arabic text built on a pretrained BERT encoder.

    Each sentence is embedded with the encoder's hidden state at token
    position 0 (the ``[CLS]`` token for BERT tokenizers).  Sentences are then
    scored by cosine similarity against the mean of all sentence embeddings,
    and the highest-scoring ones are returned in their original document
    order.
    """

    # Sentence delimiters: '.', '!', the Arabic question mark and newlines.
    # Compiled once at class-definition time instead of on every call.
    # NOTE(review): the Latin '?' is not a delimiter here -- confirm whether
    # mixed-script input should also be split on it.
    _SENTENCE_SPLIT_PATTERN = re.compile(r'[.!؟\n]+')

    def __init__(self, model_name='aubmindlab/bert-base-arabertv02'):
        """Initialize BERT-based Arabic summarizer.

        Args:
            model_name: Name or path passed to ``from_pretrained`` for both
                the tokenizer and the encoder.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Using device: {self.device}")

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name)
        self.model.to(self.device)
        # The model is only used for inference, so switch it to eval mode.
        self.model.eval()

    @classmethod
    def _split_sentences(cls, text: str) -> List[str]:
        """Split *text* on sentence delimiters and drop blank fragments."""
        fragments = (part.strip() for part in cls._SENTENCE_SPLIT_PATTERN.split(text))
        return [fragment for fragment in fragments if fragment]

    @staticmethod
    def _top_k_indices(similarities, k: int) -> np.ndarray:
        """Return the indices of the *k* largest scores (ascending by score).

        A non-positive *k* yields an empty array.  Without this guard,
        ``ranked[-0:]`` would select *every* index instead of none.
        """
        ranked = np.argsort(np.asarray(similarities))
        if k <= 0:
            return ranked[:0]
        return ranked[-k:]

    @staticmethod
    def _build_result(
        summary: str,
        sentences: List[str],
        selected_indices: List[int],
        selected_scores: List[float],
    ) -> Dict[str, Any]:
        """Assemble the result dict so every code path returns the same keys."""
        return {
            "summary": summary,
            "original_sentence_count": len(sentences),
            "summary_sentence_count": len(selected_indices),
            "sentences": sentences,
            "selected_indices": selected_indices,
            "sentence_scores": selected_scores,
            # Separate list object so callers mutating one do not alter the other.
            "top_sentence_scores": list(selected_scores),
        }

    def get_sentence_embeddings(self, sentences: List[str]) -> np.ndarray:
        """Get BERT embeddings for sentences.

        Args:
            sentences: Sentences to embed; each one is encoded on its own and
                truncated to at most 512 tokens.

        Returns:
            Array with one row per sentence holding the encoder's last hidden
            state at token position 0.  An empty input yields an empty array
            of shape ``(0,)``.
        """
        embeddings = []

        # Gradients are never needed for inference.
        with torch.no_grad():
            for sentence in sentences:
                inputs = self.tokenizer(
                    sentence,
                    return_tensors='pt',
                    max_length=512,
                    truncation=True,
                    padding=True,
                ).to(self.device)

                outputs = self.model(**inputs)

                # Batch size is 1, so index the single item explicitly rather
                # than relying on squeeze() to drop the batch axis.
                first_token_state = outputs.last_hidden_state[0, 0, :]
                embeddings.append(first_token_state.cpu().numpy())

        return np.array(embeddings)

    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
        """Summarize Arabic text using BERT extractive summarization.

        Returns the same structure as other summarizers for consistency.

        Args:
            text: Raw input text.  It is first passed through the project's
                ``preprocess_for_summarization`` helper.
            num_sentences: Maximum number of sentences to keep.  Values below
                zero are treated as zero, which produces an empty summary.

        Returns:
            A dict with the keys ``summary``, ``original_sentence_count``,
            ``summary_sentence_count``, ``sentences``, ``selected_indices``,
            ``sentence_scores`` and ``top_sentence_scores``.  When the text
            has no more sentences than requested, the whole preprocessed text
            is returned and every sentence gets a score of ``1.0``.
        """
        print(f"BERT Summarizer: Processing text with {len(text)} characters")

        cleaned_text = preprocess_for_summarization(text)
        print(f"BERT Summarizer: After preprocessing: '{cleaned_text[:100]}...'")

        sentences = self._split_sentences(cleaned_text)
        print(f"BERT Summarizer: Found {len(sentences)} sentences")

        # A negative request makes no sense as a slice bound; treat it as zero.
        num_sentences = max(num_sentences, 0)

        # Short document: nothing to rank, return everything unchanged.
        if len(sentences) <= num_sentences:
            print(f"BERT Summarizer: Returning all {len(sentences)} sentences (fewer than requested)")
            return self._build_result(
                cleaned_text.strip(),
                sentences,
                list(range(len(sentences))),
                [1.0] * len(sentences),
            )

        # Zero sentences requested from a non-empty document: skip the
        # (expensive) encoder entirely and return an empty selection.
        if num_sentences == 0:
            print("BERT Summarizer: No sentences requested; returning empty summary")
            return self._build_result("", sentences, [], [])

        print("BERT Summarizer: Getting sentence embeddings...")
        sentence_embeddings = self.get_sentence_embeddings(sentences)
        print(f"BERT Summarizer: Got embeddings shape: {sentence_embeddings.shape}")

        # The document is represented by the centroid of its sentence vectors;
        # sentences closest to the centroid are considered most representative.
        doc_embedding = np.mean(sentence_embeddings, axis=0)
        similarities = cosine_similarity([doc_embedding], sentence_embeddings)[0]
        print(f"BERT Summarizer: Similarity scores: {similarities}")

        top_indices = self._top_k_indices(similarities, num_sentences)
        print(f"BERT Summarizer: Top indices: {top_indices}")

        # Restore document order and convert numpy integers to plain ints so
        # the result is JSON-serializable.
        top_indices_sorted = sorted(int(i) for i in top_indices)
        print(f"BERT Summarizer: Selected indices (in order): {top_indices_sorted}")

        selected_sentences = [sentences[i] for i in top_indices_sorted]
        selected_scores = [float(similarities[i]) for i in top_indices_sorted]
        print(f"BERT Summarizer: Selected sentences: {[s[:50] + '...' for s in selected_sentences]}")

        return self._build_result(
            ' '.join(selected_sentences),
            sentences,
            top_indices_sorted,
            selected_scores,
        )