#1 by Fkhrayef - opened
- __pycache__/app.cpython-312.pyc +0 -0
- __pycache__/bert_summarizer.cpython-312.pyc +0 -0
- __pycache__/debug_test.cpython-312.pyc +0 -0
- __pycache__/examples.cpython-312.pyc +0 -0
- __pycache__/model_manager.cpython-312.pyc +0 -0
- __pycache__/modern_classifier.cpython-312.pyc +0 -0
- __pycache__/preprocessor.cpython-312.pyc +0 -0
- __pycache__/seq2seq_summarizer.cpython-312.pyc +0 -0
- __pycache__/summarizer.cpython-312.pyc +0 -0
- __pycache__/traditional_classifier.cpython-312.pyc +0 -0
- app.py +272 -258
- bert_summarizer.py +114 -0
- examples.py +2 -2
- model_manager.py +4 -4
- models/Seq2seq/seq2seq_config.json +1 -0
- models/Seq2seq/seq2seq_model.h5 +3 -0
- models/Seq2seq/src_tokenizer.pkl +3 -0
- models/Seq2seq/tgt_tokenizer.pkl +3 -0
- modern_bert_classifier.safetensors → models/modern_bert_classifier.safetensors +0 -0
- modern_lstm_classifier.pth → models/modern_lstm_classifier.pth +0 -0
- traditional_svm_classifier.joblib → models/traditional_svm_classifier.joblib +0 -0
- traditional_tfidf_vectorizer_classifier.joblib → models/traditional_tfidf_vectorizer_classifier.joblib +0 -0
- traditional_tfidf_vectorizer_summarization.joblib → models/traditional_tfidf_vectorizer_summarization.joblib +0 -0
- modern_classifier.py +76 -7
- summarizer.py +1 -1
- traditional_classifier.py +2 -2
__pycache__/app.cpython-312.pyc
ADDED
Binary file (17 kB)

__pycache__/bert_summarizer.cpython-312.pyc
ADDED
Binary file (5.94 kB)

__pycache__/debug_test.cpython-312.pyc
ADDED
Binary file (3.86 kB)

__pycache__/examples.cpython-312.pyc
ADDED
Binary file (7.97 kB)

__pycache__/model_manager.cpython-312.pyc
ADDED
Binary file (7.56 kB)

__pycache__/modern_classifier.cpython-312.pyc
ADDED
Binary file (17.3 kB)

__pycache__/preprocessor.cpython-312.pyc
ADDED
Binary file (8.7 kB)

__pycache__/seq2seq_summarizer.cpython-312.pyc
ADDED
Binary file (9.74 kB)

__pycache__/summarizer.cpython-312.pyc
ADDED
Binary file (4.7 kB)

__pycache__/traditional_classifier.cpython-312.pyc
ADDED
Binary file (8.89 kB)
app.py
CHANGED
@@ -7,6 +7,7 @@ from summarizer import ArabicSummarizer
 from preprocessor import ArabicPreprocessor
 from model_manager import ModelManager
 from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
+from bert_summarizer import BERTExtractiveSummarizer


 class TaskType(str, Enum):
@@ -14,80 +15,74 @@ class TaskType(str, Enum):
     SUMMARIZATION = "summarization"


-[…]
+# New enums for frontend compatibility
+class ClassificationModelType(str, Enum):
     TRADITIONAL_SVM = "traditional_svm"
-    MODERN_BERT = "modern_bert"
     MODERN_LSTM = "modern_lstm"
+    MODERN_BERT = "modern_bert"


-[…]
-)
-
-model_manager = ModelManager(default_model="traditional_svm")
-summarizer = ArabicSummarizer("traditional_tfidf_vectorizer_summarization.joblib")
-preprocessor = ArabicPreprocessor()
+class SummarizationModelType(str, Enum):
+    TRADITIONAL_TFIDF = "traditional_tfidf"
+    MODERN_SEQ2SEQ = "modern_seq2seq"
+    MODERN_BERT = "modern_bert"


-[…]
+# Request models
+class PreprocessRequest(BaseModel):
     text: str
-[…]
+    task_type: TaskType

-    model_config = {
-[…]
+    model_config = {
+        "json_schema_extra": {"example": {"text": "هذا نص عربي للمعالجة", "task_type": "classification"}}
+    }


-class […]
+class ClassificationRequest(BaseModel):
     text: str
-    model: Optional[ModelType] = None
+    model: ClassificationModelType

-    model_config = {
-        "json_schema_extra": {"example": REQUEST_EXAMPLES["text_input_with_sentences"]}
-    }
+    model_config = {"json_schema_extra": {"example": {"text": "هذا نص عربي للتصنيف", "model": "traditional_svm"}}}


-class […]
-
-    model_config = {
-        "json_schema_extra": {"example": REQUEST_EXAMPLES["batch_text_input"]}
-    }
+class SummarizationRequest(BaseModel):
+    text: str
+    num_sentences: int = 3
+    model: SummarizationModelType
+
+    model_config = {"json_schema_extra": {"example": {"text": "هذا نص عربي طويل للتلخيص", "num_sentences": 3, "model": "traditional_tfidf"}}}


+# Response models
+class PreprocessingSteps(BaseModel):
+    original: str
+    stripped_lowered: Optional[str] = None
+    normalized: Optional[str] = None
+    diacritics_removed: Optional[str] = None
+    punctuation_removed: Optional[str] = None
+    repeated_chars_reduced: Optional[str] = None
+    whitespace_normalized: Optional[str] = None
+    numbers_removed: Optional[str] = None
+    tokenized: Optional[List[str]] = None
+    stopwords_removed: Optional[List[str]] = None
+    stemmed: Optional[List[str]] = None
+    final: str
+
+
+class PreprocessingResponse(BaseModel):
+    task_type: str
+    preprocessing_steps: PreprocessingSteps


 class ClassificationResponse(BaseModel):
     prediction: str
-    prediction_index: int
     confidence: float
     probability_distribution: Dict[str, float]
     cleaned_text: str
     model_used: str
-
-    model_config = {
-        "protected_namespaces": (),
-        "json_schema_extra": {
-            "example": RESPONSE_EXAMPLES["classification"],
-            "schema_extra": {
-                "properties": {
-                    "prediction_index": {
-                        "description": "Numerical index of the predicted class (0=culture, 1=economy, 2=international, 3=local, 4=religion, 5=sports)"
-                    }
-                }
-            },
-        },
-    }
+    # Optional fields for extra info
+    prediction_index: Optional[int] = None
+    prediction_metadata: Optional[Dict[str, Any]] = None


 class SummarizationResponse(BaseModel):
@@ -96,89 +91,142 @@ class SummarizationResponse(BaseModel):
     summary_sentence_count: int
     sentences: List[str]
     selected_indices: List[int]
-    sentence_scores: […]
-    top_sentence_scores: Optional[List[float]]
+    sentence_scores: List[float]
     model_used: str
+    # Optional fields for extra info
+    top_sentence_scores: Optional[List[float]] = None

-    model_config = {
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["summarization"]}
-    }
-
-
-class TextAnalysisResponse(BaseModel):
-    text: str
-    analysis: Dict[str, Any]
-
-    model_config = {
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["text_analysis"]}
-    }
-
-
-class BatchClassificationResponse(BaseModel):
-    results: List[ClassificationResponse]
-    total_texts: int
-    model_used: str
-
-    model_config = {
-        "protected_namespaces": (),
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["batch_classification"]},
-    }
-
-
-class […]
-    model_config = {
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["sentence_analysis"]}
-    }
-
-
-class CompleteAnalysisResponse(BaseModel):
-    original_text: str
-    text_analysis: Dict[str, Any]
-    classification: ClassificationResponse
-    summarization: SummarizationResponse
-
-[…]
-
-class AvailableModelsResponse(BaseModel):
-    models: Dict[str, Any]
-    current_model: str


+app = FastAPI(
+    title="Arabic Text Analysis API",
+    description="API for Arabic text classification, summarization, and preprocessing with multiple model support",
+    version="1.0.0",
+)
+
+model_manager = ModelManager(default_model="traditional_svm")
+summarizer = ArabicSummarizer("models/traditional_tfidf_vectorizer_summarization.joblib")
+preprocessor = ArabicPreprocessor()
+
+
+# Summarizer manager for model dispatch
+class SummarizerManager:
+    """Manages different types of Arabic text summarizers."""
+
+    def __init__(self):
+        # Initialize the traditional TF-IDF summarizer
+        self.traditional_tfidf = ArabicSummarizer("models/traditional_tfidf_vectorizer_summarization.joblib")
+
+        # Initialize BERT summarizer (lazy loading to avoid startup delays)
+        self.bert_summarizer = None
+
+    def get_summarizer(self, model_type: str):
+        """Get summarizer based on model type."""
+        if model_type == "traditional_tfidf":
+            return self.traditional_tfidf
+        elif model_type == "modern_seq2seq":
+            # TODO: Implement seq2seq summarizer
+            # For now, fallback to TF-IDF
+            return self.traditional_tfidf
+        elif model_type == "modern_bert":
+            # Initialize BERT summarizer on first use
+            if self.bert_summarizer is None:
+                try:
+                    print("Loading BERT summarizer...")
+                    self.bert_summarizer = BERTExtractiveSummarizer()
+                    print("BERT summarizer loaded successfully!")
+                except Exception as e:
+                    print(f"Failed to load BERT summarizer: {e}")
+                    raise ValueError(f"BERT summarizer initialization failed: {e}")
+            return self.bert_summarizer
+        else:
+            raise ValueError(f"Unknown summarizer model: {model_type}")
+
+    def summarize(self, text: str, num_sentences: int, model_type: str) -> Dict[str, Any]:
+        """Summarize text using the specified model."""
+        try:
+            print(f"SummarizerManager: Using model '{model_type}' for text with {len(text)} characters")
+            summarizer_instance = self.get_summarizer(model_type)
+            result = summarizer_instance.summarize(text, num_sentences)
+
+            # Add debugging info
+            print(f"SummarizerManager: {model_type} selected indices: {result.get('selected_indices', [])}")
+            print(f"SummarizerManager: {model_type} summary preview: '{result.get('summary', '')[:100]}...'")
+
+            # Ensure sentence_scores is always a list (not None)
+            if result.get("sentence_scores") is None:
+                result["sentence_scores"] = []
+
+            return result
+        except Exception as e:
+            # If BERT fails, provide helpful error message
+            if model_type == "modern_bert":
+                raise ValueError(f"BERT summarization failed: {str(e)}. This might be due to missing dependencies (torch, transformers) or network issues downloading the model.")
+            else:
+                raise
+
+
+summarizer_manager = SummarizerManager()
+
+
+# Check which models are actually available
+def check_model_availability():
+    """Check which models are actually available and working."""
+    available_models = {
+        "traditional_svm": True,  # Always available
+        "modern_lstm": True,  # Always available
+        "modern_bert": False  # Will be checked
+    }
+
+    # Test BERT model availability
+    try:
+        from modern_classifier import ModernClassifier
+        # Try to create a BERT classifier instance
+        bert_classifier = ModernClassifier("bert", "models/modern_bert_classifier.safetensors")
+        available_models["modern_bert"] = True
+    except Exception as e:
+        print(f"BERT model not available: {e}")
+        available_models["modern_bert"] = False
+
+    return available_models
+
+
+# Check model availability at startup
+AVAILABLE_MODELS = check_model_availability()
+
+
+def _map_classification_model(frontend_model: str) -> str:
+    """Map frontend model names to backend model names."""
+    # Check if the requested model is available
+    if not AVAILABLE_MODELS.get(frontend_model, False):
+        raise ValueError(f"Model '{frontend_model}' is not available. Available models: {[k for k, v in AVAILABLE_MODELS.items() if v]}")
+
+    mapping = {
+        "traditional_svm": "traditional_svm",
+        "modern_lstm": "modern_lstm",
+        "modern_bert": "modern_bert"
+    }
+    return mapping.get(frontend_model, frontend_model)
+
+
+def _create_preprocessing_steps(steps: Dict[str, Any]) -> PreprocessingSteps:
+    """Create preprocessing steps response with only the fields that exist."""
+    return PreprocessingSteps(
+        original=steps.get("original", ""),
+        stripped_lowered=steps.get("stripped_lowered"),
+        normalized=steps.get("normalized"),
+        diacritics_removed=steps.get("diacritics_removed"),
+        punctuation_removed=steps.get("punctuation_removed"),
+        repeated_chars_reduced=steps.get("repeated_chars_reduced"),
+        whitespace_normalized=steps.get("whitespace_normalized"),
+        numbers_removed=steps.get("numbers_removed"),
+        tokenized=steps.get("tokenized"),
+        stopwords_removed=steps.get("stopwords_removed"),
+        stemmed=steps.get("stemmed"),
+        final=steps.get("final", "")
+    )
+
+
+# Main endpoints
 @app.get("/")
 def read_root() -> Dict[str, Any]:
     """API welcome message and endpoint documentation."""
@@ -190,162 +238,128 @@ def read_root() -> Dict[str, Any]:
             "openapi_schema": "/openapi.json",
         },
         "endpoints": {
+            "preprocess": "POST /preprocess - Preprocess text with detailed steps",
             "classify": "POST /classify - Classify Arabic text",
-            "classify_batch": "POST /classify/batch - Classify multiple texts",
             "summarize": "POST /summarize - Summarize Arabic text",
-            "analyze": "POST /analyze - Both classify and summarize",
-            "preprocess": "POST /preprocess - Preprocess text with detailed steps",
-            "text_analysis": "POST /text-analysis - Analyze text characteristics",
-            "sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
-            "model_info": "GET /model-info - Get model information",
-            "available_models": "GET /models - Get all available models",
        },
    }


-@app.post("/[…]
-def […]
-    """[…]
-    try:
-        […]
-        return […]
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"[…]
-
-
-@app.post("/classify[…]
-def […]
-    """Classify […]
-    try:
-        […]
-        used_model = model_name or model_manager.default_model
-        return […]
-        […]
-
-[…]
-        raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
-
-
-@app.post("/sentence-analysis", response_model=SentenceAnalysisResponse)
-def analyze_sentences(data: TextInput) -> SentenceAnalysisResponse:
-    """Analyze all sentences with scores and rankings."""
-    try:
-        result = summarizer.get_sentence_analysis(data.text)
-        return result
-    except Exception as e:
-        […]
-
-
-@app.post("/[…]
-def […]
-    """[…]
-    try:
-        […]
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Complete analysis failed: {str(e)}"
-        )
-
-
-@app.post("/preprocess", response_model=PreprocessingResponse)
-def preprocess_text(data: PreprocessingInput) -> PreprocessingResponse:
-    """Preprocess text with step-by-step breakdown."""
-    try:
-        steps = preprocessor.get_preprocessing_steps(data.text, data.task_type.value)
-        return {"task_type": data.task_type.value, "preprocessing_steps": steps}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"[…]
-
-
-@app.post("/text-analysis", response_model=TextAnalysisResponse)
-def analyze_text_characteristics(data: TextInput) -> TextAnalysisResponse:
-    """Analyze text characteristics and statistics."""
-    try:
-        analysis = preprocessor.analyze_text(data.text)
-        return {"text": data.text, "analysis": analysis}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
-
-
-@app.get("/[…]
-def […]
-    """Get information about […]
-        […]
-    raise HTTPException(
-        status_code=500, detail=f"Failed to get model info: {str(e)}"
-    )
-
-
-@app.get("/models", response_model=AvailableModelsResponse)
-def get_available_models() -> AvailableModelsResponse:
-    """Get all available classification models."""
-    try:
-        models = model_manager.get_available_models()
-        return {"models": models, "current_model": model_manager.default_model}
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Failed to get available models: {str(e)}"
-        )
-
-
-@app.get("/models/cache")
-def get_cache_status() -> Dict[str, Any]:
-    """Get information about cached models."""
-    try:
-        return model_manager.get_cache_status()
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Failed to get cache status: {str(e)}"
-        )
-
-
-@app.post("/models/cache/clear")
-def clear_model_cache(model: Optional[ModelType] = None) -> Dict[str, Any]:
-    """Clear model cache for a specific model or all models."""
-    try:
-        model_name = model.value if model else None
-        return model_manager.clear_cache(model_name)
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Failed to clear cache: {str(e)}"
-        )
+@app.post("/preprocess", response_model=PreprocessingResponse)
+def preprocess_text(req: PreprocessRequest) -> PreprocessingResponse:
+    """Preprocess text with step-by-step breakdown."""
+    try:
+        steps = preprocessor.get_preprocessing_steps(req.text, req.task_type.value)
+        preprocessing_steps = _create_preprocessing_steps(steps)
+        return PreprocessingResponse(
+            task_type=req.task_type.value,
+            preprocessing_steps=preprocessing_steps
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")


+@app.post("/classify", response_model=ClassificationResponse)
+def classify_text(req: ClassificationRequest) -> ClassificationResponse:
+    """Classify Arabic text."""
+    try:
+        backend_model = _map_classification_model(req.model.value)
+        result = model_manager.predict(req.text, backend_model)
+
+        return ClassificationResponse(
+            prediction=result["prediction"],
+            confidence=result["confidence"],
+            probability_distribution=result["probability_distribution"],
+            cleaned_text=result["cleaned_text"],
+            model_used=req.model.value,  # Echo back the frontend model name
+            prediction_index=result.get("prediction_index"),
+            prediction_metadata=result.get("prediction_metadata")
+        )
+    except ValueError as e:
+        # Handle model availability errors
+        if "not available" in str(e):
+            raise HTTPException(
+                status_code=503,
+                detail=f"Model unavailable: {str(e)}. Check /models/available for current model status."
+            )
+        else:
+            raise HTTPException(status_code=400, detail=str(e))
+    except Exception as e:
+        error_msg = str(e)
+
+        # Provide more helpful error messages for common issues
+        if "BERT" in error_msg and ("connect" in error_msg.lower() or "internet" in error_msg.lower() or "huggingface" in error_msg.lower()):
+            raise HTTPException(
+                status_code=503,
+                detail=f"BERT model unavailable: The model requires internet connection to download tokenizer/config from Hugging Face, or the files need to be cached locally. Error: {error_msg}"
+            )
+        elif "modern_bert" in req.model.value and "Error loading" in error_msg:
+            raise HTTPException(
+                status_code=503,
+                detail=f"BERT model loading failed: {error_msg}. Please ensure the model files are properly configured and Hugging Face dependencies are available."
+            )
+        else:
+            raise HTTPException(status_code=500, detail=f"Classification failed: {error_msg}")


+@app.post("/summarize", response_model=SummarizationResponse)
+def summarize_text(req: SummarizationRequest) -> SummarizationResponse:
+    """Summarize Arabic text."""
+    try:
+        result = summarizer_manager.summarize(req.text, req.num_sentences, req.model.value)
+
+        return SummarizationResponse(
+            summary=result["summary"],
+            original_sentence_count=result["original_sentence_count"],
+            summary_sentence_count=result["summary_sentence_count"],
+            sentences=result["sentences"],
+            selected_indices=result["selected_indices"],
+            sentence_scores=result["sentence_scores"],
+            model_used=req.model.value,  # Echo back the frontend model name
+            top_sentence_scores=result.get("top_sentence_scores")
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")


+@app.get("/models/available")
+def get_available_models() -> Dict[str, Any]:
+    """Get information about which models are currently available."""
+    return {
+        "classification_models": {
+            "traditional_svm": {
+                "available": AVAILABLE_MODELS.get("traditional_svm", False),
+                "description": "Traditional SVM classifier with TF-IDF vectorization"
+            },
+            "modern_lstm": {
+                "available": AVAILABLE_MODELS.get("modern_lstm", False),
+                "description": "Modern LSTM-based neural network classifier"
+            },
+            "modern_bert": {
+                "available": AVAILABLE_MODELS.get("modern_bert", False),
+                "description": "Modern BERT-based transformer classifier",
+                "note": "Requires internet connection or cached Hugging Face models" if not AVAILABLE_MODELS.get("modern_bert", False) else None
+            }
+        },
+        "summarization_models": {
+            "traditional_tfidf": {
+                "available": True,
+                "description": "Traditional TF-IDF based extractive summarization"
+            },
+            "modern_seq2seq": {
+                "available": True,
+                "description": "Modern sequence-to-sequence summarization (currently uses TF-IDF fallback)",
+                "note": "Implementation in progress - currently falls back to TF-IDF"
+            },
+            "modern_bert": {
+                "available": True,
+                "description": "Modern BERT-based extractive summarization using asafaya/bert-base-arabic",
+                "note": "Requires torch and transformers dependencies. Model will be downloaded on first use."
+            }
+        },
+        "status": {
+            "total_classification_models": len([k for k, v in AVAILABLE_MODELS.items() if v]),
+            "total_available": len([k for k, v in AVAILABLE_MODELS.items() if v]),
+            "unavailable_models": [k for k, v in AVAILABLE_MODELS.items() if not v]
+        }
+    }
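With this change, clients pick the model per request instead of per deployment. A minimal sketch of how a frontend might call the reworked /summarize route (the host/port and sample text are assumptions, not part of this PR):

# Hypothetical client for the reworked API; assumes the app runs on localhost:8000.
import requests

payload = {
    "text": "هذا نص عربي طويل للتلخيص ...",
    "num_sentences": 3,
    "model": "traditional_tfidf",  # or "modern_seq2seq" / "modern_bert"
}
resp = requests.post("http://localhost:8000/summarize", json=payload)
resp.raise_for_status()
data = resp.json()
print(data["summary"])
print(data["selected_indices"], data["sentence_scores"])

GET /models/available reports which models the server could actually load at startup.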
bert_summarizer.py
ADDED
@@ -0,0 +1,114 @@
+import torch
+import numpy as np
+import re
+from typing import Dict, List, Any
+from transformers import BertTokenizer, BertModel
+from sklearn.metrics.pairwise import cosine_similarity
+from preprocessor import preprocess_for_summarization
+
+
+class BERTExtractiveSummarizer:
+    def __init__(self, model_name='aubmindlab/bert-base-arabertv02'):
+        """Initialize BERT-based Arabic summarizer."""
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        print(f"Using device: {self.device}")
+
+        # Load tokenizer and model
+        self.tokenizer = BertTokenizer.from_pretrained(model_name)
+        self.model = BertModel.from_pretrained(model_name)
+        self.model.to(self.device)
+        self.model.eval()
+
+    def get_sentence_embeddings(self, sentences: List[str]) -> np.ndarray:
+        """Get BERT embeddings for sentences."""
+        embeddings = []
+
+        with torch.no_grad():
+            for sentence in sentences:
+                # Tokenize
+                inputs = self.tokenizer(
+                    sentence,
+                    return_tensors='pt',
+                    max_length=512,
+                    truncation=True,
+                    padding=True
+                ).to(self.device)
+
+                # Get embeddings
+                outputs = self.model(**inputs)
+                # Use CLS token embedding
+                embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+                embeddings.append(embedding.squeeze())
+
+        return np.array(embeddings)
+
+    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
+        """
+        Summarize Arabic text using BERT extractive summarization.
+        Returns the same structure as other summarizers for consistency.
+        """
+        print(f"BERT Summarizer: Processing text with {len(text)} characters")
+
+        # Use the same preprocessing as TF-IDF for fair comparison
+        cleaned_text = preprocess_for_summarization(text)
+        print(f"BERT Summarizer: After preprocessing: '{cleaned_text[:100]}...'")
+
+        # Split into sentences - same approach as TF-IDF
+        sentences = re.split(r'[.!؟\n]+', cleaned_text)
+        sentences = [s.strip() for s in sentences if s.strip()]  # Same as TF-IDF
+
+        print(f"BERT Summarizer: Found {len(sentences)} sentences")
+        original_sentence_count = len(sentences)
+
+        # If we have fewer sentences than requested, return all
+        if len(sentences) <= num_sentences:
+            print(f"BERT Summarizer: Returning all {len(sentences)} sentences (fewer than requested)")
+            return {
+                "summary": cleaned_text.strip(),  # Use cleaned text like TF-IDF
+                "original_sentence_count": original_sentence_count,
+                "summary_sentence_count": len(sentences),
+                "sentences": sentences,
+                "selected_indices": list(range(len(sentences))),
+                "sentence_scores": [1.0] * len(sentences)  # All sentences selected
+            }
+
+        print("BERT Summarizer: Getting sentence embeddings...")
+        # Get sentence embeddings
+        sentence_embeddings = self.get_sentence_embeddings(sentences)
+        print(f"BERT Summarizer: Got embeddings shape: {sentence_embeddings.shape}")
+
+        # Calculate document embedding (mean of all sentences)
+        doc_embedding = np.mean(sentence_embeddings, axis=0)
+
+        # Calculate similarity scores
+        similarities = cosine_similarity([doc_embedding], sentence_embeddings)[0]
+        print(f"BERT Summarizer: Similarity scores: {similarities}")
+
+        # Get top sentences (indices with highest scores)
+        top_indices = np.argsort(similarities)[-num_sentences:]
+        print(f"BERT Summarizer: Top indices: {top_indices}")
+
+        # Sort indices to maintain original order in summary
+        top_indices_sorted = sorted(top_indices)
+        # Convert numpy indices to regular ints for JSON serialization
+        top_indices_sorted = [int(i) for i in top_indices_sorted]
+        print(f"BERT Summarizer: Selected indices (in order): {top_indices_sorted}")
+
+        # Get selected sentences and their scores
+        selected_sentences = [sentences[i] for i in top_indices_sorted]
+        selected_scores = [float(similarities[i]) for i in top_indices_sorted]
+
+        print(f"BERT Summarizer: Selected sentences: {[s[:50] + '...' for s in selected_sentences]}")
+
+        # Create summary by joining selected sentences
+        summary = ' '.join(selected_sentences)
+
+        return {
+            "summary": summary,
+            "original_sentence_count": original_sentence_count,
+            "summary_sentence_count": len(selected_sentences),
+            "sentences": sentences,  # All original sentences
+            "selected_indices": top_indices_sorted,
+            "sentence_scores": selected_scores,
+            "top_sentence_scores": selected_scores  # Additional info
+        }
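For reviewers who want to exercise the class outside the API, a minimal usage sketch (the sample text is a placeholder; the first run downloads the aubmindlab/bert-base-arabertv02 weights from Hugging Face):

from bert_summarizer import BERTExtractiveSummarizer

summarizer = BERTExtractiveSummarizer()  # default model_name shown above
result = summarizer.summarize("... نص عربي من عدة جمل ...", num_sentences=2)
print(result["summary"])
print(result["selected_indices"], result["sentence_scores"])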
examples.py
CHANGED
@@ -269,8 +269,8 @@ RESPONSE_EXAMPLES = {
         "model_description": "Traditional SVM classifier with TF-IDF vectorization",
         "model_config": {
             "type": "traditional",
-            "classifier_path": "traditional_svm_classifier.joblib",
-            "vectorizer_path": "traditional_tfidf_vectorizer_classifier.joblib",
+            "classifier_path": "models/traditional_svm_classifier.joblib",
+            "vectorizer_path": "models/traditional_tfidf_vectorizer_classifier.joblib",
             "description": "Traditional SVM classifier with TF-IDF vectorization"
         },
         "is_cached": True
model_manager.py
CHANGED
@@ -15,15 +15,15 @@ class ModelManager:
     AVAILABLE_MODELS = {
         "traditional_svm": {
             "type": "traditional",
-            "classifier_path": "traditional_svm_classifier.joblib",
-            "vectorizer_path": "traditional_tfidf_vectorizer_classifier.joblib",
+            "classifier_path": "models/traditional_svm_classifier.joblib",
+            "vectorizer_path": "models/traditional_tfidf_vectorizer_classifier.joblib",
             "description": "Traditional SVM classifier with TF-IDF vectorization"
         },

         "modern_bert": {
             "type": "modern",
             "model_type": "bert",
-            "model_path": "modern_bert_classifier.safetensors",
+            "model_path": "models/modern_bert_classifier.safetensors",
             "config_path": "config.json",
             "description": "Modern BERT-based transformer classifier"
         },
@@ -31,7 +31,7 @@ class ModelManager:
         "modern_lstm": {
             "type": "modern",
             "model_type": "lstm",
-            "model_path": "modern_lstm_classifier.pth",
+            "model_path": "models/modern_lstm_classifier.pth",
             "description": "Modern LSTM-based neural network classifier"
         }
     }
models/Seq2seq/seq2seq_config.json
ADDED
@@ -0,0 +1 @@
+{"ENC_MAXLEN": 1900, "DEC_MAXLEN": 178, "SRC_VOCAB_SIZE": 20000, "TGT_VOCAB_SIZE": 10000, "EMB_DIM": 128, "HID_DIM": 256}
models/Seq2seq/seq2seq_model.h5
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a35f8f2f2dc4f77570cc86c77a9fb90a1649d79d3e5e632be92499e889958a27
+size 117152336
models/Seq2seq/src_tokenizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff87d78b4f45fa3aaa9b9a43c0d94e7aecc1f7f18e0ab5c4caed15a0f1ca61ee
+size 12722191
models/Seq2seq/tgt_tokenizer.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca4e33cc944afd29a11b4fed11da27787ef604e7403b765ab589a7b304059e95
+size 2577556
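SummarizerManager still routes modern_seq2seq to the TF-IDF fallback, so these artifacts are committed but not yet wired up. Assuming they were produced with standard Keras and pickle serialization (the PR does not include the training or inference code), loading them would presumably look like this sketch:

# Sketch only: the seq2seq summarizer itself is still a TODO in app.py.
import json
import pickle

from tensorflow import keras  # assumed, matching the .h5 format

with open("models/Seq2seq/seq2seq_config.json") as f:
    cfg = json.load(f)  # ENC_MAXLEN, DEC_MAXLEN, vocab sizes, EMB_DIM, HID_DIM

model = keras.models.load_model("models/Seq2seq/seq2seq_model.h5")

with open("models/Seq2seq/src_tokenizer.pkl", "rb") as f:
    src_tokenizer = pickle.load(f)
with open("models/Seq2seq/tgt_tokenizer.pkl", "rb") as f:
    tgt_tokenizer = pickle.load(f)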
modern_bert_classifier.safetensors → models/modern_bert_classifier.safetensors
RENAMED
File without changes

modern_lstm_classifier.pth → models/modern_lstm_classifier.pth
RENAMED
File without changes

traditional_svm_classifier.joblib → models/traditional_svm_classifier.joblib
RENAMED
File without changes

traditional_tfidf_vectorizer_classifier.joblib → models/traditional_tfidf_vectorizer_classifier.joblib
RENAMED
File without changes

traditional_tfidf_vectorizer_summarization.joblib → models/traditional_tfidf_vectorizer_summarization.joblib
RENAMED
File without changes
modern_classifier.py
CHANGED
@@ -65,15 +65,71 @@ class ModernClassifier:
     def _load_bert_model(self):
         """Load BERT model from safetensors."""
         try:
-            […]
+            # Try different Arabic BERT tokenizers that match 32K vocabulary
+            tokenizer_options = [
+                'asafaya/bert-base-arabic',  # This one has 32K vocab
+                'aubmindlab/bert-base-arabertv02',  # Alternative
+                'aubmindlab/bert-base-arabertv2'  # Fallback (64K vocab)
+            ]
+
+            self.tokenizer = None
+            for tokenizer_name in tokenizer_options:
+                try:
+                    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, local_files_only=True)
+                    # Test if vocabulary size matches
+                    if len(tokenizer.vocab) <= 32000:
+                        self.tokenizer = tokenizer
+                        print(f"Using tokenizer: {tokenizer_name} (vocab size: {len(tokenizer.vocab)})")
+                        break
+                except:
+                    continue
+
+            if self.tokenizer is None:
+                # Try downloading if local files don't work
+                for tokenizer_name in tokenizer_options:
+                    try:
+                        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+                        if len(tokenizer.vocab) <= 32000:
+                            self.tokenizer = tokenizer
+                            print(f"Downloaded tokenizer: {tokenizer_name} (vocab size: {len(tokenizer.vocab)})")
+                            break
+                    except:
+                        continue
+
+            if self.tokenizer is None:
+                raise RuntimeError("No compatible Arabic BERT tokenizer found with 32K vocabulary")
+
             state_dict = load_file(self.model_path)
             embed_key = next(k for k in state_dict if 'embeddings.word_embeddings.weight' in k)
             checkpoint_vocab_size = state_dict[embed_key].shape[0]
-            […]
+
+            # Try to load config locally first
+            try:
+                config = AutoConfig.from_pretrained(
+                    'aubmindlab/bert-base-arabertv2',
+                    num_labels=len(self.classes),
+                    vocab_size=checkpoint_vocab_size,
+                    local_files_only=True
+                )
+            except:
+                try:
+                    config = AutoConfig.from_pretrained(
+                        'aubmindlab/bert-base-arabertv2',
+                        num_labels=len(self.classes),
+                        vocab_size=checkpoint_vocab_size
+                    )
+                except:
+                    # Fallback: create a basic BERT config
+                    from transformers import BertConfig
+                    config = BertConfig(
+                        vocab_size=checkpoint_vocab_size,
+                        hidden_size=768,
+                        num_hidden_layers=12,
+                        num_attention_heads=12,
+                        intermediate_size=3072,
+                        num_labels=len(self.classes)
+                    )
+
             self.model = AutoModelForSequenceClassification.from_config(config)
             self.model.resize_token_embeddings(checkpoint_vocab_size)
             self.model.load_state_dict(state_dict, strict=False)
@@ -116,6 +172,15 @@ class ModernClassifier:
             max_length=512
         )

+        # CRITICAL FIX: Check for vocabulary mismatch and clamp token IDs
+        input_ids = inputs['input_ids']
+        max_token_id = input_ids.max().item()
+        model_vocab_size = self.model.config.vocab_size
+
+        if max_token_id >= model_vocab_size:
+            # Fix: Clamp token IDs to valid range to prevent "index out of range" error
+            inputs['input_ids'] = torch.clamp(input_ids, 0, model_vocab_size - 1)
+
         return {key: value.to(self.device) for key, value in inputs.items()}

     def _preprocess_text_for_lstm(self, text: str) -> torch.Tensor:
@@ -150,7 +215,11 @@ class ModernClassifier:
         inputs = self._preprocess_text_for_lstm(text)
         logits = self.model(inputs)

-        probabilities = torch.softmax(logits, dim=-1).cpu().numpy()
+        probabilities = torch.softmax(logits, dim=-1).cpu().numpy()
+
+        # Handle batch dimension
+        if len(probabilities.shape) > 1:
+            probabilities = probabilities[0]

         prediction_index = int(np.argmax(probabilities))
         prediction = self.classes[prediction_index]
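The token-ID clamp added to the BERT preprocessing path guards against a tokenizer whose vocabulary is larger than the checkpoint's embedding table. A standalone illustration of the failure mode and the fix (tensor values are made up for the example):

import torch

model_vocab_size = 32000  # embedding rows in the checkpoint
# The last ID could only come from a larger (e.g. 64K) tokenizer vocabulary.
input_ids = torch.tensor([[2, 15, 31999, 63999]])

# An embedding lookup with ID 63999 would raise "index out of range in self";
# clamping keeps the forward pass alive at the cost of mapping out-of-range
# IDs onto an arbitrary valid row.
safe_ids = torch.clamp(input_ids, 0, model_vocab_size - 1)
print(safe_ids)  # tensor([[    2,    15, 31999, 31999]])

The clamp avoids the crash but silently distorts inputs when vocabularies disagree, which is why the loader above searches for a 32K-vocab tokenizer first.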
summarizer.py
CHANGED
@@ -8,7 +8,7 @@ from preprocessor import preprocess_for_summarization
 class ArabicSummarizer:
     """Arabic text summarizer using TF-IDF scoring."""

-    def __init__(self, vectorizer_path: str = "traditional_tfidf_vectorizer_summarization.joblib"):
+    def __init__(self, vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib"):
         self.vectorizer = joblib.load(vectorizer_path)

     def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
traditional_classifier.py
CHANGED
@@ -9,8 +9,8 @@ class TraditionalClassifier:

     def __init__(
         self,
-        classifier_path: str = "traditional_svm_classifier.joblib",
-        vectorizer_path: str = "traditional_tfidf_vectorizer_classifier.joblib",
+        classifier_path: str = "models/traditional_svm_classifier.joblib",
+        vectorizer_path: str = "models/traditional_tfidf_vectorizer_classifier.joblib",
     ):
         self.model = joblib.load(classifier_path)
         self.vectorizer = joblib.load(vectorizer_path)