# arabic-summarizer-classifier / traditional_classifier.py
# Source: Hugging Face repo snapshot, commit 5fc9256 (Fkhrayef, PR #1, verified).
import numpy as np
import joblib
from typing import List, Dict, Any
from preprocessor import preprocess_for_classification
class TraditionalClassifier:
    """Traditional text classifier with probability distributions and metadata.

    Loads a pre-trained scikit-learn-style classifier (and a TF-IDF
    vectorizer) from joblib files and exposes single/batch prediction that
    always returns a full per-class probability distribution plus
    confidence metadata, regardless of whether the underlying model
    natively supports ``predict_proba``.
    """

    def __init__(
        self,
        classifier_path: str = "models/traditional_svm_classifier.joblib",
        vectorizer_path: str = "models/traditional_tfidf_vectorizer_classifier.joblib",
    ):
        """Load the serialized classifier and vectorizer from disk.

        Args:
            classifier_path: Path to the joblib-serialized classifier.
            vectorizer_path: Path to the joblib-serialized vectorizer.
        """
        self.model = joblib.load(classifier_path)
        self.vectorizer = joblib.load(vectorizer_path)
        # Human-readable model name derived from the classifier file name.
        self.model_name = classifier_path.split("/")[-1].replace(".joblib", "")

    def predict(self, text: str) -> Dict[str, Any]:
        """Predict class with full probability distribution and metadata.

        Args:
            text: Raw input text; cleaned via ``preprocess_for_classification``
                before vectorization.

        Returns:
            One result dict (see ``_build_result`` for the schema).
        """
        # A single prediction is just a batch of one; delegating keeps the
        # probability-estimation logic in exactly one place.
        return self.predict_batch([text])[0]

    def _prediction_indices(self, predictions) -> List[int]:
        """Map each predicted label to its column index in the probability matrix."""
        classes = getattr(self.model, "classes_", None)
        indices = []
        for pred in predictions:
            if classes is not None:
                indices.append(int(np.where(classes == pred)[0][0]))
            else:
                # Without classes_, assume integer labels already are indices;
                # fall back to 0 for non-integer labels.
                indices.append(int(pred) if isinstance(pred, (int, np.integer)) else 0)
        return indices

    def _probability_matrix(self, text_vectors, predictions, prediction_indices):
        """Return an (n_samples, n_classes) probability matrix.

        Preference order: the model's own ``predict_proba``; otherwise a
        sigmoid (binary) or stable softmax (multiclass) over
        ``decision_function`` scores; otherwise one-hot vectors on the
        predicted class.
        """
        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(text_vectors)
        if hasattr(self.model, "decision_function"):
            decision_scores = self.model.decision_function(text_vectors)
            if decision_scores.ndim == 1:
                # Binary case: one score per sample. Sigmoid gives P(class 1);
                # its complement is P(class 0).
                return np.column_stack(
                    [
                        1 / (1 + np.exp(decision_scores)),
                        1 / (1 + np.exp(-decision_scores)),
                    ]
                )
            # Multiclass: numerically stable softmax (subtract the row max).
            exp_scores = np.exp(
                decision_scores - np.max(decision_scores, axis=1, keepdims=True)
            )
            return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        # No probability information at all: degenerate one-hot distribution.
        classes = getattr(self.model, "classes_", None)
        num_classes = len(classes) if classes is not None else 2
        probabilities = np.zeros((len(predictions), num_classes))
        for i, pred_idx in enumerate(prediction_indices):
            probabilities[i, pred_idx] = 1.0
        return probabilities

    def _build_result(self, pred, pred_idx, probs, cleaned_text) -> Dict[str, Any]:
        """Assemble the public result dict for a single sample.

        Args:
            pred: Predicted label as returned by the model.
            pred_idx: Column index of ``pred`` in ``probs``.
            probs: 1-D probability vector over classes for this sample.
            cleaned_text: Preprocessed text that was actually classified.
        """
        classes = getattr(self.model, "classes_", None)
        if classes is not None:
            prob_distribution = {
                str(label): float(probs[j]) for j, label in enumerate(classes)
            }
        else:
            prob_distribution = {f"class_{j}": float(p) for j, p in enumerate(probs)}
        return {
            "prediction": str(pred),
            "prediction_index": int(pred_idx),
            "confidence": float(probs[pred_idx]),
            "probability_distribution": prob_distribution,
            "cleaned_text": cleaned_text,
            "model_used": self.model_name,
            "prediction_metadata": {
                "max_probability": float(np.max(probs)),
                "min_probability": float(np.min(probs)),
                # Shannon entropy of the distribution; epsilon guards log(0).
                "entropy": float(-np.sum(probs * np.log(probs + 1e-10))),
                "num_classes": len(probs),
            },
        }

    def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
        """Predict classes for multiple texts.

        Args:
            texts: Raw input texts; each is cleaned before vectorization.

        Returns:
            One result dict per input text, in order. Empty input yields
            an empty list (avoids transforming/predicting on zero rows).
        """
        if not texts:
            return []
        cleaned_texts = [preprocess_for_classification(text) for text in texts]
        if self.vectorizer:
            text_vectors = self.vectorizer.transform(cleaned_texts)
        else:
            # No vectorizer loaded: pass the cleaned strings straight through
            # (assumes the model accepts raw text input).
            text_vectors = cleaned_texts
        predictions = self.model.predict(text_vectors)
        prediction_indices = self._prediction_indices(predictions)
        probabilities = self._probability_matrix(
            text_vectors, predictions, prediction_indices
        )
        return [
            self._build_result(pred, idx, probabilities[i], cleaned_texts[i])
            for i, (pred, idx) in enumerate(zip(predictions, prediction_indices))
        ]

    def get_model_info(self) -> Dict[str, Any]:
        """Get model information and capabilities."""
        classes = getattr(self.model, "classes_", None)
        return {
            "model_name": self.model_name,
            "model_type": type(self.model).__name__,
            "num_classes": len(classes) if classes is not None else "unknown",
            "classes": classes.tolist() if classes is not None else None,
            "has_predict_proba": hasattr(self.model, "predict_proba"),
            "has_vectorizer": self.vectorizer is not None,
            "vectorizer_type": type(self.vectorizer).__name__
            if self.vectorizer
            else None,
        }