from typing import Optional, List, Dict, Any
import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from enum import Enum
from summarizer import ArabicSummarizer
from preprocessor import ArabicPreprocessor
from model_manager import ModelManager
from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
from bert_summarizer import BERTExtractiveSummarizer
from seq2seq_summarizer import Seq2SeqSummarizer
class TaskType(str, Enum):
    """Task selector for /preprocess; the preprocessor varies its steps per task."""
    CLASSIFICATION = "classification"
    SUMMARIZATION = "summarization"
# New enums for frontend compatibility
class ClassificationModelType(str, Enum):
    """Classification backends exposed to the frontend (see /models/available)."""
    TRADITIONAL_SVM = "traditional_svm"
    MODERN_LSTM = "modern_lstm"
    MODERN_BERT = "modern_bert"
class SummarizationModelType(str, Enum):
    """Summarization backends dispatched by SummarizerManager."""
    TRADITIONAL_TFIDF = "traditional_tfidf"
    MODERN_SEQ2SEQ = "modern_seq2seq"
    MODERN_BERT = "modern_bert"
# Request models
class PreprocessRequest(BaseModel):
    """Request body for POST /preprocess."""
    text: str  # Arabic text to preprocess
    task_type: TaskType  # which pipeline variant to trace
    # Pydantic v2 config: example payload surfaced in the OpenAPI docs.
    model_config = {
        "json_schema_extra": {"example": {"text": "هذا نص عربي للمعالجة", "task_type": "classification"}}
    }
class ClassificationRequest(BaseModel):
    """Request body for POST /classify."""
    text: str  # Arabic text to classify
    model: ClassificationModelType  # which classification backend to use
    # Pydantic v2 config: example payload surfaced in the OpenAPI docs.
    model_config = {"json_schema_extra": {"example": {"text": "هذا نص عربي للتصنيف", "model": "traditional_svm"}}}
class SummarizationRequest(BaseModel):
    """Request body for POST /summarize."""
    text: str  # Arabic text to summarize
    num_sentences: int = 3  # desired number of sentences in the extractive summary
    model: SummarizationModelType  # which summarization backend to use
    # Pydantic v2 config: example payload surfaced in the OpenAPI docs.
    model_config = {"json_schema_extra": {"example": {"text": "هذا نص عربي طويل للتلخيص", "num_sentences": 3, "model": "traditional_tfidf"}}}
# Response models
class PreprocessingSteps(BaseModel):
    """Step-by-step trace of the preprocessing pipeline.

    Only `original` and `final` are guaranteed; steps the selected task type
    skips are left as None (see `_create_preprocessing_steps`).
    """
    original: str  # raw input text, untouched
    stripped_lowered: Optional[str] = None  # text after the strip/lowercase step
    normalized: Optional[str] = None  # text after the normalization step
    diacritics_removed: Optional[str] = None  # text after diacritic removal
    punctuation_removed: Optional[str] = None  # text after punctuation removal
    repeated_chars_reduced: Optional[str] = None  # text after repeated-char reduction
    whitespace_normalized: Optional[str] = None  # text after whitespace normalization
    numbers_removed: Optional[str] = None  # text after digit removal
    tokenized: Optional[List[str]] = None  # token list after tokenization
    stopwords_removed: Optional[List[str]] = None  # tokens after stopword filtering
    stemmed: Optional[List[str]] = None  # tokens after stemming
    final: str  # final text handed to the model
class PreprocessingResponse(BaseModel):
    """Response body for POST /preprocess."""
    task_type: str  # echoes the requested TaskType value
    preprocessing_steps: PreprocessingSteps  # full step trace
class ClassificationResponse(BaseModel):
    """Response body for POST /classify."""
    prediction: str  # predicted class label
    confidence: float  # confidence of the predicted label
    probability_distribution: Dict[str, float]  # label -> probability
    cleaned_text: str  # preprocessed text the model actually saw
    model_used: str  # frontend model name echoed back
    # Optional fields for extra info
    prediction_index: Optional[int] = None  # numeric index of the predicted label, if provided
    prediction_metadata: Optional[Dict[str, Any]] = None  # backend-specific extras, if provided
class SummarizationResponse(BaseModel):
    """Response body for POST /summarize."""
    summary: str  # resulting summary text
    original_sentence_count: int  # sentences detected in the input
    summary_sentence_count: int  # sentences kept in the summary
    sentences: List[str]  # all detected sentences
    selected_indices: List[int]  # indices of sentences chosen for the summary
    sentence_scores: List[float]  # per-sentence scores (empty list if backend gave none)
    model_used: str  # frontend model name echoed back
    # Optional fields for extra info
    top_sentence_scores: Optional[List[float]] = None  # scores of selected sentences, if provided
# FastAPI application instance plus module-level shared singletons.
app = FastAPI(
    title="Arabic Text Analysis API",
    description="API for Arabic text classification, summarization, and preprocessing with multiple model support",
    version="1.0.0",
)
# Dispatches /classify requests to the configured classifier backends.
model_manager = ModelManager(default_model="traditional_svm")
# NOTE(review): this module-level summarizer appears unused by the endpoints
# below (they go through SummarizerManager, which builds its own identical
# instance) — confirm before removing.
summarizer = ArabicSummarizer("models/traditional_tfidf_vectorizer_summarization.joblib")
# Shared preprocessor used by the /preprocess endpoint.
preprocessor = ArabicPreprocessor()
# Summarizer manager for model dispatch
class SummarizerManager:
    """Manages different types of Arabic text summarizers."""

    def __init__(self):
        # The TF-IDF summarizer is cheap, so it is built eagerly; the neural
        # summarizers are deferred to first use to keep startup fast.
        self.traditional_tfidf = ArabicSummarizer("models/traditional_tfidf_vectorizer_summarization.joblib")
        self.bert_summarizer = None
        self.seq2seq_summarizer = None

    def _load_seq2seq(self):
        # One-time construction of the seq2seq summarizer from its local checkpoint.
        try:
            print("Loading Seq2Seq summarizer...")
            model_path = os.path.join(os.path.dirname(__file__), "models", "modern_seq2seq_summarizer.safetensors")
            loaded = Seq2SeqSummarizer(model_path)
            print("Seq2Seq summarizer loaded successfully!")
            return loaded
        except Exception as e:
            print(f"Failed to load Seq2Seq summarizer: {e}")
            raise ValueError(f"Seq2Seq summarizer initialization failed: {e}")

    def _load_bert(self):
        # One-time construction of the BERT extractive summarizer.
        try:
            print("Loading BERT summarizer...")
            loaded = BERTExtractiveSummarizer()
            print("BERT summarizer loaded successfully!")
            return loaded
        except Exception as e:
            print(f"Failed to load BERT summarizer: {e}")
            raise ValueError(f"BERT summarizer initialization failed: {e}")

    def get_summarizer(self, model_type: str):
        """Get summarizer based on model type."""
        if model_type == "traditional_tfidf":
            return self.traditional_tfidf
        if model_type == "modern_seq2seq":
            if self.seq2seq_summarizer is None:
                self.seq2seq_summarizer = self._load_seq2seq()
            return self.seq2seq_summarizer
        if model_type == "modern_bert":
            if self.bert_summarizer is None:
                self.bert_summarizer = self._load_bert()
            return self.bert_summarizer
        raise ValueError(f"Unknown summarizer model: {model_type}")

    def summarize(self, text: str, num_sentences: int, model_type: str) -> Dict[str, Any]:
        """Summarize text using the specified model."""
        try:
            print(f"SummarizerManager: Using model '{model_type}' for text with {len(text)} characters")
            engine = self.get_summarizer(model_type)
            result = engine.summarize(text, num_sentences)
            # Debug trace of what the backend selected.
            print(f"SummarizerManager: {model_type} selected indices: {result.get('selected_indices', [])}")
            print(f"SummarizerManager: {model_type} summary preview: '{result.get('summary', '')[:100]}...'")
            # Normalize: downstream response model requires a list, never None.
            if result.get("sentence_scores") is None:
                result["sentence_scores"] = []
            return result
        except Exception as e:
            # BERT failures get a more actionable message; everything else propagates.
            if model_type == "modern_bert":
                raise ValueError(f"BERT summarization failed: {str(e)}. This might be due to missing dependencies (torch, transformers) or network issues downloading the model.")
            raise
summarizer_manager = SummarizerManager()
# Check which models are actually available
def check_model_availability() -> Dict[str, bool]:
    """Check which classification models are actually available and working.

    Returns:
        Mapping of classification model name -> availability flag. The SVM and
        LSTM models are assumed bundled; BERT availability is probed by
        attempting to construct the classifier.
    """
    available_models = {
        "traditional_svm": True,  # bundled, always available
        "modern_lstm": True,      # bundled, always available
        "modern_bert": False,     # verified by the probe below
    }
    # Probe BERT by constructing it once; any failure (missing deps, missing
    # model files, no network for Hugging Face downloads) means unavailable.
    try:
        from modern_classifier import ModernClassifier
        ModernClassifier("bert", "models/modern_bert_classifier.safetensors")
        available_models["modern_bert"] = True
    except Exception as e:
        # Best-effort probe: report and mark unavailable rather than crash startup.
        print(f"BERT model not available: {e}")
        available_models["modern_bert"] = False
    return available_models
# Check model availability at startup
# Cached availability map consulted by _map_classification_model on every request.
AVAILABLE_MODELS = check_model_availability()
def _map_classification_model(frontend_model: str) -> str:
    """Map frontend model names to backend model names.

    Frontend and backend currently use identical identifiers, so this acts as
    an availability gate plus a pass-through. Keep it as the single place to
    translate names should they ever diverge.

    Raises:
        ValueError: if the requested model failed the startup availability probe.
    """
    if not AVAILABLE_MODELS.get(frontend_model, False):
        raise ValueError(f"Model '{frontend_model}' is not available. Available models: {[k for k, v in AVAILABLE_MODELS.items() if v]}")
    # The previous name-mapping dict was a no-op (every key mapped to itself,
    # with the input as the .get default); returning the input is equivalent.
    return frontend_model
def _create_preprocessing_steps(steps: Dict[str, Any]) -> PreprocessingSteps:
    """Create preprocessing steps response with only the fields that exist.

    Steps absent from `steps` come through as None; `original` and `final`
    always default to the empty string.
    """
    optional_fields = (
        "stripped_lowered",
        "normalized",
        "diacritics_removed",
        "punctuation_removed",
        "repeated_chars_reduced",
        "whitespace_normalized",
        "numbers_removed",
        "tokenized",
        "stopwords_removed",
        "stemmed",
    )
    # dict.get without a default yields None for missing keys, matching the
    # Optional fields' defaults on the response model.
    payload = {name: steps.get(name) for name in optional_fields}
    return PreprocessingSteps(
        original=steps.get("original", ""),
        final=steps.get("final", ""),
        **payload,
    )
# Main endpoints
@app.get("/")
def read_root() -> Dict[str, Any]:
    """API welcome message and endpoint documentation."""
    documentation = {
        "interactive_docs": "/docs",
        "redoc": "/redoc",
        "openapi_schema": "/openapi.json",
    }
    endpoints = {
        "preprocess": "POST /preprocess - Preprocess text with detailed steps",
        "classify": "POST /classify - Classify Arabic text",
        "summarize": "POST /summarize - Summarize Arabic text",
    }
    return {
        "message": "Welcome to the Arabic Text Analysis API!",
        "documentation": documentation,
        "endpoints": endpoints,
    }
@app.post("/preprocess", response_model=PreprocessingResponse)
def preprocess_text(req: PreprocessRequest) -> PreprocessingResponse:
    """Preprocess text with step-by-step breakdown.

    Raises HTTP 500 with the underlying error message on any failure.
    """
    try:
        task = req.task_type.value
        raw_steps = preprocessor.get_preprocessing_steps(req.text, task)
        return PreprocessingResponse(
            task_type=task,
            preprocessing_steps=_create_preprocessing_steps(raw_steps),
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
@app.post("/classify", response_model=ClassificationResponse)
def classify_text(req: ClassificationRequest) -> ClassificationResponse:
    """Classify Arabic text.

    Translates failures into HTTP errors:
      * 503 when the model is unavailable (startup probe failed, or BERT
        cannot load/download its files),
      * 400 for other ValueError inputs,
      * 500 for anything else.
    """
    try:
        backend_model = _map_classification_model(req.model.value)
        result = model_manager.predict(req.text, backend_model)
        return ClassificationResponse(
            prediction=result["prediction"],
            confidence=result["confidence"],
            probability_distribution=result["probability_distribution"],
            cleaned_text=result["cleaned_text"],
            model_used=req.model.value,  # Echo back the frontend model name
            prediction_index=result.get("prediction_index"),
            prediction_metadata=result.get("prediction_metadata")
        )
    except ValueError as e:
        # Handle model availability errors
        # "not available" is the marker in the message raised by _map_classification_model.
        if "not available" in str(e):
            raise HTTPException(
                status_code=503,
                detail=f"Model unavailable: {str(e)}. Check /models/available for current model status."
            )
        else:
            raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        error_msg = str(e)
        # Provide more helpful error messages for common issues
        # NOTE(review): these branches string-match on downstream error text;
        # brittle by design — keep in sync with the messages raised by the models.
        if "BERT" in error_msg and ("connect" in error_msg.lower() or "internet" in error_msg.lower() or "huggingface" in error_msg.lower()):
            raise HTTPException(
                status_code=503,
                detail=f"BERT model unavailable: The model requires internet connection to download tokenizer/config from Hugging Face, or the files need to be cached locally. Error: {error_msg}"
            )
        elif "modern_bert" in req.model.value and "Error loading" in error_msg:
            raise HTTPException(
                status_code=503,
                detail=f"BERT model loading failed: {error_msg}. Please ensure the model files are properly configured and Hugging Face dependencies are available."
            )
        else:
            raise HTTPException(status_code=500, detail=f"Classification failed: {error_msg}")
@app.post("/summarize", response_model=SummarizationResponse)
def summarize_text(req: SummarizationRequest) -> SummarizationResponse:
    """Summarize Arabic text.

    Raises HTTP 500 with the underlying error message on any failure.
    """
    try:
        outcome = summarizer_manager.summarize(req.text, req.num_sentences, req.model.value)
        # Echo the frontend model name back so the client can correlate results.
        response = SummarizationResponse(
            summary=outcome["summary"],
            original_sentence_count=outcome["original_sentence_count"],
            summary_sentence_count=outcome["summary_sentence_count"],
            sentences=outcome["sentences"],
            selected_indices=outcome["selected_indices"],
            sentence_scores=outcome["sentence_scores"],
            model_used=req.model.value,
            top_sentence_scores=outcome.get("top_sentence_scores"),
        )
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
@app.get("/models/available")
def get_available_models() -> Dict[str, Any]:
    """Get information about which models are currently available."""
    bert_ok = AVAILABLE_MODELS.get("modern_bert", False)
    classification_models = {
        "traditional_svm": {
            "available": AVAILABLE_MODELS.get("traditional_svm", False),
            "description": "Traditional SVM classifier with TF-IDF vectorization"
        },
        "modern_lstm": {
            "available": AVAILABLE_MODELS.get("modern_lstm", False),
            "description": "Modern LSTM-based neural network classifier"
        },
        "modern_bert": {
            "available": bert_ok,
            "description": "Modern BERT-based transformer classifier",
            "note": None if bert_ok else "Requires internet connection or cached Hugging Face models"
        },
    }
    summarization_models = {
        "traditional_tfidf": {
            "available": True,
            "description": "Traditional TF-IDF based extractive summarization"
        },
        "modern_seq2seq": {
            "available": True,
            "description": "Modern sequence-to-sequence summarization (currently uses TF-IDF fallback)",
            "note": "Implementation in progress - currently falls back to TF-IDF"
        },
        "modern_bert": {
            "available": True,
            "description": "Modern BERT-based extractive summarization using asafaya/bert-base-arabic",
            "note": "Requires torch and transformers dependencies. Model will be downloaded on first use."
        },
    }
    working = [name for name, ok in AVAILABLE_MODELS.items() if ok]
    broken = [name for name, ok in AVAILABLE_MODELS.items() if not ok]
    # NOTE(review): both totals count only classification models, because
    # AVAILABLE_MODELS does not track summarizers — confirm that is intended.
    status = {
        "total_classification_models": len(working),
        "total_available": len(working),
        "unavailable_models": broken,
    }
    return {
        "classification_models": classification_models,
        "summarization_models": summarization_models,
        "status": status,
    }