__pycache__/app.cpython-312.pyc ADDED
Binary file (17 kB)

__pycache__/bert_summarizer.cpython-312.pyc ADDED
Binary file (5.94 kB)

__pycache__/debug_test.cpython-312.pyc ADDED
Binary file (3.86 kB)

__pycache__/examples.cpython-312.pyc ADDED
Binary file (7.97 kB)

__pycache__/model_manager.cpython-312.pyc ADDED
Binary file (7.56 kB)

__pycache__/modern_classifier.cpython-312.pyc ADDED
Binary file (17.3 kB)

__pycache__/preprocessor.cpython-312.pyc ADDED
Binary file (8.7 kB)

__pycache__/seq2seq_summarizer.cpython-312.pyc ADDED
Binary file (9.74 kB)

__pycache__/summarizer.cpython-312.pyc ADDED
Binary file (4.7 kB)

__pycache__/traditional_classifier.cpython-312.pyc ADDED
Binary file (8.89 kB)
app.py CHANGED
@@ -7,6 +7,7 @@ from summarizer import ArabicSummarizer
 from preprocessor import ArabicPreprocessor
 from model_manager import ModelManager
 from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
+from bert_summarizer import BERTExtractiveSummarizer
 
 
 class TaskType(str, Enum):
@@ -14,80 +15,74 @@ class TaskType(str, Enum):
     SUMMARIZATION = "summarization"
 
 
-class ModelType(str, Enum):
+# New enums for frontend compatibility
+class ClassificationModelType(str, Enum):
     TRADITIONAL_SVM = "traditional_svm"
-    MODERN_BERT = "modern_bert"
     MODERN_LSTM = "modern_lstm"
+    MODERN_BERT = "modern_bert"
 
 
-app = FastAPI(
-    title="Arabic Text Analysis API",
-    description="API for Arabic text classification, summarization, and preprocessing with multiple model support",
-    version="1.0.0",
-)
-
-model_manager = ModelManager(default_model="traditional_svm")
-summarizer = ArabicSummarizer("traditional_tfidf_vectorizer_summarization.joblib")
-preprocessor = ArabicPreprocessor()
+class SummarizationModelType(str, Enum):
+    TRADITIONAL_TFIDF = "traditional_tfidf"
+    MODERN_SEQ2SEQ = "modern_seq2seq"
+    MODERN_BERT = "modern_bert"
 
 
-class TextInput(BaseModel):
+# Request models
+class PreprocessRequest(BaseModel):
     text: str
-    model: Optional[ModelType] = None
+    task_type: TaskType
 
-    model_config = {"json_schema_extra": {"example": REQUEST_EXAMPLES["text_input"]}}
+    model_config = {
+        "json_schema_extra": {"example": {"text": "هذا نص عربي للمعالجة", "task_type": "classification"}}
+    }
 
 
-class TextInputWithSentences(BaseModel):
+class ClassificationRequest(BaseModel):
     text: str
-    num_sentences: Optional[int] = 3
-    model: Optional[ModelType] = None
+    model: ClassificationModelType
 
-    model_config = {
-        "json_schema_extra": {"example": REQUEST_EXAMPLES["text_input_with_sentences"]}
-    }
+    model_config = {"json_schema_extra": {"example": {"text": "هذا نص عربي للتصنيف", "model": "traditional_svm"}}}
 
 
-class BatchTextInput(BaseModel):
-    texts: List[str]
-    model: Optional[ModelType] = None
+class SummarizationRequest(BaseModel):
+    text: str
+    num_sentences: int = 3
+    model: SummarizationModelType
 
-    model_config = {
-        "json_schema_extra": {"example": REQUEST_EXAMPLES["batch_text_input"]}
-    }
+    model_config = {"json_schema_extra": {"example": {"text": "هذا نص عربي طويل للتلخيص", "num_sentences": 3, "model": "traditional_tfidf"}}}
 
 
-class PreprocessingInput(BaseModel):
-    text: str
-    task_type: TaskType = TaskType.CLASSIFICATION
-
-    model_config = {
-        "json_schema_extra": {"example": REQUEST_EXAMPLES["preprocessing_input"]}
-    }
+# Response models
+class PreprocessingSteps(BaseModel):
+    original: str
+    stripped_lowered: Optional[str] = None
+    normalized: Optional[str] = None
+    diacritics_removed: Optional[str] = None
+    punctuation_removed: Optional[str] = None
+    repeated_chars_reduced: Optional[str] = None
+    whitespace_normalized: Optional[str] = None
+    numbers_removed: Optional[str] = None
+    tokenized: Optional[List[str]] = None
+    stopwords_removed: Optional[List[str]] = None
+    stemmed: Optional[List[str]] = None
+    final: str
+
+
+class PreprocessingResponse(BaseModel):
+    task_type: str
+    preprocessing_steps: PreprocessingSteps
 
 
 class ClassificationResponse(BaseModel):
     prediction: str
-    prediction_index: int
     confidence: float
     probability_distribution: Dict[str, float]
     cleaned_text: str
     model_used: str
-    prediction_metadata: Dict[str, Any]
-
-    model_config = {
-        "protected_namespaces": (),
-        "json_schema_extra": {
-            "example": RESPONSE_EXAMPLES["classification"],
-            "schema_extra": {
-                "properties": {
-                    "prediction_index": {
-                        "description": "Numerical index of the predicted class (0=culture, 1=economy, 2=international, 3=local, 4=religion, 5=sports)"
-                    }
-                }
-            },
-        },
-    }
+    # Optional fields for extra info
+    prediction_index: Optional[int] = None
+    prediction_metadata: Optional[Dict[str, Any]] = None
 
 
 class SummarizationResponse(BaseModel):
@@ -96,89 +91,142 @@ class SummarizationResponse(BaseModel):
     summary_sentence_count: int
     sentences: List[str]
     selected_indices: List[int]
-    sentence_scores: Optional[List[float]]
-    top_sentence_scores: Optional[List[float]]
-
-    model_config = {
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["summarization"]}
-    }
-
-
-class TextAnalysisResponse(BaseModel):
-    text: str
-    analysis: Dict[str, Any]
-
-    model_config = {
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["text_analysis"]}
-    }
-
-
-class BatchClassificationResponse(BaseModel):
-    results: List[ClassificationResponse]
-    total_texts: int
+    sentence_scores: List[float]
     model_used: str
-
-    model_config = {
-        "protected_namespaces": (),
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["batch_classification"]},
-    }
-
-
-class SentenceAnalysisResponse(BaseModel):
-    sentences: List[Dict[str, Any]]
-    total_sentences: int
-    score_statistics: Dict[str, float]
-
-    model_config = {
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["sentence_analysis"]}
-    }
-
-
-class CompleteAnalysisResponse(BaseModel):
-    original_text: str
-    text_analysis: Dict[str, Any]
-    classification: ClassificationResponse
-    summarization: SummarizationResponse
-
-    model_config = {
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["complete_analysis"]}
-    }
-
-
-class PreprocessingResponse(BaseModel):
-    task_type: str
-    preprocessing_steps: Dict[str, Any]
-
-    model_config = {
-        "json_schema_extra": {"example": RESPONSE_EXAMPLES["preprocessing"]}
-    }
-
-
-class ModelInfoResponse(BaseModel):
-    classifier: Dict[str, Any]
-    summarizer: Dict[str, Any]
-
-    model_config = {"json_schema_extra": {"example": RESPONSE_EXAMPLES["model_info"]}}
-
-
-class ModelSwitchInput(BaseModel):
-    model: ModelType
-
-    model_config = {"json_schema_extra": {"example": {"model": "modern_bert"}}}
-
-
-class ModelSwitchResponse(BaseModel):
-    success: bool
-    message: str
-    previous_model: Optional[str] = None
-    current_model: str
-
-
-class AvailableModelsResponse(BaseModel):
-    models: Dict[str, Any]
-    current_model: str
+    # Optional fields for extra info
+    top_sentence_scores: Optional[List[float]] = None
+
+
+app = FastAPI(
+    title="Arabic Text Analysis API",
+    description="API for Arabic text classification, summarization, and preprocessing with multiple model support",
+    version="1.0.0",
+)
+
+model_manager = ModelManager(default_model="traditional_svm")
+summarizer = ArabicSummarizer("models/traditional_tfidf_vectorizer_summarization.joblib")
+preprocessor = ArabicPreprocessor()
+
+
+# Summarizer manager for model dispatch
+class SummarizerManager:
+    """Manages different types of Arabic text summarizers."""
+
+    def __init__(self):
+        # Initialize the traditional TF-IDF summarizer
+        self.traditional_tfidf = ArabicSummarizer("models/traditional_tfidf_vectorizer_summarization.joblib")
+
+        # Initialize the BERT summarizer lazily to avoid startup delays
+        self.bert_summarizer = None
+
+    def get_summarizer(self, model_type: str):
+        """Get a summarizer instance for the given model type."""
+        if model_type == "traditional_tfidf":
+            return self.traditional_tfidf
+        elif model_type == "modern_seq2seq":
+            # TODO: Implement the seq2seq summarizer; for now, fall back to TF-IDF
+            return self.traditional_tfidf
+        elif model_type == "modern_bert":
+            # Initialize the BERT summarizer on first use
+            if self.bert_summarizer is None:
+                try:
+                    print("Loading BERT summarizer...")
+                    self.bert_summarizer = BERTExtractiveSummarizer()
+                    print("BERT summarizer loaded successfully!")
+                except Exception as e:
+                    print(f"Failed to load BERT summarizer: {e}")
+                    raise ValueError(f"BERT summarizer initialization failed: {e}")
+            return self.bert_summarizer
+        else:
+            raise ValueError(f"Unknown summarizer model: {model_type}")
+
+    def summarize(self, text: str, num_sentences: int, model_type: str) -> Dict[str, Any]:
+        """Summarize text using the specified model."""
+        try:
+            print(f"SummarizerManager: Using model '{model_type}' for text with {len(text)} characters")
+            summarizer_instance = self.get_summarizer(model_type)
+            result = summarizer_instance.summarize(text, num_sentences)
+
+            # Add debugging info
+            print(f"SummarizerManager: {model_type} selected indices: {result.get('selected_indices', [])}")
+            print(f"SummarizerManager: {model_type} summary preview: '{result.get('summary', '')[:100]}...'")
+
+            # Ensure sentence_scores is always a list (not None)
+            if result.get("sentence_scores") is None:
+                result["sentence_scores"] = []
+
+            return result
+        except Exception as e:
+            # If BERT fails, provide a helpful error message
+            if model_type == "modern_bert":
+                raise ValueError(f"BERT summarization failed: {str(e)}. This might be due to missing dependencies (torch, transformers) or network issues downloading the model.")
+            else:
+                raise
+
+
+summarizer_manager = SummarizerManager()
+
+
+def check_model_availability():
+    """Check which models are actually available and working."""
+    available_models = {
+        "traditional_svm": True,   # Always available
+        "modern_lstm": True,       # Always available
+        "modern_bert": False       # Will be checked below
+    }
+
+    # Test BERT model availability
+    try:
+        from modern_classifier import ModernClassifier
+        # Try to create a BERT classifier instance
+        bert_classifier = ModernClassifier("bert", "models/modern_bert_classifier.safetensors")
+        available_models["modern_bert"] = True
+    except Exception as e:
+        print(f"BERT model not available: {e}")
+        available_models["modern_bert"] = False
+
+    return available_models
+
+
+# Check model availability at startup
+AVAILABLE_MODELS = check_model_availability()
+
+
+def _map_classification_model(frontend_model: str) -> str:
+    """Map frontend model names to backend model names."""
+    # Check whether the requested model is available
+    if not AVAILABLE_MODELS.get(frontend_model, False):
+        raise ValueError(f"Model '{frontend_model}' is not available. Available models: {[k for k, v in AVAILABLE_MODELS.items() if v]}")
+
+    mapping = {
+        "traditional_svm": "traditional_svm",
+        "modern_lstm": "modern_lstm",
+        "modern_bert": "modern_bert"
+    }
+    return mapping.get(frontend_model, frontend_model)
+
+
+def _create_preprocessing_steps(steps: Dict[str, Any]) -> PreprocessingSteps:
+    """Build the preprocessing-steps response from only the fields that exist."""
+    return PreprocessingSteps(
+        original=steps.get("original", ""),
+        stripped_lowered=steps.get("stripped_lowered"),
+        normalized=steps.get("normalized"),
+        diacritics_removed=steps.get("diacritics_removed"),
+        punctuation_removed=steps.get("punctuation_removed"),
+        repeated_chars_reduced=steps.get("repeated_chars_reduced"),
+        whitespace_normalized=steps.get("whitespace_normalized"),
+        numbers_removed=steps.get("numbers_removed"),
+        tokenized=steps.get("tokenized"),
+        stopwords_removed=steps.get("stopwords_removed"),
+        stemmed=steps.get("stemmed"),
+        final=steps.get("final", "")
+    )
+
+
+# Main endpoints
 
 
 @app.get("/")
 def read_root() -> Dict[str, Any]:
     """API welcome message and endpoint documentation."""
@@ -190,162 +238,128 @@ def read_root() -> Dict[str, Any]:
             "openapi_schema": "/openapi.json",
         },
         "endpoints": {
+            "preprocess": "POST /preprocess - Preprocess text with detailed steps",
             "classify": "POST /classify - Classify Arabic text",
-            "classify_batch": "POST /classify/batch - Classify multiple texts",
             "summarize": "POST /summarize - Summarize Arabic text",
-            "analyze": "POST /analyze - Both classify and summarize",
-            "preprocess": "POST /preprocess - Preprocess text with detailed steps",
-            "text_analysis": "POST /text-analysis - Analyze text characteristics",
-            "sentence_analysis": "POST /sentence-analysis - Detailed sentence analysis",
-            "model_info": "GET /model-info - Get model information",
-            "available_models": "GET /models - Get all available models",
         },
     }
 
 
-@app.post("/classify", response_model=ClassificationResponse)
-def classify_text(data: TextInput) -> ClassificationResponse:
-    """Classify Arabic text with probability distribution and metadata."""
-    try:
-        model_name = data.model.value if data.model else None
-        result = model_manager.predict(data.text, model_name)
-        return result
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
-
-
-@app.post("/classify/batch", response_model=BatchClassificationResponse)
-def classify_texts(data: BatchTextInput) -> BatchClassificationResponse:
-    """Classify multiple Arabic texts in batch."""
-    try:
-        model_name = data.model.value if data.model else None
-        results = model_manager.predict_batch(data.texts, model_name)
-        used_model = model_name or model_manager.default_model
-
-        return {
-            "results": results,
-            "total_texts": len(data.texts),
-            "model_used": used_model,
-        }
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Batch classification failed: {str(e)}"
-        )
-
-
-@app.post("/summarize", response_model=SummarizationResponse)
-def summarize_text(data: TextInputWithSentences) -> SummarizationResponse:
-    """Summarize Arabic text with sentence analysis."""
-    try:
-        result = summarizer.summarize(data.text, data.num_sentences)
-        return result
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
-
-
-@app.post("/sentence-analysis", response_model=SentenceAnalysisResponse)
-def analyze_sentences(data: TextInput) -> SentenceAnalysisResponse:
-    """Analyze all sentences with scores and rankings."""
-    try:
-        result = summarizer.get_sentence_analysis(data.text)
-        return result
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Sentence analysis failed: {str(e)}"
-        )
-
-
-@app.post("/analyze", response_model=CompleteAnalysisResponse)
-def analyze_text_complete(data: TextInputWithSentences) -> CompleteAnalysisResponse:
-    """Complete analysis: classification, summarization, and text statistics."""
-    try:
-        model_name = data.model.value if data.model else None
-
-        classification_result = model_manager.predict(data.text, model_name)
-        summarization_result = summarizer.summarize(data.text, data.num_sentences)
-        text_stats = preprocessor.analyze_text(data.text)
-
-        return {
-            "original_text": data.text,
-            "text_analysis": text_stats,
-            "classification": classification_result,
-            "summarization": summarization_result,
-        }
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Complete analysis failed: {str(e)}"
-        )
-
-
-@app.post("/preprocess", response_model=PreprocessingResponse)
-def preprocess_text(data: PreprocessingInput) -> PreprocessingResponse:
-    """Preprocess text with step-by-step breakdown."""
-    try:
-        steps = preprocessor.get_preprocessing_steps(data.text, data.task_type.value)
-        return {"task_type": data.task_type.value, "preprocessing_steps": steps}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
-
-
-@app.post("/text-analysis", response_model=TextAnalysisResponse)
-def analyze_text_characteristics(data: TextInput) -> TextAnalysisResponse:
-    """Analyze text characteristics and statistics."""
-    try:
-        analysis = preprocessor.analyze_text(data.text)
-        return {"text": data.text, "analysis": analysis}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
-
-
-@app.get("/model-info", response_model=ModelInfoResponse)
-def get_model_info(model: Optional[ModelType] = None) -> ModelInfoResponse:
-    """Get information about a specific model or the default model."""
-    try:
-        model_name = model.value if model else None
-        classifier_info = model_manager.get_model_info(model_name)
-        return {
-            "classifier": classifier_info,
-            "summarizer": {
-                "vectorizer_loaded": hasattr(summarizer, "vectorizer"),
-                "model_type": "TF-IDF based summarization",
-            },
-        }
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Failed to get model info: {str(e)}"
-        )
-
-
-@app.get("/models", response_model=AvailableModelsResponse)
-def get_available_models() -> AvailableModelsResponse:
-    """Get all available classification models."""
-    try:
-        models = model_manager.get_available_models()
-        return {"models": models, "current_model": model_manager.default_model}
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Failed to get available models: {str(e)}"
-        )
-
-
-@app.get("/models/cache")
-def get_cache_status() -> Dict[str, Any]:
-    """Get information about cached models."""
-    try:
-        return model_manager.get_cache_status()
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Failed to get cache status: {str(e)}"
-        )
-
-
-@app.post("/models/cache/clear")
-def clear_model_cache(model: Optional[ModelType] = None) -> Dict[str, Any]:
-    """Clear model cache for a specific model or all models."""
-    try:
-        model_name = model.value if model else None
-        return model_manager.clear_cache(model_name)
-    except Exception as e:
-        raise HTTPException(
-            status_code=500, detail=f"Failed to clear cache: {str(e)}"
-        )
+@app.post("/preprocess", response_model=PreprocessingResponse)
+def preprocess_text(req: PreprocessRequest) -> PreprocessingResponse:
+    """Preprocess text with step-by-step breakdown."""
+    try:
+        steps = preprocessor.get_preprocessing_steps(req.text, req.task_type.value)
+        preprocessing_steps = _create_preprocessing_steps(steps)
+        return PreprocessingResponse(
+            task_type=req.task_type.value,
+            preprocessing_steps=preprocessing_steps
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
+
+
+@app.post("/classify", response_model=ClassificationResponse)
+def classify_text(req: ClassificationRequest) -> ClassificationResponse:
+    """Classify Arabic text."""
+    try:
+        backend_model = _map_classification_model(req.model.value)
+        result = model_manager.predict(req.text, backend_model)
+
+        return ClassificationResponse(
+            prediction=result["prediction"],
+            confidence=result["confidence"],
+            probability_distribution=result["probability_distribution"],
+            cleaned_text=result["cleaned_text"],
+            model_used=req.model.value,  # Echo back the frontend model name
+            prediction_index=result.get("prediction_index"),
+            prediction_metadata=result.get("prediction_metadata")
+        )
+    except ValueError as e:
+        # Handle model availability errors
+        if "not available" in str(e):
+            raise HTTPException(
+                status_code=503,
+                detail=f"Model unavailable: {str(e)}. Check /models/available for current model status."
+            )
+        else:
+            raise HTTPException(status_code=400, detail=str(e))
+    except Exception as e:
+        error_msg = str(e)
+
+        # Provide more helpful error messages for common issues
+        if "BERT" in error_msg and ("connect" in error_msg.lower() or "internet" in error_msg.lower() or "huggingface" in error_msg.lower()):
+            raise HTTPException(
+                status_code=503,
+                detail=f"BERT model unavailable: The model requires an internet connection to download tokenizer/config from Hugging Face, or the files need to be cached locally. Error: {error_msg}"
+            )
+        elif "modern_bert" in req.model.value and "Error loading" in error_msg:
+            raise HTTPException(
+                status_code=503,
+                detail=f"BERT model loading failed: {error_msg}. Please ensure the model files are properly configured and Hugging Face dependencies are available."
+            )
+        else:
+            raise HTTPException(status_code=500, detail=f"Classification failed: {error_msg}")
+
+
+@app.post("/summarize", response_model=SummarizationResponse)
+def summarize_text(req: SummarizationRequest) -> SummarizationResponse:
+    """Summarize Arabic text."""
+    try:
+        result = summarizer_manager.summarize(req.text, req.num_sentences, req.model.value)
+
+        return SummarizationResponse(
+            summary=result["summary"],
+            original_sentence_count=result["original_sentence_count"],
+            summary_sentence_count=result["summary_sentence_count"],
+            sentences=result["sentences"],
+            selected_indices=result["selected_indices"],
+            sentence_scores=result["sentence_scores"],
+            model_used=req.model.value,  # Echo back the frontend model name
+            top_sentence_scores=result.get("top_sentence_scores")
+        )
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
+
+
+@app.get("/models/available")
+def get_available_models() -> Dict[str, Any]:
+    """Get information about which models are currently available."""
+    return {
+        "classification_models": {
+            "traditional_svm": {
+                "available": AVAILABLE_MODELS.get("traditional_svm", False),
+                "description": "Traditional SVM classifier with TF-IDF vectorization"
+            },
+            "modern_lstm": {
+                "available": AVAILABLE_MODELS.get("modern_lstm", False),
+                "description": "Modern LSTM-based neural network classifier"
+            },
+            "modern_bert": {
+                "available": AVAILABLE_MODELS.get("modern_bert", False),
+                "description": "Modern BERT-based transformer classifier",
+                "note": "Requires internet connection or cached Hugging Face models" if not AVAILABLE_MODELS.get("modern_bert", False) else None
+            }
+        },
+        "summarization_models": {
+            "traditional_tfidf": {
+                "available": True,
+                "description": "Traditional TF-IDF based extractive summarization"
+            },
+            "modern_seq2seq": {
+                "available": True,
+                "description": "Modern sequence-to-sequence summarization (currently uses TF-IDF fallback)",
+                "note": "Implementation in progress - currently falls back to TF-IDF"
+            },
+            "modern_bert": {
+                "available": True,
+                "description": "Modern BERT-based extractive summarization using asafaya/bert-base-arabic",
+                "note": "Requires torch and transformers dependencies. Model will be downloaded on first use."
+            }
+        },
+        "status": {
+            "total_classification_models": len([k for k, v in AVAILABLE_MODELS.items() if v]),
+            "total_available": len([k for k, v in AVAILABLE_MODELS.items() if v]),
+            "unavailable_models": [k for k, v in AVAILABLE_MODELS.items() if not v]
+        }
+    }
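For orientation, a minimal client sketch against the reworked endpoints above (a hedged example, not part of the commit: it assumes the API runs locally on port 8000 and uses the third-party requests library; the JSON bodies mirror the request-model examples in the diff):

    import requests

    BASE = "http://localhost:8000"  # assumed local dev server

    # Preprocess: task_type selects the classification or summarization pipeline
    r = requests.post(f"{BASE}/preprocess",
                      json={"text": "هذا نص عربي للمعالجة", "task_type": "classification"})
    print(r.json()["preprocessing_steps"]["final"])

    # Classify: model must be one of the ClassificationModelType values
    r = requests.post(f"{BASE}/classify",
                      json={"text": "هذا نص عربي للتصنيف", "model": "traditional_svm"})
    print(r.json()["prediction"], r.json()["confidence"])

    # Summarize: model must be one of the SummarizationModelType values
    r = requests.post(f"{BASE}/summarize",
                      json={"text": "هذا نص عربي طويل للتلخيص", "num_sentences": 3,
                            "model": "traditional_tfidf"})
    print(r.json()["summary"])

    # Availability report produced by the startup check
    print(requests.get(f"{BASE}/models/available").json()["status"])

Note that /classify now returns 503 rather than 500 when the requested model failed the startup availability check, so clients can distinguish a missing model from a genuine server error.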
bert_summarizer.py ADDED
@@ -0,0 +1,114 @@
+import torch
+import numpy as np
+import re
+from typing import Dict, List, Any
+from transformers import BertTokenizer, BertModel
+from sklearn.metrics.pairwise import cosine_similarity
+from preprocessor import preprocess_for_summarization
+
+
+class BERTExtractiveSummarizer:
+    def __init__(self, model_name='aubmindlab/bert-base-arabertv02'):
+        """Initialize BERT-based Arabic summarizer."""
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        print(f"Using device: {self.device}")
+
+        # Load tokenizer and model
+        self.tokenizer = BertTokenizer.from_pretrained(model_name)
+        self.model = BertModel.from_pretrained(model_name)
+        self.model.to(self.device)
+        self.model.eval()
+
+    def get_sentence_embeddings(self, sentences: List[str]) -> np.ndarray:
+        """Get BERT embeddings for sentences."""
+        embeddings = []
+
+        with torch.no_grad():
+            for sentence in sentences:
+                # Tokenize
+                inputs = self.tokenizer(
+                    sentence,
+                    return_tensors='pt',
+                    max_length=512,
+                    truncation=True,
+                    padding=True
+                ).to(self.device)
+
+                # Get embeddings
+                outputs = self.model(**inputs)
+                # Use the CLS token embedding
+                embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
+                embeddings.append(embedding.squeeze())
+
+        return np.array(embeddings)
+
+    def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
+        """
+        Summarize Arabic text using BERT extractive summarization.
+        Returns the same structure as the other summarizers for consistency.
+        """
+        print(f"BERT Summarizer: Processing text with {len(text)} characters")
+
+        # Use the same preprocessing as TF-IDF for a fair comparison
+        cleaned_text = preprocess_for_summarization(text)
+        print(f"BERT Summarizer: After preprocessing: '{cleaned_text[:100]}...'")
+
+        # Split into sentences - same approach as TF-IDF
+        sentences = re.split(r'[.!؟\n]+', cleaned_text)
+        sentences = [s.strip() for s in sentences if s.strip()]  # Same as TF-IDF
+
+        print(f"BERT Summarizer: Found {len(sentences)} sentences")
+        original_sentence_count = len(sentences)
+
+        # If we have fewer sentences than requested, return all of them
+        if len(sentences) <= num_sentences:
+            print(f"BERT Summarizer: Returning all {len(sentences)} sentences (fewer than requested)")
+            return {
+                "summary": cleaned_text.strip(),  # Use cleaned text like TF-IDF
+                "original_sentence_count": original_sentence_count,
+                "summary_sentence_count": len(sentences),
+                "sentences": sentences,
+                "selected_indices": list(range(len(sentences))),
+                "sentence_scores": [1.0] * len(sentences)  # All sentences selected
+            }
+
+        print("BERT Summarizer: Getting sentence embeddings...")
+        # Get sentence embeddings
+        sentence_embeddings = self.get_sentence_embeddings(sentences)
+        print(f"BERT Summarizer: Got embeddings shape: {sentence_embeddings.shape}")
+
+        # Calculate the document embedding (mean of all sentence embeddings)
+        doc_embedding = np.mean(sentence_embeddings, axis=0)
+
+        # Calculate similarity scores
+        similarities = cosine_similarity([doc_embedding], sentence_embeddings)[0]
+        print(f"BERT Summarizer: Similarity scores: {similarities}")
+
+        # Get the top sentences (indices with the highest scores)
+        top_indices = np.argsort(similarities)[-num_sentences:]
+        print(f"BERT Summarizer: Top indices: {top_indices}")
+
+        # Sort indices to maintain the original order in the summary
+        top_indices_sorted = sorted(top_indices)
+        # Convert numpy indices to regular ints for JSON serialization
+        top_indices_sorted = [int(i) for i in top_indices_sorted]
+        print(f"BERT Summarizer: Selected indices (in order): {top_indices_sorted}")
+
+        # Get the selected sentences and their scores
+        selected_sentences = [sentences[i] for i in top_indices_sorted]
+        selected_scores = [float(similarities[i]) for i in top_indices_sorted]
+
+        print(f"BERT Summarizer: Selected sentences: {[s[:50] + '...' for s in selected_sentences]}")
+
+        # Create the summary by joining the selected sentences
+        summary = ' '.join(selected_sentences)
+
+        return {
+            "summary": summary,
+            "original_sentence_count": original_sentence_count,
+            "summary_sentence_count": len(selected_sentences),
+            "sentences": sentences,  # All original sentences
+            "selected_indices": top_indices_sorted,
+            "sentence_scores": selected_scores,
+            "top_sentence_scores": selected_scores  # Additional info
+        }
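A minimal usage sketch for the new class (an illustration, assuming torch, transformers, and scikit-learn are installed and the aubmindlab checkpoint can be downloaded or is already cached):

    from bert_summarizer import BERTExtractiveSummarizer

    summarizer = BERTExtractiveSummarizer()  # fetches the checkpoint on first use
    result = summarizer.summarize("جملة أولى. جملة ثانية. جملة ثالثة. جملة رابعة.", num_sentences=2)
    print(result["summary"])
    print(result["selected_indices"], result["sentence_scores"])

Design-wise, scoring each sentence by cosine similarity to the mean document embedding is the standard centroid heuristic for extractive summarization: it favors sentences closest to the overall topic, at the cost of sometimes selecting redundant ones.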
examples.py CHANGED
@@ -269,8 +269,8 @@ RESPONSE_EXAMPLES = {
         "model_description": "Traditional SVM classifier with TF-IDF vectorization",
         "model_config": {
             "type": "traditional",
-            "classifier_path": "traditional_svm_classifier.joblib",
-            "vectorizer_path": "traditional_tfidf_vectorizer_classifier.joblib",
+            "classifier_path": "models/traditional_svm_classifier.joblib",
+            "vectorizer_path": "models/traditional_tfidf_vectorizer_classifier.joblib",
             "description": "Traditional SVM classifier with TF-IDF vectorization"
         },
         "is_cached": True
model_manager.py CHANGED
@@ -15,15 +15,15 @@ class ModelManager:
     AVAILABLE_MODELS = {
         "traditional_svm": {
             "type": "traditional",
-            "classifier_path": "traditional_svm_classifier.joblib",
-            "vectorizer_path": "traditional_tfidf_vectorizer_classifier.joblib",
+            "classifier_path": "models/traditional_svm_classifier.joblib",
+            "vectorizer_path": "models/traditional_tfidf_vectorizer_classifier.joblib",
             "description": "Traditional SVM classifier with TF-IDF vectorization"
         },
 
         "modern_bert": {
             "type": "modern",
             "model_type": "bert",
-            "model_path": "modern_bert_classifier.safetensors",
+            "model_path": "models/modern_bert_classifier.safetensors",
             "config_path": "config.json",
             "description": "Modern BERT-based transformer classifier"
         },
@@ -31,7 +31,7 @@ class ModelManager:
         "modern_lstm": {
             "type": "modern",
             "model_type": "lstm",
-            "model_path": "modern_lstm_classifier.pth",
+            "model_path": "models/modern_lstm_classifier.pth",
             "description": "Modern LSTM-based neural network classifier"
         }
     }
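The registry above is what the /classify endpoint ultimately consumes. A short sketch of the calls involved, using the constructor and predict() signature as they appear in app.py (the result keys match what classify_text reads):

    from model_manager import ModelManager

    manager = ModelManager(default_model="traditional_svm")
    print(ModelManager.AVAILABLE_MODELS["modern_lstm"]["model_path"])
    # -> models/modern_lstm_classifier.pth

    result = manager.predict("هذا نص عربي للتصنيف", "traditional_svm")
    print(result["prediction"], result["confidence"])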
models/Seq2seq/seq2seq_config.json ADDED
@@ -0,0 +1 @@
+{"ENC_MAXLEN": 1900, "DEC_MAXLEN": 178, "SRC_VOCAB_SIZE": 20000, "TGT_VOCAB_SIZE": 10000, "EMB_DIM": 128, "HID_DIM": 256}

models/Seq2seq/seq2seq_model.h5 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a35f8f2f2dc4f77570cc86c77a9fb90a1649d79d3e5e632be92499e889958a27
+size 117152336

models/Seq2seq/src_tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff87d78b4f45fa3aaa9b9a43c0d94e7aecc1f7f18e0ab5c4caed15a0f1ca61ee
+size 12722191

models/Seq2seq/tgt_tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca4e33cc944afd29a11b4fed11da27787ef604e7403b765ab589a7b304059e95
+size 2577556
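These three artifacts back the planned modern_seq2seq summarizer, which app.py currently stubs out with a TF-IDF fallback. A hedged loading sketch, assuming a standard Keras/pickle workflow (the actual seq2seq inference code is not part of this change set, and the .h5/.pkl files are Git LFS pointers that must be fetched with `git lfs pull` first):

    import json
    import pickle
    from tensorflow.keras.models import load_model  # assumes TensorFlow is installed

    with open("models/Seq2seq/seq2seq_config.json") as f:
        cfg = json.load(f)  # ENC_MAXLEN=1900, DEC_MAXLEN=178, EMB_DIM=128, HID_DIM=256

    model = load_model("models/Seq2seq/seq2seq_model.h5")  # may need custom_objects for custom layers

    with open("models/Seq2seq/src_tokenizer.pkl", "rb") as f:
        src_tokenizer = pickle.load(f)
    with open("models/Seq2seq/tgt_tokenizer.pkl", "rb") as f:
        tgt_tokenizer = pickle.load(f)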
modern_bert_classifier.safetensors → models/modern_bert_classifier.safetensors RENAMED
File without changes
modern_lstm_classifier.pth → models/modern_lstm_classifier.pth RENAMED
File without changes
traditional_svm_classifier.joblib → models/traditional_svm_classifier.joblib RENAMED
File without changes
traditional_tfidf_vectorizer_classifier.joblib → models/traditional_tfidf_vectorizer_classifier.joblib RENAMED
File without changes
traditional_tfidf_vectorizer_summarization.joblib → models/traditional_tfidf_vectorizer_summarization.joblib RENAMED
File without changes
modern_classifier.py CHANGED
@@ -65,15 +65,71 @@ class ModernClassifier:
     def _load_bert_model(self):
         """Load BERT model from safetensors."""
         try:
-            self.tokenizer = AutoTokenizer.from_pretrained('aubmindlab/bert-base-arabertv2')
+            # Try different Arabic BERT tokenizers that match the 32K vocabulary
+            tokenizer_options = [
+                'asafaya/bert-base-arabic',         # This one has a 32K vocab
+                'aubmindlab/bert-base-arabertv02',  # Alternative
+                'aubmindlab/bert-base-arabertv2'    # Fallback (64K vocab)
+            ]
+
+            self.tokenizer = None
+            for tokenizer_name in tokenizer_options:
+                try:
+                    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, local_files_only=True)
+                    # Test whether the vocabulary size matches
+                    if len(tokenizer.vocab) <= 32000:
+                        self.tokenizer = tokenizer
+                        print(f"Using tokenizer: {tokenizer_name} (vocab size: {len(tokenizer.vocab)})")
+                        break
+                except Exception:
+                    continue
+
+            if self.tokenizer is None:
+                # Try downloading if local files don't work
+                for tokenizer_name in tokenizer_options:
+                    try:
+                        tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+                        if len(tokenizer.vocab) <= 32000:
+                            self.tokenizer = tokenizer
+                            print(f"Downloaded tokenizer: {tokenizer_name} (vocab size: {len(tokenizer.vocab)})")
+                            break
+                    except Exception:
+                        continue
+
+            if self.tokenizer is None:
+                raise RuntimeError("No compatible Arabic BERT tokenizer found with 32K vocabulary")
+
             state_dict = load_file(self.model_path)
             embed_key = next(k for k in state_dict if 'embeddings.word_embeddings.weight' in k)
             checkpoint_vocab_size = state_dict[embed_key].shape[0]
-            config = AutoConfig.from_pretrained(
-                'aubmindlab/bert-base-arabertv2',
-                num_labels=len(self.classes),
-                vocab_size=checkpoint_vocab_size
-            )
+
+            # Try to load the config from local files first
+            try:
+                config = AutoConfig.from_pretrained(
+                    'aubmindlab/bert-base-arabertv2',
+                    num_labels=len(self.classes),
+                    vocab_size=checkpoint_vocab_size,
+                    local_files_only=True
+                )
+            except Exception:
+                try:
+                    config = AutoConfig.from_pretrained(
+                        'aubmindlab/bert-base-arabertv2',
+                        num_labels=len(self.classes),
+                        vocab_size=checkpoint_vocab_size
+                    )
+                except Exception:
+                    # Fallback: create a basic BERT config
+                    from transformers import BertConfig
+                    config = BertConfig(
+                        vocab_size=checkpoint_vocab_size,
+                        hidden_size=768,
+                        num_hidden_layers=12,
+                        num_attention_heads=12,
+                        intermediate_size=3072,
+                        num_labels=len(self.classes)
+                    )
+
             self.model = AutoModelForSequenceClassification.from_config(config)
             self.model.resize_token_embeddings(checkpoint_vocab_size)
             self.model.load_state_dict(state_dict, strict=False)
@@ -116,6 +172,15 @@ class ModernClassifier:
             max_length=512
         )
 
+        # CRITICAL FIX: Check for a vocabulary mismatch and clamp token IDs
+        input_ids = inputs['input_ids']
+        max_token_id = input_ids.max().item()
+        model_vocab_size = self.model.config.vocab_size
+
+        if max_token_id >= model_vocab_size:
+            # Clamp token IDs to the valid range to prevent "index out of range" errors
+            inputs['input_ids'] = torch.clamp(input_ids, 0, model_vocab_size - 1)
+
         return {key: value.to(self.device) for key, value in inputs.items()}
 
     def _preprocess_text_for_lstm(self, text: str) -> torch.Tensor:
@@ -150,7 +215,11 @@ class ModernClassifier:
         inputs = self._preprocess_text_for_lstm(text)
         logits = self.model(inputs)
 
-        probabilities = torch.softmax(logits, dim=-1).cpu().numpy()[0]
+        probabilities = torch.softmax(logits, dim=-1).cpu().numpy()
+
+        # Handle the batch dimension
+        if len(probabilities.shape) > 1:
+            probabilities = probabilities[0]
 
         prediction_index = int(np.argmax(probabilities))
         prediction = self.classes[prediction_index]
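The token-ID clamp above masks the tokenizer/checkpoint vocabulary mismatch rather than resolving it: any out-of-range ID is silently remapped to the last embedding row, which degrades accuracy but avoids the crash. A self-contained demonstration of the failure mode and the fix:

    import torch

    emb = torch.nn.Embedding(num_embeddings=32000, embedding_dim=8)
    ids = torch.tensor([[5, 31999, 45000]])  # 45000 exceeds the 32K vocabulary

    # emb(ids) would raise "index out of range in self"; clamping keeps IDs valid
    safe = torch.clamp(ids, 0, emb.num_embeddings - 1)
    out = emb(safe)  # works, but token 45000 now shares row 31999's embedding
    print(out.shape)  # torch.Size([1, 3, 8])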
summarizer.py CHANGED
@@ -8,7 +8,7 @@ from preprocessor import preprocess_for_summarization
 class ArabicSummarizer:
     """Arabic text summarizer using TF-IDF scoring."""
 
-    def __init__(self, vectorizer_path: str = "traditional_tfidf_vectorizer_summarization.joblib"):
+    def __init__(self, vectorizer_path: str = "models/traditional_tfidf_vectorizer_summarization.joblib"):
         self.vectorizer = joblib.load(vectorizer_path)
 
     def summarize(self, text: str, num_sentences: int = 3) -> Dict[str, Any]:
traditional_classifier.py CHANGED
@@ -9,8 +9,8 @@ class TraditionalClassifier:
 
     def __init__(
         self,
-        classifier_path: str = "traditional_svm_classifier.joblib",
-        vectorizer_path: str = "traditional_tfidf_vectorizer_classifier.joblib",
+        classifier_path: str = "models/traditional_svm_classifier.joblib",
+        vectorizer_path: str = "models/traditional_tfidf_vectorizer_classifier.joblib",
     ):
         self.model = joblib.load(classifier_path)
         self.vectorizer = joblib.load(vectorizer_path)
  self.vectorizer = joblib.load(vectorizer_path)