moabos committed on
Commit
29dedef
·
1 Parent(s): 354c6a0

chore: add request and response examples for API requests, remove redundant lines

Browse files
Files changed (3) hide show
  1. app.py +162 -19
  2. classifier.py +82 -60
  3. examples.py +214 -0
app.py CHANGED
@@ -1,10 +1,18 @@
1
  from typing import Optional, List, Dict, Any
2
  from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
 
4
 
5
  from classifier import ArabicClassifier
6
  from summarizer import ArabicSummarizer
7
  from preprocessor import ArabicPreprocessor
 
 
 
 
 
 
 
8
 
9
  app = FastAPI(
10
  title="Arabic Text Analysis API",
@@ -19,20 +27,155 @@ preprocessor = ArabicPreprocessor()
19
 
20
  class TextInput(BaseModel):
21
  text: str
 
 
 
 
 
 
22
 
23
 
24
  class TextInputWithSentences(BaseModel):
25
  text: str
26
  num_sentences: Optional[int] = 3
 
 
 
 
 
 
27
 
28
 
29
  class BatchTextInput(BaseModel):
30
  texts: List[str]
 
 
 
 
 
 
31
 
32
 
33
  class PreprocessingInput(BaseModel):
34
  text: str
35
- task_type: Optional[str] = "classification"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
 
38
  @app.get("/")
@@ -58,8 +201,8 @@ def read_root() -> Dict[str, Any]:
58
  }
59
 
60
 
61
- @app.post("/classify")
62
- def classify_text(data: TextInput) -> Dict[str, Any]:
63
  """Classify Arabic text with probability distribution and metadata."""
64
  try:
65
  result = classifier.predict(data.text)
@@ -68,8 +211,8 @@ def classify_text(data: TextInput) -> Dict[str, Any]:
68
  raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
69
 
70
 
71
- @app.post("/classify/batch")
72
- def classify_texts(data: BatchTextInput) -> Dict[str, Any]:
73
  """Classify multiple Arabic texts in batch."""
74
  try:
75
  results = classifier.predict_batch(data.texts)
@@ -82,8 +225,8 @@ def classify_texts(data: BatchTextInput) -> Dict[str, Any]:
82
  raise HTTPException(status_code=500, detail=f"Batch classification failed: {str(e)}")
83
 
84
 
85
- @app.post("/summarize")
86
- def summarize_text(data: TextInputWithSentences) -> Dict[str, Any]:
87
  """Summarize Arabic text with sentence analysis."""
88
  try:
89
  result = summarizer.summarize(data.text, data.num_sentences)
@@ -92,8 +235,8 @@ def summarize_text(data: TextInputWithSentences) -> Dict[str, Any]:
92
  raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
93
 
94
 
95
- @app.post("/sentence-analysis")
96
- def analyze_sentences(data: TextInput) -> Dict[str, Any]:
97
  """Analyze all sentences with scores and rankings."""
98
  try:
99
  result = summarizer.get_sentence_analysis(data.text)
@@ -102,8 +245,8 @@ def analyze_sentences(data: TextInput) -> Dict[str, Any]:
102
  raise HTTPException(status_code=500, detail=f"Sentence analysis failed: {str(e)}")
103
 
104
 
105
- @app.post("/analyze")
106
- def analyze_text_complete(data: TextInputWithSentences) -> Dict[str, Any]:
107
  """Complete analysis: classification, summarization, and text statistics."""
108
  try:
109
  classification_result = classifier.predict(data.text)
@@ -120,21 +263,21 @@ def analyze_text_complete(data: TextInputWithSentences) -> Dict[str, Any]:
120
  raise HTTPException(status_code=500, detail=f"Complete analysis failed: {str(e)}")
121
 
122
 
123
- @app.post("/preprocess")
124
- def preprocess_text(data: PreprocessingInput) -> Dict[str, Any]:
125
  """Preprocess text with step-by-step breakdown."""
126
  try:
127
- steps = preprocessor.get_preprocessing_steps(data.text, data.task_type)
128
  return {
129
- "task_type": data.task_type,
130
  "preprocessing_steps": steps
131
  }
132
  except Exception as e:
133
  raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
134
 
135
 
136
- @app.post("/text-analysis")
137
- def analyze_text_characteristics(data: TextInput) -> Dict[str, Any]:
138
  """Analyze text characteristics and statistics."""
139
  try:
140
  analysis = preprocessor.analyze_text(data.text)
@@ -146,8 +289,8 @@ def analyze_text_characteristics(data: TextInput) -> Dict[str, Any]:
146
  raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
147
 
148
 
149
- @app.get("/model-info")
150
- def get_model_info() -> Dict[str, Any]:
151
  """Get information about loaded models."""
152
  try:
153
  classifier_info = classifier.get_model_info()
 
1
  from typing import Optional, List, Dict, Any
2
  from fastapi import FastAPI, HTTPException
3
  from pydantic import BaseModel
4
+ from enum import Enum
5
 
6
  from classifier import ArabicClassifier
7
  from summarizer import ArabicSummarizer
8
  from preprocessor import ArabicPreprocessor
9
+ from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
10
+
11
+
12
+ class TaskType(str, Enum):
13
+ CLASSIFICATION = "classification"
14
+ SUMMARIZATION = "summarization"
15
+
16
 
17
  app = FastAPI(
18
  title="Arabic Text Analysis API",
 
27
 
28
  class TextInput(BaseModel):
29
  text: str
30
+
31
+ model_config = {
32
+ "json_schema_extra": {
33
+ "example": REQUEST_EXAMPLES["text_input"]
34
+ }
35
+ }
36
 
37
 
38
  class TextInputWithSentences(BaseModel):
39
  text: str
40
  num_sentences: Optional[int] = 3
41
+
42
+ model_config = {
43
+ "json_schema_extra": {
44
+ "example": REQUEST_EXAMPLES["text_input_with_sentences"]
45
+ }
46
+ }
47
 
48
 
49
  class BatchTextInput(BaseModel):
50
  texts: List[str]
51
+
52
+ model_config = {
53
+ "json_schema_extra": {
54
+ "example": REQUEST_EXAMPLES["batch_text_input"]
55
+ }
56
+ }
57
 
58
 
59
  class PreprocessingInput(BaseModel):
60
  text: str
61
+ task_type: TaskType = TaskType.CLASSIFICATION
62
+
63
+ model_config = {
64
+ "json_schema_extra": {
65
+ "example": REQUEST_EXAMPLES["preprocessing_input"]
66
+ }
67
+ }
68
+
69
+
70
+ class ClassificationResponse(BaseModel):
71
+ prediction: str
72
+ prediction_index: int
73
+ confidence: float
74
+ probability_distribution: Dict[str, float]
75
+ cleaned_text: str
76
+ model_used: str
77
+ prediction_metadata: Dict[str, Any]
78
+
79
+ model_config = {
80
+ "protected_namespaces": (),
81
+ "json_schema_extra": {
82
+ "example": RESPONSE_EXAMPLES["classification"],
83
+ "schema_extra": {
84
+ "properties": {
85
+ "prediction_index": {
86
+ "description": "Numerical index of the predicted class (0=culture, 1=economy, 2=international, 3=local, 4=religion, 5=sports)"
87
+ }
88
+ }
89
+ }
90
+ }
91
+ }
92
+
93
+
94
+ class SummarizationResponse(BaseModel):
95
+ summary: str
96
+ original_sentence_count: int
97
+ summary_sentence_count: int
98
+ sentences: List[str]
99
+ selected_indices: List[int]
100
+ sentence_scores: Optional[List[float]]
101
+ top_sentence_scores: Optional[List[float]]
102
+
103
+ model_config = {
104
+ "json_schema_extra": {
105
+ "example": RESPONSE_EXAMPLES["summarization"]
106
+ }
107
+ }
108
+
109
+
110
+ class TextAnalysisResponse(BaseModel):
111
+ text: str
112
+ analysis: Dict[str, Any]
113
+
114
+ model_config = {
115
+ "json_schema_extra": {
116
+ "example": RESPONSE_EXAMPLES["text_analysis"]
117
+ }
118
+ }
119
+
120
+
121
+ class BatchClassificationResponse(BaseModel):
122
+ results: List[ClassificationResponse]
123
+ total_texts: int
124
+ model_used: str
125
+
126
+ model_config = {
127
+ "protected_namespaces": (),
128
+ "json_schema_extra": {
129
+ "example": RESPONSE_EXAMPLES["batch_classification"]
130
+ }
131
+ }
132
+
133
+
134
+ class SentenceAnalysisResponse(BaseModel):
135
+ sentences: List[Dict[str, Any]]
136
+ total_sentences: int
137
+ score_statistics: Dict[str, float]
138
+
139
+ model_config = {
140
+ "json_schema_extra": {
141
+ "example": RESPONSE_EXAMPLES["sentence_analysis"]
142
+ }
143
+ }
144
+
145
+
146
+ class CompleteAnalysisResponse(BaseModel):
147
+ original_text: str
148
+ text_analysis: Dict[str, Any]
149
+ classification: ClassificationResponse
150
+ summarization: SummarizationResponse
151
+
152
+ model_config = {
153
+ "json_schema_extra": {
154
+ "example": RESPONSE_EXAMPLES["complete_analysis"]
155
+ }
156
+ }
157
+
158
+
159
+ class PreprocessingResponse(BaseModel):
160
+ task_type: str
161
+ preprocessing_steps: Dict[str, Any]
162
+
163
+ model_config = {
164
+ "json_schema_extra": {
165
+ "example": RESPONSE_EXAMPLES["preprocessing"]
166
+ }
167
+ }
168
+
169
+
170
+ class ModelInfoResponse(BaseModel):
171
+ classifier: Dict[str, Any]
172
+ summarizer: Dict[str, Any]
173
+
174
+ model_config = {
175
+ "json_schema_extra": {
176
+ "example": RESPONSE_EXAMPLES["model_info"]
177
+ }
178
+ }
179
 
180
 
181
  @app.get("/")
 
201
  }
202
 
203
 
204
+ @app.post("/classify", response_model=ClassificationResponse)
205
+ def classify_text(data: TextInput) -> ClassificationResponse:
206
  """Classify Arabic text with probability distribution and metadata."""
207
  try:
208
  result = classifier.predict(data.text)
 
211
  raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
212
 
213
 
214
+ @app.post("/classify/batch", response_model=BatchClassificationResponse)
215
+ def classify_texts(data: BatchTextInput) -> BatchClassificationResponse:
216
  """Classify multiple Arabic texts in batch."""
217
  try:
218
  results = classifier.predict_batch(data.texts)
 
225
  raise HTTPException(status_code=500, detail=f"Batch classification failed: {str(e)}")
226
 
227
 
228
+ @app.post("/summarize", response_model=SummarizationResponse)
229
+ def summarize_text(data: TextInputWithSentences) -> SummarizationResponse:
230
  """Summarize Arabic text with sentence analysis."""
231
  try:
232
  result = summarizer.summarize(data.text, data.num_sentences)
 
235
  raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
236
 
237
 
238
+ @app.post("/sentence-analysis", response_model=SentenceAnalysisResponse)
239
+ def analyze_sentences(data: TextInput) -> SentenceAnalysisResponse:
240
  """Analyze all sentences with scores and rankings."""
241
  try:
242
  result = summarizer.get_sentence_analysis(data.text)
 
245
  raise HTTPException(status_code=500, detail=f"Sentence analysis failed: {str(e)}")
246
 
247
 
248
+ @app.post("/analyze", response_model=CompleteAnalysisResponse)
249
+ def analyze_text_complete(data: TextInputWithSentences) -> CompleteAnalysisResponse:
250
  """Complete analysis: classification, summarization, and text statistics."""
251
  try:
252
  classification_result = classifier.predict(data.text)
 
263
  raise HTTPException(status_code=500, detail=f"Complete analysis failed: {str(e)}")
264
 
265
 
266
+ @app.post("/preprocess", response_model=PreprocessingResponse)
267
+ def preprocess_text(data: PreprocessingInput) -> PreprocessingResponse:
268
  """Preprocess text with step-by-step breakdown."""
269
  try:
270
+ steps = preprocessor.get_preprocessing_steps(data.text, data.task_type.value)
271
  return {
272
+ "task_type": data.task_type.value,
273
  "preprocessing_steps": steps
274
  }
275
  except Exception as e:
276
  raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
277
 
278
 
279
+ @app.post("/text-analysis", response_model=TextAnalysisResponse)
280
+ def analyze_text_characteristics(data: TextInput) -> TextAnalysisResponse:
281
  """Analyze text characteristics and statistics."""
282
  try:
283
  analysis = preprocessor.analyze_text(data.text)
 
289
  raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
290
 
291
 
292
+ @app.get("/model-info", response_model=ModelInfoResponse)
293
+ def get_model_info() -> ModelInfoResponse:
294
  """Get information about loaded models."""
295
  try:
296
  classifier_info = classifier.get_model_info()
classifier.py CHANGED
@@ -6,52 +6,61 @@ from preprocessor import preprocess_for_classification
6
 
7
  class ArabicClassifier:
8
  """Arabic text classifier with probability distributions and metadata."""
9
-
10
- def __init__(self,
11
- classifier_path: str = "svm_classifier.joblib",
12
- vectorizer_path: str = "tfidf_vectorizer_classifier.joblib"):
 
 
13
  self.model = joblib.load(classifier_path)
14
  self.vectorizer = joblib.load(vectorizer_path)
15
  self.model_name = classifier_path.split("/")[-1].replace(".joblib", "")
16
-
17
  def predict(self, text: str) -> Dict[str, Any]:
18
  """Predict class with full probability distribution and metadata."""
19
  cleaned_text = preprocess_for_classification(text)
20
-
21
  if self.vectorizer:
22
  text_vector = self.vectorizer.transform([cleaned_text])
23
  else:
24
  text_vector = [cleaned_text]
25
-
26
  prediction = self.model.predict(text_vector)[0]
27
-
28
- classes = getattr(self.model, 'classes_', None)
29
  if classes is not None:
30
  prediction_index = int(np.where(classes == prediction)[0][0])
31
  else:
32
- prediction_index = int(prediction) if isinstance(prediction, (int, np.integer)) else 0
33
-
34
- if hasattr(self.model, 'predict_proba'):
 
 
35
  probabilities = self.model.predict_proba(text_vector)[0]
36
  confidence = float(probabilities[prediction_index])
37
  else:
38
- if hasattr(self.model, 'decision_function'):
39
  decision_scores = self.model.decision_function(text_vector)[0]
40
  if len(decision_scores.shape) == 0:
41
- probabilities = np.array([1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))])
 
 
 
 
 
42
  else:
43
  exp_scores = np.exp(decision_scores - np.max(decision_scores))
44
  probabilities = exp_scores / np.sum(exp_scores)
45
  confidence = float(probabilities[prediction_index])
46
  else:
47
- classes = getattr(self.model, 'classes_', None)
48
  num_classes = len(classes) if classes is not None else 2
49
  probabilities = np.zeros(num_classes)
50
  probabilities[prediction_index] = 1.0
51
  confidence = 1.0
52
-
53
- classes = getattr(self.model, 'classes_', None)
54
-
55
  prob_distribution = {}
56
  if classes is not None:
57
  for i, class_label in enumerate(classes):
@@ -59,36 +68,36 @@ class ArabicClassifier:
59
  else:
60
  for i, prob in enumerate(probabilities):
61
  prob_distribution[f"class_{i}"] = float(prob)
62
-
63
  return {
64
  "prediction": str(prediction),
65
- "prediction_label": str(prediction),
66
  "prediction_index": int(prediction_index),
67
  "confidence": confidence,
68
  "probability_distribution": prob_distribution,
69
- "all_probabilities": probabilities.tolist(),
70
  "cleaned_text": cleaned_text,
71
  "model_used": self.model_name,
72
  "prediction_metadata": {
73
  "max_probability": float(np.max(probabilities)),
74
  "min_probability": float(np.min(probabilities)),
75
- "entropy": float(-np.sum(probabilities * np.log(probabilities + 1e-10))),
76
- "num_classes": len(probabilities)
77
- }
 
 
78
  }
79
-
80
  def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
81
  """Predict classes for multiple texts."""
82
  cleaned_texts = [preprocess_for_classification(text) for text in texts]
83
-
84
  if self.vectorizer:
85
  text_vectors = self.vectorizer.transform(cleaned_texts)
86
  else:
87
  text_vectors = cleaned_texts
88
-
89
  predictions = self.model.predict(text_vectors)
90
- classes = getattr(self.model, 'classes_', None)
91
-
92
  prediction_indices = []
93
  for pred in predictions:
94
  if classes is not None:
@@ -96,29 +105,38 @@ class ArabicClassifier:
96
  else:
97
  pred_index = int(pred) if isinstance(pred, (int, np.integer)) else 0
98
  prediction_indices.append(pred_index)
99
-
100
- if hasattr(self.model, 'predict_proba'):
101
  probabilities = self.model.predict_proba(text_vectors)
102
  else:
103
- if hasattr(self.model, 'decision_function'):
104
  decision_scores = self.model.decision_function(text_vectors)
105
  if len(decision_scores.shape) == 1:
106
- probabilities = np.column_stack([1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))])
 
 
 
 
 
107
  else:
108
- exp_scores = np.exp(decision_scores - np.max(decision_scores, axis=1, keepdims=True))
109
- probabilities = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
 
 
 
 
110
  else:
111
- classes = getattr(self.model, 'classes_', None)
112
  num_classes = len(classes) if classes is not None else 2
113
  probabilities = np.zeros((len(predictions), num_classes))
114
  for i, pred_idx in enumerate(prediction_indices):
115
  probabilities[i, pred_idx] = 1.0
116
-
117
  results = []
118
-
119
  for i, (pred, pred_idx) in enumerate(zip(predictions, prediction_indices)):
120
  confidence = float(probabilities[i][pred_idx])
121
-
122
  prob_distribution = {}
123
  if classes is not None:
124
  for j, class_label in enumerate(classes):
@@ -126,35 +144,39 @@ class ArabicClassifier:
126
  else:
127
  for j, prob in enumerate(probabilities[i]):
128
  prob_distribution[f"class_{j}"] = float(prob)
129
-
130
- results.append({
131
- "prediction": str(pred),
132
- "prediction_label": str(pred),
133
- "prediction_index": int(pred_idx),
134
- "confidence": confidence,
135
- "probability_distribution": prob_distribution,
136
- "all_probabilities": probabilities[i].tolist(),
137
- "cleaned_text": cleaned_texts[i],
138
- "model_used": self.model_name,
139
- "prediction_metadata": {
140
- "max_probability": float(np.max(probabilities[i])),
141
- "min_probability": float(np.min(probabilities[i])),
142
- "entropy": float(-np.sum(probabilities[i] * np.log(probabilities[i] + 1e-10))),
143
- "num_classes": len(probabilities[i])
 
 
144
  }
145
- })
146
-
147
  return results
148
-
149
  def get_model_info(self) -> Dict[str, Any]:
150
  """Get model information and capabilities."""
151
- classes = getattr(self.model, 'classes_', None)
152
  return {
153
  "model_name": self.model_name,
154
  "model_type": type(self.model).__name__,
155
  "num_classes": len(classes) if classes is not None else "unknown",
156
  "classes": classes.tolist() if classes is not None else None,
157
- "has_predict_proba": hasattr(self.model, 'predict_proba'),
158
  "has_vectorizer": self.vectorizer is not None,
159
- "vectorizer_type": type(self.vectorizer).__name__ if self.vectorizer else None
 
 
160
  }
 
6
 
7
  class ArabicClassifier:
8
  """Arabic text classifier with probability distributions and metadata."""
9
+
10
+ def __init__(
11
+ self,
12
+ classifier_path: str = "svm_classifier.joblib",
13
+ vectorizer_path: str = "tfidf_vectorizer_classifier.joblib",
14
+ ):
15
  self.model = joblib.load(classifier_path)
16
  self.vectorizer = joblib.load(vectorizer_path)
17
  self.model_name = classifier_path.split("/")[-1].replace(".joblib", "")
18
+
19
  def predict(self, text: str) -> Dict[str, Any]:
20
  """Predict class with full probability distribution and metadata."""
21
  cleaned_text = preprocess_for_classification(text)
22
+
23
  if self.vectorizer:
24
  text_vector = self.vectorizer.transform([cleaned_text])
25
  else:
26
  text_vector = [cleaned_text]
27
+
28
  prediction = self.model.predict(text_vector)[0]
29
+
30
+ classes = getattr(self.model, "classes_", None)
31
  if classes is not None:
32
  prediction_index = int(np.where(classes == prediction)[0][0])
33
  else:
34
+ prediction_index = (
35
+ int(prediction) if isinstance(prediction, (int, np.integer)) else 0
36
+ )
37
+
38
+ if hasattr(self.model, "predict_proba"):
39
  probabilities = self.model.predict_proba(text_vector)[0]
40
  confidence = float(probabilities[prediction_index])
41
  else:
42
+ if hasattr(self.model, "decision_function"):
43
  decision_scores = self.model.decision_function(text_vector)[0]
44
  if len(decision_scores.shape) == 0:
45
+ probabilities = np.array(
46
+ [
47
+ 1 / (1 + np.exp(decision_scores)),
48
+ 1 / (1 + np.exp(-decision_scores)),
49
+ ]
50
+ )
51
  else:
52
  exp_scores = np.exp(decision_scores - np.max(decision_scores))
53
  probabilities = exp_scores / np.sum(exp_scores)
54
  confidence = float(probabilities[prediction_index])
55
  else:
56
+ classes = getattr(self.model, "classes_", None)
57
  num_classes = len(classes) if classes is not None else 2
58
  probabilities = np.zeros(num_classes)
59
  probabilities[prediction_index] = 1.0
60
  confidence = 1.0
61
+
62
+ classes = getattr(self.model, "classes_", None)
63
+
64
  prob_distribution = {}
65
  if classes is not None:
66
  for i, class_label in enumerate(classes):
 
68
  else:
69
  for i, prob in enumerate(probabilities):
70
  prob_distribution[f"class_{i}"] = float(prob)
71
+
72
  return {
73
  "prediction": str(prediction),
 
74
  "prediction_index": int(prediction_index),
75
  "confidence": confidence,
76
  "probability_distribution": prob_distribution,
 
77
  "cleaned_text": cleaned_text,
78
  "model_used": self.model_name,
79
  "prediction_metadata": {
80
  "max_probability": float(np.max(probabilities)),
81
  "min_probability": float(np.min(probabilities)),
82
+ "entropy": float(
83
+ -np.sum(probabilities * np.log(probabilities + 1e-10))
84
+ ),
85
+ "num_classes": len(probabilities),
86
+ },
87
  }
88
+
89
  def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
90
  """Predict classes for multiple texts."""
91
  cleaned_texts = [preprocess_for_classification(text) for text in texts]
92
+
93
  if self.vectorizer:
94
  text_vectors = self.vectorizer.transform(cleaned_texts)
95
  else:
96
  text_vectors = cleaned_texts
97
+
98
  predictions = self.model.predict(text_vectors)
99
+ classes = getattr(self.model, "classes_", None)
100
+
101
  prediction_indices = []
102
  for pred in predictions:
103
  if classes is not None:
 
105
  else:
106
  pred_index = int(pred) if isinstance(pred, (int, np.integer)) else 0
107
  prediction_indices.append(pred_index)
108
+
109
+ if hasattr(self.model, "predict_proba"):
110
  probabilities = self.model.predict_proba(text_vectors)
111
  else:
112
+ if hasattr(self.model, "decision_function"):
113
  decision_scores = self.model.decision_function(text_vectors)
114
  if len(decision_scores.shape) == 1:
115
+ probabilities = np.column_stack(
116
+ [
117
+ 1 / (1 + np.exp(decision_scores)),
118
+ 1 / (1 + np.exp(-decision_scores)),
119
+ ]
120
+ )
121
  else:
122
+ exp_scores = np.exp(
123
+ decision_scores - np.max(decision_scores, axis=1, keepdims=True)
124
+ )
125
+ probabilities = exp_scores / np.sum(
126
+ exp_scores, axis=1, keepdims=True
127
+ )
128
  else:
129
+ classes = getattr(self.model, "classes_", None)
130
  num_classes = len(classes) if classes is not None else 2
131
  probabilities = np.zeros((len(predictions), num_classes))
132
  for i, pred_idx in enumerate(prediction_indices):
133
  probabilities[i, pred_idx] = 1.0
134
+
135
  results = []
136
+
137
  for i, (pred, pred_idx) in enumerate(zip(predictions, prediction_indices)):
138
  confidence = float(probabilities[i][pred_idx])
139
+
140
  prob_distribution = {}
141
  if classes is not None:
142
  for j, class_label in enumerate(classes):
 
144
  else:
145
  for j, prob in enumerate(probabilities[i]):
146
  prob_distribution[f"class_{j}"] = float(prob)
147
+
148
+ results.append(
149
+ {
150
+ "prediction": str(pred),
151
+ "prediction_index": int(pred_idx),
152
+ "confidence": confidence,
153
+ "probability_distribution": prob_distribution,
154
+ "cleaned_text": cleaned_texts[i],
155
+ "model_used": self.model_name,
156
+ "prediction_metadata": {
157
+ "max_probability": float(np.max(probabilities[i])),
158
+ "min_probability": float(np.min(probabilities[i])),
159
+ "entropy": float(
160
+ -np.sum(probabilities[i] * np.log(probabilities[i] + 1e-10))
161
+ ),
162
+ "num_classes": len(probabilities[i]),
163
+ },
164
  }
165
+ )
166
+
167
  return results
168
+
169
  def get_model_info(self) -> Dict[str, Any]:
170
  """Get model information and capabilities."""
171
+ classes = getattr(self.model, "classes_", None)
172
  return {
173
  "model_name": self.model_name,
174
  "model_type": type(self.model).__name__,
175
  "num_classes": len(classes) if classes is not None else "unknown",
176
  "classes": classes.tolist() if classes is not None else None,
177
+ "has_predict_proba": hasattr(self.model, "predict_proba"),
178
  "has_vectorizer": self.vectorizer is not None,
179
+ "vectorizer_type": type(self.vectorizer).__name__
180
+ if self.vectorizer
181
+ else None,
182
  }
examples.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """API request and response examples for documentation."""
2
+
3
+ EXAMPLE_TEXT = "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم. ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات. لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى."
4
+
5
+ REQUEST_EXAMPLES = {
6
+ "text_input": {"text": EXAMPLE_TEXT},
7
+ "text_input_with_sentences": {"text": EXAMPLE_TEXT, "num_sentences": 2},
8
+ "batch_text_input": {
9
+ "texts": [
10
+ EXAMPLE_TEXT,
11
+ "هذا نص تجريبي آخر للتصنيف باللغة العربية.",
12
+ "المطاعم في المدينة تقدم أطباق شهية ومتنوعة.",
13
+ ]
14
+ },
15
+ "preprocessing_input": {"text": EXAMPLE_TEXT, "task_type": "classification"},
16
+ }
17
+
18
+ RESPONSE_EXAMPLES = {
19
+ "classification": {
20
+ "prediction": "culture",
21
+ "prediction_index": 0,
22
+ "confidence": 0.902,
23
+ "probability_distribution": {
24
+ "culture": 0.902,
25
+ "economy": 0.001,
26
+ "international": 0.0,
27
+ "local": 0.061,
28
+ "religion": 0.0,
29
+ "sports": 0.036,
30
+ },
31
+ "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
32
+ "model_used": "svm_classifier",
33
+ "prediction_metadata": {
34
+ "max_probability": 0.902,
35
+ "min_probability": 0.0,
36
+ "entropy": 0.393,
37
+ "num_classes": 6,
38
+ },
39
+ },
40
+ "summarization": {
41
+ "summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
42
+ "original_sentence_count": 4,
43
+ "summary_sentence_count": 2,
44
+ "sentences": [
45
+ "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
46
+ "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
47
+ "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
48
+ "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
49
+ ],
50
+ "selected_indices": [1, 2],
51
+ "sentence_scores": [2.968, 3.224, 3.234, 2.642],
52
+ "top_sentence_scores": [3.224, 3.234],
53
+ },
54
+ "text_analysis": {
55
+ "text": EXAMPLE_TEXT,
56
+ "analysis": {
57
+ "character_count": 282,
58
+ "word_count": 46,
59
+ "sentence_count": 4,
60
+ "arabic_character_count": 252,
61
+ "arabic_character_ratio": 0.8936,
62
+ "average_word_length": 5.48,
63
+ "average_sentence_length": 11.5,
64
+ "has_diacritics": False,
65
+ "punctuation_count": 3,
66
+ },
67
+ },
68
+ "batch_classification": {
69
+ "results": [
70
+ {
71
+ "prediction": "culture",
72
+ "prediction_index": 0,
73
+ "confidence": 0.902,
74
+ "probability_distribution": {
75
+ "culture": 0.902,
76
+ "economy": 0.001,
77
+ "international": 0.0,
78
+ "local": 0.061,
79
+ "religion": 0.0,
80
+ "sports": 0.036,
81
+ },
82
+ "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
83
+ "model_used": "svm_classifier",
84
+ "prediction_metadata": {
85
+ "max_probability": 0.902,
86
+ "min_probability": 0.0,
87
+ "entropy": 0.393,
88
+ "num_classes": 6,
89
+ },
90
+ }
91
+ ],
92
+ "total_texts": 3,
93
+ "model_used": "svm_classifier",
94
+ },
95
+ "sentence_analysis": {
96
+ "sentences": [
97
+ {
98
+ "index": 0,
99
+ "sentence": "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
100
+ "score": 2.968,
101
+ "rank": 3,
102
+ },
103
+ {
104
+ "index": 1,
105
+ "sentence": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
106
+ "score": 3.224,
107
+ "rank": 2,
108
+ },
109
+ {
110
+ "index": 2,
111
+ "sentence": "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
112
+ "score": 3.234,
113
+ "rank": 1,
114
+ },
115
+ {
116
+ "index": 3,
117
+ "sentence": "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
118
+ "score": 2.642,
119
+ "rank": 4,
120
+ },
121
+ ],
122
+ "total_sentences": 4,
123
+ "score_statistics": {"mean": 3.017, "std": 0.254, "min": 2.642, "max": 3.234},
124
+ },
125
+ "complete_analysis": {
126
+ "original_text": EXAMPLE_TEXT,
127
+ "text_analysis": {
128
+ "character_count": 282,
129
+ "word_count": 46,
130
+ "sentence_count": 4,
131
+ "arabic_character_count": 252,
132
+ "arabic_character_ratio": 0.8936,
133
+ "average_word_length": 5.48,
134
+ "average_sentence_length": 11.5,
135
+ "has_diacritics": False,
136
+ "punctuation_count": 3,
137
+ },
138
+ "classification": {
139
+ "prediction": "culture",
140
+ "prediction_index": 0,
141
+ "confidence": 0.902,
142
+ "probability_distribution": {
143
+ "culture": 0.902,
144
+ "economy": 0.001,
145
+ "international": 0.0,
146
+ "local": 0.061,
147
+ "religion": 0.0,
148
+ "sports": 0.036,
149
+ },
150
+ "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
151
+ "model_used": "svm_classifier",
152
+ "prediction_metadata": {
153
+ "max_probability": 0.902,
154
+ "min_probability": 0.0,
155
+ "entropy": 0.393,
156
+ "num_classes": 6,
157
+ },
158
+ },
159
+ "summarization": {
160
+ "summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
161
+ "original_sentence_count": 4,
162
+ "summary_sentence_count": 2,
163
+ "sentences": [
164
+ "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
165
+ "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
166
+ "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
167
+ "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
168
+ ],
169
+ "selected_indices": [1, 2],
170
+ "sentence_scores": [2.968, 3.224, 3.234, 2.642],
171
+ "top_sentence_scores": [3.224, 3.234],
172
+ },
173
+ },
174
+ "preprocessing": {
175
+ "task_type": "classification",
176
+ "preprocessing_steps": {
177
+ "original_text": EXAMPLE_TEXT,
178
+ "step_1_remove_diacritics": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
179
+ "step_2_remove_punctuation": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
180
+ "step_3_normalize_text": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
181
+ "step_4_remove_stopwords": "سعر فاكهة خضراوات موسم انباتها اقل غيره مواسم",
182
+ "step_5_stem_words": "سعر فاكه خضرو موسم انبات اقل غير موسم",
183
+ "final_result": "سعر فاكه خضرو موسم انبات اقل غير موسم",
184
+ "preprocessing_summary": {
185
+ "original_length": 282,
186
+ "final_length": 47,
187
+ "reduction_percentage": 83.3,
188
+ "words_removed": 39,
189
+ "words_remaining": 7,
190
+ },
191
+ },
192
+ },
193
+ "model_info": {
194
+ "classifier": {
195
+ "model_name": "svm_classifier",
196
+ "vectorizer_loaded": True,
197
+ "model_loaded": True,
198
+ "classes": [
199
+ "culture",
200
+ "economy",
201
+ "international",
202
+ "local",
203
+ "religion",
204
+ "sports",
205
+ ],
206
+ "num_classes": 6,
207
+ "model_type": "SVM with TF-IDF vectorization",
208
+ },
209
+ "summarizer": {
210
+ "vectorizer_loaded": True,
211
+ "model_type": "TF-IDF based summarization",
212
+ },
213
+ },
214
+ }