moabos committed · Commit 29dedef · 1 Parent: 354c6a0

chore: add req and res samples for API reqs, remove redundant lines

Files changed:
- app.py        (+162 -19)
- classifier.py (+82 -60)
- examples.py   (+214 -0)
app.py
CHANGED
@@ -1,10 +1,18 @@
 from typing import Optional, List, Dict, Any
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
+from enum import Enum
 
 from classifier import ArabicClassifier
 from summarizer import ArabicSummarizer
 from preprocessor import ArabicPreprocessor
+from examples import REQUEST_EXAMPLES, RESPONSE_EXAMPLES
+
+
+class TaskType(str, Enum):
+    CLASSIFICATION = "classification"
+    SUMMARIZATION = "summarization"
+
 
 app = FastAPI(
     title="Arabic Text Analysis API",
@@ -19,20 +27,155 @@ preprocessor = ArabicPreprocessor()
 
 class TextInput(BaseModel):
     text: str
+
+    model_config = {
+        "json_schema_extra": {
+            "example": REQUEST_EXAMPLES["text_input"]
+        }
+    }
 
 
 class TextInputWithSentences(BaseModel):
     text: str
     num_sentences: Optional[int] = 3
+
+    model_config = {
+        "json_schema_extra": {
+            "example": REQUEST_EXAMPLES["text_input_with_sentences"]
+        }
+    }
 
 
 class BatchTextInput(BaseModel):
     texts: List[str]
+
+    model_config = {
+        "json_schema_extra": {
+            "example": REQUEST_EXAMPLES["batch_text_input"]
+        }
+    }
 
 
 class PreprocessingInput(BaseModel):
     text: str
-    task_type:
+    task_type: TaskType = TaskType.CLASSIFICATION
+
+    model_config = {
+        "json_schema_extra": {
+            "example": REQUEST_EXAMPLES["preprocessing_input"]
+        }
+    }
+
+
+class ClassificationResponse(BaseModel):
+    prediction: str
+    prediction_index: int
+    confidence: float
+    probability_distribution: Dict[str, float]
+    cleaned_text: str
+    model_used: str
+    prediction_metadata: Dict[str, Any]
+
+    model_config = {
+        "protected_namespaces": (),
+        "json_schema_extra": {
+            "example": RESPONSE_EXAMPLES["classification"],
+            "schema_extra": {
+                "properties": {
+                    "prediction_index": {
+                        "description": "Numerical index of the predicted class (0=culture, 1=economy, 2=international, 3=local, 4=religion, 5=sports)"
+                    }
+                }
+            }
+        }
+    }
+
+
+class SummarizationResponse(BaseModel):
+    summary: str
+    original_sentence_count: int
+    summary_sentence_count: int
+    sentences: List[str]
+    selected_indices: List[int]
+    sentence_scores: Optional[List[float]]
+    top_sentence_scores: Optional[List[float]]
+
+    model_config = {
+        "json_schema_extra": {
+            "example": RESPONSE_EXAMPLES["summarization"]
+        }
+    }
+
+
+class TextAnalysisResponse(BaseModel):
+    text: str
+    analysis: Dict[str, Any]
+
+    model_config = {
+        "json_schema_extra": {
+            "example": RESPONSE_EXAMPLES["text_analysis"]
+        }
+    }
+
+
+class BatchClassificationResponse(BaseModel):
+    results: List[ClassificationResponse]
+    total_texts: int
+    model_used: str
+
+    model_config = {
+        "protected_namespaces": (),
+        "json_schema_extra": {
+            "example": RESPONSE_EXAMPLES["batch_classification"]
+        }
+    }
+
+
+class SentenceAnalysisResponse(BaseModel):
+    sentences: List[Dict[str, Any]]
+    total_sentences: int
+    score_statistics: Dict[str, float]
+
+    model_config = {
+        "json_schema_extra": {
+            "example": RESPONSE_EXAMPLES["sentence_analysis"]
+        }
+    }
+
+
+class CompleteAnalysisResponse(BaseModel):
+    original_text: str
+    text_analysis: Dict[str, Any]
+    classification: ClassificationResponse
+    summarization: SummarizationResponse
+
+    model_config = {
+        "json_schema_extra": {
+            "example": RESPONSE_EXAMPLES["complete_analysis"]
+        }
+    }
+
+
+class PreprocessingResponse(BaseModel):
+    task_type: str
+    preprocessing_steps: Dict[str, Any]
+
+    model_config = {
+        "json_schema_extra": {
+            "example": RESPONSE_EXAMPLES["preprocessing"]
+        }
+    }
+
+
+class ModelInfoResponse(BaseModel):
+    classifier: Dict[str, Any]
+    summarizer: Dict[str, Any]
+
+    model_config = {
+        "json_schema_extra": {
+            "example": RESPONSE_EXAMPLES["model_info"]
+        }
+    }
 
 
 @app.get("/")
@@ -58,8 +201,8 @@ def read_root() -> Dict[str, Any]:
     }
 
 
-@app.post("/classify")
-def classify_text(data: TextInput) -> Dict[str, Any]:
+@app.post("/classify", response_model=ClassificationResponse)
+def classify_text(data: TextInput) -> ClassificationResponse:
     """Classify Arabic text with probability distribution and metadata."""
     try:
         result = classifier.predict(data.text)
@@ -68,8 +211,8 @@ def classify_text(data: TextInput) -> Dict[str, Any]:
         raise HTTPException(status_code=500, detail=f"Classification failed: {str(e)}")
 
 
-@app.post("/classify/batch")
-def classify_texts(data: BatchTextInput) -> Dict[str, Any]:
+@app.post("/classify/batch", response_model=BatchClassificationResponse)
+def classify_texts(data: BatchTextInput) -> BatchClassificationResponse:
     """Classify multiple Arabic texts in batch."""
     try:
         results = classifier.predict_batch(data.texts)
@@ -82,8 +225,8 @@ def classify_texts(data: BatchTextInput) -> Dict[str, Any]:
         raise HTTPException(status_code=500, detail=f"Batch classification failed: {str(e)}")
 
 
-@app.post("/summarize")
-def summarize_text(data: TextInputWithSentences) -> Dict[str, Any]:
+@app.post("/summarize", response_model=SummarizationResponse)
+def summarize_text(data: TextInputWithSentences) -> SummarizationResponse:
     """Summarize Arabic text with sentence analysis."""
     try:
         result = summarizer.summarize(data.text, data.num_sentences)
@@ -92,8 +235,8 @@ def summarize_text(data: TextInputWithSentences) -> Dict[str, Any]:
         raise HTTPException(status_code=500, detail=f"Summarization failed: {str(e)}")
 
 
-@app.post("/sentence-analysis")
-def analyze_sentences(data: TextInput) -> Dict[str, Any]:
+@app.post("/sentence-analysis", response_model=SentenceAnalysisResponse)
+def analyze_sentences(data: TextInput) -> SentenceAnalysisResponse:
    """Analyze all sentences with scores and rankings."""
     try:
         result = summarizer.get_sentence_analysis(data.text)
@@ -102,8 +245,8 @@ def analyze_sentences(data: TextInput) -> Dict[str, Any]:
         raise HTTPException(status_code=500, detail=f"Sentence analysis failed: {str(e)}")
 
 
-@app.post("/analyze")
-def analyze_text_complete(data: TextInputWithSentences) -> Dict[str, Any]:
+@app.post("/analyze", response_model=CompleteAnalysisResponse)
+def analyze_text_complete(data: TextInputWithSentences) -> CompleteAnalysisResponse:
     """Complete analysis: classification, summarization, and text statistics."""
     try:
         classification_result = classifier.predict(data.text)
@@ -120,21 +263,21 @@ def analyze_text_complete(data: TextInputWithSentences) -> Dict[str, Any]:
         raise HTTPException(status_code=500, detail=f"Complete analysis failed: {str(e)}")
 
 
-@app.post("/preprocess")
-def preprocess_text(data: PreprocessingInput) ->
+@app.post("/preprocess", response_model=PreprocessingResponse)
+def preprocess_text(data: PreprocessingInput) -> PreprocessingResponse:
     """Preprocess text with step-by-step breakdown."""
     try:
-        steps = preprocessor.get_preprocessing_steps(data.text, data.task_type)
+        steps = preprocessor.get_preprocessing_steps(data.text, data.task_type.value)
         return {
-            "task_type": data.task_type,
+            "task_type": data.task_type.value,
             "preprocessing_steps": steps
         }
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"Preprocessing failed: {str(e)}")
 
 
-@app.post("/text-analysis")
-def analyze_text_characteristics(data: TextInput) -> Dict[str, Any]:
+@app.post("/text-analysis", response_model=TextAnalysisResponse)
+def analyze_text_characteristics(data: TextInput) -> TextAnalysisResponse:
     """Analyze text characteristics and statistics."""
     try:
         analysis = preprocessor.analyze_text(data.text)
@@ -146,8 +289,8 @@ def analyze_text_characteristics(data: TextInput) -> Dict[str, Any]:
         raise HTTPException(status_code=500, detail=f"Text analysis failed: {str(e)}")
 
 
-@app.get("/model-info")
-def get_model_info() ->
+@app.get("/model-info", response_model=ModelInfoResponse)
+def get_model_info() -> ModelInfoResponse:
     """Get information about loaded models."""
     try:
         classifier_info = classifier.get_model_info()
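Note on the change above: a quick way to check that the wired-in examples actually reach the generated OpenAPI document is to instantiate the app and inspect app.openapi(). The snippet below is a hedged sketch, not part of the commit; it assumes app.py imports cleanly (i.e. the .joblib artifacts are on disk) and that Pydantic v2 merges json_schema_extra into the component schema, which is how the "example" key is expected to surface.

# Hypothetical verification sketch (assumes httpx is installed for TestClient).
from fastapi.testclient import TestClient

from app import app
from examples import REQUEST_EXAMPLES

client = TestClient(app)

# The request example attached via model_config["json_schema_extra"] should
# appear under components/schemas/TextInput in the OpenAPI document.
openapi = app.openapi()
assert openapi["components"]["schemas"]["TextInput"]["example"] == REQUEST_EXAMPLES["text_input"]

# response_model=ClassificationResponse now validates and filters the payload,
# so keys dropped in this commit (e.g. "prediction_label", "all_probabilities")
# can no longer leak to clients even if the classifier returned them.
resp = client.post("/classify", json=REQUEST_EXAMPLES["text_input"])
print(resp.status_code, resp.json()["prediction"])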
classifier.py
CHANGED
@@ -6,52 +6,61 @@ from preprocessor import preprocess_for_classification
 
 class ArabicClassifier:
     """Arabic text classifier with probability distributions and metadata."""
-
-    def __init__(
-
-
+
+    def __init__(
+        self,
+        classifier_path: str = "svm_classifier.joblib",
+        vectorizer_path: str = "tfidf_vectorizer_classifier.joblib",
+    ):
         self.model = joblib.load(classifier_path)
         self.vectorizer = joblib.load(vectorizer_path)
         self.model_name = classifier_path.split("/")[-1].replace(".joblib", "")
-
+
     def predict(self, text: str) -> Dict[str, Any]:
         """Predict class with full probability distribution and metadata."""
         cleaned_text = preprocess_for_classification(text)
-
+
         if self.vectorizer:
             text_vector = self.vectorizer.transform([cleaned_text])
         else:
             text_vector = [cleaned_text]
-
+
         prediction = self.model.predict(text_vector)[0]
-
-        classes = getattr(self.model,
+
+        classes = getattr(self.model, "classes_", None)
         if classes is not None:
             prediction_index = int(np.where(classes == prediction)[0][0])
         else:
-            prediction_index =
-
-
+            prediction_index = (
+                int(prediction) if isinstance(prediction, (int, np.integer)) else 0
+            )
+
+        if hasattr(self.model, "predict_proba"):
            probabilities = self.model.predict_proba(text_vector)[0]
             confidence = float(probabilities[prediction_index])
         else:
-            if hasattr(self.model,
+            if hasattr(self.model, "decision_function"):
                 decision_scores = self.model.decision_function(text_vector)[0]
                 if len(decision_scores.shape) == 0:
-                    probabilities = np.array(
+                    probabilities = np.array(
+                        [
+                            1 / (1 + np.exp(decision_scores)),
+                            1 / (1 + np.exp(-decision_scores)),
+                        ]
+                    )
                 else:
                     exp_scores = np.exp(decision_scores - np.max(decision_scores))
                     probabilities = exp_scores / np.sum(exp_scores)
                 confidence = float(probabilities[prediction_index])
             else:
-                classes = getattr(self.model,
+                classes = getattr(self.model, "classes_", None)
                 num_classes = len(classes) if classes is not None else 2
                 probabilities = np.zeros(num_classes)
                 probabilities[prediction_index] = 1.0
                 confidence = 1.0
-
-        classes = getattr(self.model,
-
+
+        classes = getattr(self.model, "classes_", None)
+
         prob_distribution = {}
         if classes is not None:
             for i, class_label in enumerate(classes):
@@ -59,36 +68,36 @@ class ArabicClassifier:
         else:
             for i, prob in enumerate(probabilities):
                 prob_distribution[f"class_{i}"] = float(prob)
-
+
         return {
             "prediction": str(prediction),
-            "prediction_label": str(prediction),
             "prediction_index": int(prediction_index),
             "confidence": confidence,
             "probability_distribution": prob_distribution,
-            "all_probabilities": probabilities.tolist(),
             "cleaned_text": cleaned_text,
             "model_used": self.model_name,
             "prediction_metadata": {
                 "max_probability": float(np.max(probabilities)),
                 "min_probability": float(np.min(probabilities)),
-                "entropy": float(
-
-
+                "entropy": float(
+                    -np.sum(probabilities * np.log(probabilities + 1e-10))
+                ),
+                "num_classes": len(probabilities),
+            },
         }
-
+
     def predict_batch(self, texts: List[str]) -> List[Dict[str, Any]]:
         """Predict classes for multiple texts."""
         cleaned_texts = [preprocess_for_classification(text) for text in texts]
-
+
         if self.vectorizer:
             text_vectors = self.vectorizer.transform(cleaned_texts)
         else:
             text_vectors = cleaned_texts
-
+
         predictions = self.model.predict(text_vectors)
-        classes = getattr(self.model,
-
+        classes = getattr(self.model, "classes_", None)
+
         prediction_indices = []
         for pred in predictions:
             if classes is not None:
@@ -96,29 +105,38 @@ class ArabicClassifier:
             else:
                 pred_index = int(pred) if isinstance(pred, (int, np.integer)) else 0
             prediction_indices.append(pred_index)
-
-        if hasattr(self.model,
+
+        if hasattr(self.model, "predict_proba"):
             probabilities = self.model.predict_proba(text_vectors)
         else:
-            if hasattr(self.model,
+            if hasattr(self.model, "decision_function"):
                 decision_scores = self.model.decision_function(text_vectors)
                 if len(decision_scores.shape) == 1:
-                    probabilities = np.column_stack(
+                    probabilities = np.column_stack(
+                        [
+                            1 / (1 + np.exp(decision_scores)),
+                            1 / (1 + np.exp(-decision_scores)),
+                        ]
+                    )
                 else:
-                    exp_scores = np.exp(
-
+                    exp_scores = np.exp(
+                        decision_scores - np.max(decision_scores, axis=1, keepdims=True)
+                    )
+                    probabilities = exp_scores / np.sum(
+                        exp_scores, axis=1, keepdims=True
+                    )
             else:
-                classes = getattr(self.model,
+                classes = getattr(self.model, "classes_", None)
                 num_classes = len(classes) if classes is not None else 2
                 probabilities = np.zeros((len(predictions), num_classes))
                 for i, pred_idx in enumerate(prediction_indices):
                     probabilities[i, pred_idx] = 1.0
-
+
         results = []
-
+
         for i, (pred, pred_idx) in enumerate(zip(predictions, prediction_indices)):
             confidence = float(probabilities[i][pred_idx])
-
+
             prob_distribution = {}
             if classes is not None:
                 for j, class_label in enumerate(classes):
@@ -126,35 +144,39 @@ class ArabicClassifier:
             else:
                 for j, prob in enumerate(probabilities[i]):
                     prob_distribution[f"class_{j}"] = float(prob)
-
-            results.append(
-
-
-
-
-
-
-
-
-
-
-
-
+
+            results.append(
+                {
+                    "prediction": str(pred),
+                    "prediction_index": int(pred_idx),
+                    "confidence": confidence,
+                    "probability_distribution": prob_distribution,
+                    "cleaned_text": cleaned_texts[i],
+                    "model_used": self.model_name,
+                    "prediction_metadata": {
+                        "max_probability": float(np.max(probabilities[i])),
+                        "min_probability": float(np.min(probabilities[i])),
+                        "entropy": float(
+                            -np.sum(probabilities[i] * np.log(probabilities[i] + 1e-10))
+                        ),
+                        "num_classes": len(probabilities[i]),
+                    },
                 }
-
-
+            )
+
         return results
-
+
     def get_model_info(self) -> Dict[str, Any]:
         """Get model information and capabilities."""
-        classes = getattr(self.model,
+        classes = getattr(self.model, "classes_", None)
         return {
             "model_name": self.model_name,
             "model_type": type(self.model).__name__,
             "num_classes": len(classes) if classes is not None else "unknown",
             "classes": classes.tolist() if classes is not None else None,
-            "has_predict_proba": hasattr(self.model,
+            "has_predict_proba": hasattr(self.model, "predict_proba"),
             "has_vectorizer": self.vectorizer is not None,
-            "vectorizer_type": type(self.vectorizer).__name__
+            "vectorizer_type": type(self.vectorizer).__name__
+            if self.vectorizer
+            else None,
        }
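Note on the change above: the part of predict worth a second look is the probability fallback for models without predict_proba (a plain LinearSVC, for instance): a complementary sigmoid pair when decision_function returns a single margin, and a max-shifted softmax when it returns one score per class. A standalone sketch of that logic, with an illustrative function name that is not from the repo:

import numpy as np

def scores_to_probabilities(decision_scores: np.ndarray) -> np.ndarray:
    """Mirrors the fallback in ArabicClassifier.predict for models that
    expose decision_function but not predict_proba."""
    if decision_scores.ndim == 0:
        # Binary case: one signed margin becomes two probabilities summing to 1.
        return np.array(
            [1 / (1 + np.exp(decision_scores)), 1 / (1 + np.exp(-decision_scores))]
        )
    # Multiclass case: subtract the max before exponentiating so large margins
    # cannot overflow, then normalize (the standard numerically stable softmax).
    exp_scores = np.exp(decision_scores - np.max(decision_scores))
    return exp_scores / np.sum(exp_scores)

print(scores_to_probabilities(np.float64(1.2)))                   # ~[0.231, 0.769]
print(scores_to_probabilities(np.array([2.0, 0.5, -1.0])).sum())  # 1.0

Unlike predict_proba on a calibrated model, these are uncalibrated margins squashed into [0, 1], so the entropy value derived from them in prediction_metadata should be read as a rough confidence signal rather than a true posterior.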
examples.py
ADDED
@@ -0,0 +1,214 @@
"""API request and response examples for documentation."""

EXAMPLE_TEXT = "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم. ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات. لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه. في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى."

REQUEST_EXAMPLES = {
    "text_input": {"text": EXAMPLE_TEXT},
    "text_input_with_sentences": {"text": EXAMPLE_TEXT, "num_sentences": 2},
    "batch_text_input": {
        "texts": [
            EXAMPLE_TEXT,
            "هذا نص تجريبي آخر للتصنيف باللغة العربية.",
            "المطاعم في المدينة تقدم أطباق شهية ومتنوعة.",
        ]
    },
    "preprocessing_input": {"text": EXAMPLE_TEXT, "task_type": "classification"},
}

RESPONSE_EXAMPLES = {
    "classification": {
        "prediction": "culture",
        "prediction_index": 0,
        "confidence": 0.902,
        "probability_distribution": {
            "culture": 0.902,
            "economy": 0.001,
            "international": 0.0,
            "local": 0.061,
            "religion": 0.0,
            "sports": 0.036,
        },
        "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
        "model_used": "svm_classifier",
        "prediction_metadata": {
            "max_probability": 0.902,
            "min_probability": 0.0,
            "entropy": 0.393,
            "num_classes": 6,
        },
    },
    "summarization": {
        "summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
        "original_sentence_count": 4,
        "summary_sentence_count": 2,
        "sentences": [
            "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
            "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
            "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
            "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
        ],
        "selected_indices": [1, 2],
        "sentence_scores": [2.968, 3.224, 3.234, 2.642],
        "top_sentence_scores": [3.224, 3.234],
    },
    "text_analysis": {
        "text": EXAMPLE_TEXT,
        "analysis": {
            "character_count": 282,
            "word_count": 46,
            "sentence_count": 4,
            "arabic_character_count": 252,
            "arabic_character_ratio": 0.8936,
            "average_word_length": 5.48,
            "average_sentence_length": 11.5,
            "has_diacritics": False,
            "punctuation_count": 3,
        },
    },
    "batch_classification": {
        "results": [
            {
                "prediction": "culture",
                "prediction_index": 0,
                "confidence": 0.902,
                "probability_distribution": {
                    "culture": 0.902,
                    "economy": 0.001,
                    "international": 0.0,
                    "local": 0.061,
                    "religion": 0.0,
                    "sports": 0.036,
                },
                "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
                "model_used": "svm_classifier",
                "prediction_metadata": {
                    "max_probability": 0.902,
                    "min_probability": 0.0,
                    "entropy": 0.393,
                    "num_classes": 6,
                },
            }
        ],
        "total_texts": 3,
        "model_used": "svm_classifier",
    },
    "sentence_analysis": {
        "sentences": [
            {
                "index": 0,
                "sentence": "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
                "score": 2.968,
                "rank": 3,
            },
            {
                "index": 1,
                "sentence": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
                "score": 3.224,
                "rank": 2,
            },
            {
                "index": 2,
                "sentence": "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
                "score": 3.234,
                "rank": 1,
            },
            {
                "index": 3,
                "sentence": "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
                "score": 2.642,
                "rank": 4,
            },
        ],
        "total_sentences": 4,
        "score_statistics": {"mean": 3.017, "std": 0.254, "min": 2.642, "max": 3.234},
    },
    "complete_analysis": {
        "original_text": EXAMPLE_TEXT,
        "text_analysis": {
            "character_count": 282,
            "word_count": 46,
            "sentence_count": 4,
            "arabic_character_count": 252,
            "arabic_character_ratio": 0.8936,
            "average_word_length": 5.48,
            "average_sentence_length": 11.5,
            "has_diacritics": False,
            "punctuation_count": 3,
        },
        "classification": {
            "prediction": "culture",
            "prediction_index": 0,
            "confidence": 0.902,
            "probability_distribution": {
                "culture": 0.902,
                "economy": 0.001,
                "international": 0.0,
                "local": 0.061,
                "religion": 0.0,
                "sports": 0.036,
            },
            "cleaned_text": "يكن سعر فاكه خضرو موسم انبات اقل غير موسم",
            "model_used": "svm_classifier",
            "prediction_metadata": {
                "max_probability": 0.902,
                "min_probability": 0.0,
                "entropy": 0.393,
                "num_classes": 6,
            },
        },
        "summarization": {
            "summary": "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
            "original_sentence_count": 4,
            "summary_sentence_count": 2,
            "sentences": [
                "يكون سعر الفاكهة والخضراوات في موسم إنباتها أقل من غيره من المواسم",
                "ستلجأ محلات الخضروات إلى عرض الفاكهة بأسعار مناسبة في موسمها بسبب توفر المنتجات",
                "لا يقتصر الأمر على السعر الأقل، بل سيكون طعامك أشهى وألذ عند تناوله في موسمه",
                "في فصل الخريف يتوفر التفاح والتين والبنجر والكمثرى",
            ],
            "selected_indices": [1, 2],
            "sentence_scores": [2.968, 3.224, 3.234, 2.642],
            "top_sentence_scores": [3.224, 3.234],
        },
    },
    "preprocessing": {
        "task_type": "classification",
        "preprocessing_steps": {
            "original_text": EXAMPLE_TEXT,
            "step_1_remove_diacritics": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
            "step_2_remove_punctuation": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
            "step_3_normalize_text": "يكون سعر الفاكهة والخضراوات في موسم انباتها اقل من غيره من المواسم",
            "step_4_remove_stopwords": "سعر فاكهة خضراوات موسم انباتها اقل غيره مواسم",
            "step_5_stem_words": "سعر فاكه خضرو موسم انبات اقل غير موسم",
            "final_result": "سعر فاكه خضرو موسم انبات اقل غير موسم",
            "preprocessing_summary": {
                "original_length": 282,
                "final_length": 47,
                "reduction_percentage": 83.3,
                "words_removed": 39,
                "words_remaining": 7,
            },
        },
    },
    "model_info": {
        "classifier": {
            "model_name": "svm_classifier",
            "vectorizer_loaded": True,
            "model_loaded": True,
            "classes": [
                "culture",
                "economy",
                "international",
                "local",
                "religion",
                "sports",
            ],
            "num_classes": 6,
            "model_type": "SVM with TF-IDF vectorization",
        },
        "summarizer": {
            "vectorizer_loaded": True,
            "model_type": "TF-IDF based summarization",
        },
    },
}
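Note on the new file: because the response samples mirror real model output, they double as cheap regression fixtures. For instance, the entropy field in the classification example can be recomputed from its own probability_distribution with the same formula classifier.py uses. A hedged sketch, assuming examples.py is importable; the result lands near the published 0.393 rather than exactly on it, because the sample probabilities are rounded to three decimals:

import numpy as np

from examples import RESPONSE_EXAMPLES

example = RESPONSE_EXAMPLES["classification"]
probs = np.array(list(example["probability_distribution"].values()))

# Same expression as classifier.py: natural-log entropy with a small epsilon
# so zero-probability classes do not produce log(0).
entropy = float(-np.sum(probs * np.log(probs + 1e-10)))

print(round(entropy, 3))                          # ~0.390
print(example["prediction_metadata"]["entropy"])  # 0.393 (from unrounded probs)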