File size: 4,196 Bytes
333e68b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# tfidf_based_models/tfidf_xgb.py

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import numpy as np
import pandas as pd
import joblib
import os

from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR

class TfidfXGBoost:
    """
    TF-IDF based XGBoost model for multi-output classification.

    Trains one independent Pipeline (TfidfVectorizer -> XGBClassifier) per
    target column in LABEL_COLUMNS. Targets are assumed to be already
    integer-encoded (0..n_classes-1) by the supplied label encoders; the
    encoders themselves are used only to count classes per target.
    """

    def __init__(self, label_encoders):
        # label column name -> fitted sklearn LabelEncoder (class counting only)
        self.label_encoders = label_encoders
        self.models = {}  # label column name -> fitted sklearn Pipeline

    def train(self, X_train_text, y_train_df):
        """Fit one TF-IDF + XGBoost pipeline for every column in LABEL_COLUMNS.

        Args:
            X_train_text: iterable of raw training documents.
            y_train_df: DataFrame with one integer-encoded column per label.
        """
        print("Training TF-IDF + XGBoost models...")
        for col in LABEL_COLUMNS:
            print(f"  Training for {col}...")

            num_classes = len(self.label_encoders[col].classes_)
            # Binary targets use the logistic objective; multi-class targets
            # need softprob and an explicit num_class parameter.
            objective = 'multi:softprob' if num_classes > 2 else 'binary:logistic'
            num_class_param = {'num_class': num_classes} if num_classes > 2 else {}

            pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
                ('xgb', xgb.XGBClassifier(
                    objective=objective,
                    **num_class_param,
                    random_state=42,
                    n_estimators=100,
                    # NOTE: the former use_label_encoder=False flag was dropped;
                    # it is deprecated/removed in XGBoost >= 1.6 and passing it
                    # only triggers warnings. Labels must already be 0..n-1.
                    eval_metric='mlogloss' if num_classes > 2 else 'logloss'
                ))
            ])

            pipeline.fit(X_train_text, y_train_df[col])
            self.models[col] = pipeline

        print("TF-IDF + XGBoost training complete.")

    def predict(self, X_test_text):
        """Return a dict mapping each trained label column to its predictions."""
        predictions = {}
        for col, model_pipeline in self.models.items():
            predictions[col] = model_pipeline.predict(X_test_text)
        return predictions

    def predict_proba(self, X_test_text):
        """Return a list of per-label probability arrays, ordered as LABEL_COLUMNS.

        A label without a trained model contributes an empty array so the
        positional alignment with LABEL_COLUMNS is preserved.
        """
        probabilities = []
        for col in LABEL_COLUMNS:
            if col in self.models:
                probabilities.append(self.models[col].predict_proba(X_test_text))
            else:
                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
                probabilities.append(np.array([]))
        return probabilities

    def evaluate(self, X_test_text, y_test_df):
        """Evaluate every trained model against y_test_df.

        Returns:
            reports: dict of per-label sklearn classification reports.
            truths: list (aligned with LABEL_COLUMNS) of true label lists.
            preds: list (aligned with LABEL_COLUMNS) of predicted label lists.
        """
        reports = {}
        truths = [[] for _ in range(len(LABEL_COLUMNS))]
        preds = [[] for _ in range(len(LABEL_COLUMNS))]

        for i, col in enumerate(LABEL_COLUMNS):
            if col in self.models:
                y_pred = self.models[col].predict(X_test_text)
                y_true = y_test_df[col].values
                try:
                    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
                    reports[col] = report
                except ValueError:
                    # Degenerate label distributions can make the report fail;
                    # record a zeroed placeholder instead of aborting evaluation.
                    print(f"Warning: Could not generate classification report for {col}. Skipping.")
                    reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
                truths[i].extend(y_true)
                preds[i].extend(y_pred)
            else:
                print(f"Warning: Model for {col} not found for evaluation.")
        return reports, truths, preds

    def save_model(self, model_name="tfidf_xgb", save_format='pickle'):
        """Persist all trained pipelines as a single joblib pickle.

        Raises:
            ValueError: if save_format is anything other than 'pickle'.
        """
        if save_format != 'pickle':
            raise ValueError("TF-IDF models only support 'pickle' format")
        # Ensure the target directory exists; joblib.dump does not create it.
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
        save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(self.models, save_path)
        print(f"TF-IDF XGBoost models saved to {save_path}")

    def load_model(self, model_name="tfidf_xgb"):
        """Load pipelines saved by save_model; reset to empty dict on a miss."""
        load_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        if os.path.exists(load_path):
            self.models = joblib.load(load_path)
            print(f"TF-IDF XGBoost models loaded from {load_path}")
        else:
            print(f"Error: Model file not found at {load_path}. Initializing models as empty.")
            self.models = {}