# XGB / models/tfidf_xgb.py
# Author (from repository page): subbunanepalli
# Commit 333e68b (verified): "Create models/tfidf_xgb.py"
# tfidf_based_models/tfidf_xgb.py
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import numpy as np
import pandas as pd
import joblib
import os
from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR
class TfidfXGBoost:
    """
    TF-IDF based XGBoost model for multi-output classification.

    Trains a separate XGBoost classifier for each target label using TF-IDF
    features. Every column named in ``LABEL_COLUMNS`` gets its own
    ``Pipeline`` (``TfidfVectorizer`` followed by ``XGBClassifier``); the
    fitted pipelines are kept in ``self.models`` keyed by column name.
    """

    def __init__(self, label_encoders):
        """
        Store the label encoders and start with no trained pipelines.

        Args:
            label_encoders: Mapping from label column name to an object
                exposing ``classes_``; ``train`` uses its length to decide
                between the binary and multi-class objective.
        """
        self.label_encoders = label_encoders
        self.models = {}  # Dictionary to hold trained pipelines

    def train(self, X_train_text, y_train_df):
        """
        Fit one TF-IDF + XGBoost pipeline per column in ``LABEL_COLUMNS``.

        Args:
            X_train_text: Training texts handed to each pipeline's ``fit``.
            y_train_df: Object indexable by column name (``y_train_df[col]``)
                providing the targets for each label.
                NOTE(review): presumably already integer-encoded with the
                matching entry of ``label_encoders`` -- confirm with caller.
        """
        print("Training TF-IDF + XGBoost models...")
        for col in LABEL_COLUMNS:
            print(f" Training for {col}...")
            num_classes = len(self.label_encoders[col].classes_)
            is_multiclass = num_classes > 2

            xgb_params = {
                'objective': 'multi:softprob' if is_multiclass else 'binary:logistic',
                'eval_metric': 'mlogloss' if is_multiclass else 'logloss',
                'random_state': 42,
                'n_estimators': 100,
                # NOTE(review): kept for compatibility with older XGBoost
                # releases; newer releases appear to deprecate/ignore this
                # flag -- confirm against the pinned xgboost version.
                'use_label_encoder': False,
            }
            # ``num_class`` is only passed for the multi-class objective.
            if is_multiclass:
                xgb_params['num_class'] = num_classes

            pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
                ('xgb', xgb.XGBClassifier(**xgb_params)),
            ])
            pipeline.fit(X_train_text, y_train_df[col])
            self.models[col] = pipeline
        print("TF-IDF + XGBoost training complete.")

    def predict(self, X_test_text):
        """
        Predict every label for the given texts.

        Args:
            X_test_text: Texts handed to each stored pipeline's ``predict``.

        Returns:
            dict mapping each column name present in ``self.models`` to the
            value returned by that pipeline's ``predict``.
        """
        return {
            col: model_pipeline.predict(X_test_text)
            for col, model_pipeline in self.models.items()
        }

    def predict_proba(self, X_test_text):
        """
        Predict class probabilities for every label in ``LABEL_COLUMNS``.

        Args:
            X_test_text: Texts handed to each pipeline's ``predict_proba``.

        Returns:
            list with one entry per column of ``LABEL_COLUMNS`` (same order).
            A column without a trained pipeline yields an empty NumPy array
            and a printed warning, so positions stay aligned with the labels.
        """
        probabilities = []
        for col in LABEL_COLUMNS:
            if col in self.models:
                probabilities.append(self.models[col].predict_proba(X_test_text))
            else:
                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
                probabilities.append(np.array([]))
        return probabilities

    def evaluate(self, X_test_text, y_test_df):
        """
        Build a classification report per label and collect truths/predictions.

        Args:
            X_test_text: Texts handed to each pipeline's ``predict``.
            y_test_df: Object indexable by column name whose entries expose
                ``.values`` with the ground-truth labels.

        Returns:
            tuple ``(reports, truths, preds)`` where ``reports`` maps column
            name to the ``classification_report`` dict (or a zero-filled
            placeholder when the report raises ``ValueError``), and
            ``truths`` / ``preds`` are lists of lists positioned like
            ``LABEL_COLUMNS``. Columns without a trained pipeline are absent
            from ``reports`` and keep empty lists in ``truths`` / ``preds``.
        """
        reports = {}
        truths = [[] for _ in LABEL_COLUMNS]
        preds = [[] for _ in LABEL_COLUMNS]

        for i, col in enumerate(LABEL_COLUMNS):
            if col not in self.models:
                print(f"Warning: Model for {col} not found for evaluation.")
                continue

            y_pred = self.models[col].predict(X_test_text)
            y_true = y_test_df[col].values
            try:
                reports[col] = classification_report(
                    y_true, y_pred, output_dict=True, zero_division=0
                )
            except ValueError:
                # Best-effort: keep evaluating the remaining labels and
                # record a zero-filled report for this one.
                print(f"Warning: Could not generate classification report for {col}. Skipping.")
                reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
            truths[i].extend(y_true)
            preds[i].extend(y_pred)

        return reports, truths, preds

    def save_model(self, model_name="tfidf_xgb", save_format='pickle'):
        """
        Persist all trained pipelines to ``MODEL_SAVE_DIR/<model_name>.pkl``.

        Args:
            model_name: File stem of the saved artefact.
            save_format: Must be ``'pickle'``; accepted only so the signature
                matches callers that pass a format.

        Raises:
            ValueError: If ``save_format`` is anything other than ``'pickle'``.
        """
        if save_format != 'pickle':
            raise ValueError("TF-IDF models only support 'pickle' format")
        # Create the target directory on first use so a fresh checkout does
        # not fail inside joblib.dump; an empty directory setting means
        # "current directory" and needs no creation.
        if MODEL_SAVE_DIR:
            os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
        save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(self.models, save_path)
        print(f"TF-IDF XGBoost models saved to {save_path}")

    def load_model(self, model_name="tfidf_xgb"):
        """
        Load pipelines from ``MODEL_SAVE_DIR/<model_name>.pkl`` into ``self.models``.

        When the file does not exist, an error message is printed and
        ``self.models`` is reset to an empty dict instead of raising.

        Args:
            model_name: File stem of the artefact to load.
        """
        load_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        if os.path.exists(load_path):
            # SECURITY: joblib.load unpickles arbitrary objects; only load
            # model files that come from a trusted source.
            self.models = joblib.load(load_path)
            print(f"TF-IDF XGBoost models loaded from {load_path}")
        else:
            print(f"Error: Model file not found at {load_path}. Initializing models as empty.")
            self.models = {}