Spaces:
Sleeping
Sleeping
# tfidf_based_models/tfidf_xgb.py
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.pipeline import Pipeline | |
from sklearn.metrics import classification_report | |
from sklearn.preprocessing import LabelEncoder | |
import xgboost as xgb | |
import numpy as np | |
import pandas as pd | |
import joblib | |
import os | |
from config import TEXT_COLUMN, LABEL_COLUMNS, TFIDF_MAX_FEATURES, MODEL_SAVE_DIR | |
class TfidfXGBoost:
    """
    TF-IDF based XGBoost model for multi-output classification.

    Trains a separate XGBoost classifier for each target label using TF-IDF
    features. Each target column in ``LABEL_COLUMNS`` gets its own
    ``Pipeline`` (TF-IDF vectorizer followed by an ``XGBClassifier``), stored
    in ``self.models`` under the column name.
    """

    def __init__(self, label_encoders):
        """Store the per-column label encoders and start with no models.

        Args:
            label_encoders: Mapping from label column name to a fitted
                encoder exposing ``classes_``; used in ``train`` to decide
                between the binary and the multi-class objective.
        """
        self.label_encoders = label_encoders
        self.models = {}  # Dictionary to hold trained pipelines

    def train(self, X_train_text, y_train_df):
        """Fit one TF-IDF + XGBoost pipeline per column in ``LABEL_COLUMNS``.

        Args:
            X_train_text: Training texts passed straight to the pipeline's
                TF-IDF vectorizer.
            y_train_df: Object indexable by column name that yields the
                target values for each label column.
                NOTE(review): presumably integer-encoded labels matching
                ``label_encoders`` -- confirm against the caller.
        """
        print("Training TF-IDF + XGBoost models...")
        for col in LABEL_COLUMNS:
            print(f" Training for {col}...")
            num_classes = len(self.label_encoders[col].classes_)
            # More than two classes needs the softmax objective together
            # with an explicit class count and the multi-class log loss.
            is_multiclass = num_classes > 2
            objective = 'multi:softprob' if is_multiclass else 'binary:logistic'
            num_class_param = {'num_class': num_classes} if is_multiclass else {}
            pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
                ('xgb', xgb.XGBClassifier(
                    objective=objective,
                    **num_class_param,
                    random_state=42,
                    n_estimators=100,
                    # NOTE(review): newer XGBoost releases appear to treat
                    # this flag as deprecated/unused; kept so behaviour on
                    # older releases does not change.
                    use_label_encoder=False,
                    eval_metric='mlogloss' if is_multiclass else 'logloss',
                )),
            ])
            pipeline.fit(X_train_text, y_train_df[col])
            self.models[col] = pipeline
        print("TF-IDF + XGBoost training complete.")

    def predict(self, X_test_text):
        """Return predicted labels for every trained column.

        Args:
            X_test_text: Texts to classify.

        Returns:
            Dict mapping each column name present in ``self.models`` to the
            output of that pipeline's ``predict``. Empty when no model has
            been trained or loaded.
        """
        predictions = {}
        for col, model_pipeline in self.models.items():
            predictions[col] = model_pipeline.predict(X_test_text)
        return predictions

    def predict_proba(self, X_test_text):
        """Return class probabilities for each column in ``LABEL_COLUMNS``.

        Args:
            X_test_text: Texts to classify.

        Returns:
            List with one entry per column of ``LABEL_COLUMNS``, in that
            order. A column without a trained model contributes an empty
            NumPy array (and a printed warning) so positions stay aligned.
        """
        probabilities = []
        for col in LABEL_COLUMNS:
            if col in self.models:
                probabilities.append(self.models[col].predict_proba(X_test_text))
            else:
                print(f"Warning: Model for {col} not found, cannot predict probabilities.")
                probabilities.append(np.array([]))
        return probabilities

    def evaluate(self, X_test_text, y_test_df):
        """Score every trained column against the ground truth.

        Args:
            X_test_text: Texts to classify.
            y_test_df: Object indexable by column name whose entries expose
                ``.values`` holding the true labels.

        Returns:
            Tuple ``(reports, truths, preds)``:
            ``reports`` maps column name to the ``classification_report``
            dict (or a zero-filled placeholder when the report raised
            ``ValueError``); ``truths`` and ``preds`` are lists of lists
            aligned with ``LABEL_COLUMNS``, left empty for columns that have
            no model.
        """
        reports = {}
        truths = [[] for _ in LABEL_COLUMNS]
        preds = [[] for _ in LABEL_COLUMNS]
        for i, col in enumerate(LABEL_COLUMNS):
            if col not in self.models:
                print(f"Warning: Model for {col} not found for evaluation.")
                continue
            y_pred = self.models[col].predict(X_test_text)
            y_true = y_test_df[col].values
            try:
                reports[col] = classification_report(
                    y_true, y_pred, output_dict=True, zero_division=0
                )
            except ValueError:
                print(f"Warning: Could not generate classification report for {col}. Skipping.")
                # Placeholder with the keys callers read from a real report.
                reports[col] = {
                    'accuracy': 0,
                    'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0},
                }
            truths[i].extend(y_true)
            preds[i].extend(y_pred)
        return reports, truths, preds

    def save_model(self, model_name="tfidf_xgb", save_format='pickle'):
        """Persist all trained pipelines to ``MODEL_SAVE_DIR/<model_name>.pkl``.

        Args:
            model_name: File name stem for the saved artefact.
            save_format: Must be ``'pickle'``; no other format is supported.

        Raises:
            ValueError: If ``save_format`` is anything other than ``'pickle'``.
        """
        if save_format != 'pickle':
            raise ValueError("TF-IDF models only support 'pickle' format")
        # Create the target directory on first use; joblib.dump does not
        # create missing parent directories. An empty MODEL_SAVE_DIR means
        # the current directory, which needs no creation.
        if MODEL_SAVE_DIR:
            os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
        save_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(self.models, save_path)
        print(f"TF-IDF XGBoost models saved to {save_path}")

    def load_model(self, model_name="tfidf_xgb"):
        """Load pipelines from ``MODEL_SAVE_DIR/<model_name>.pkl`` if present.

        When the file does not exist, an error is printed and
        ``self.models`` is reset to an empty dict rather than raising.

        Args:
            model_name: File name stem of the saved artefact.
        """
        load_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        if os.path.exists(load_path):
            # SECURITY: joblib.load unpickles the file and can execute
            # arbitrary code; only load artefacts from a trusted source.
            self.models = joblib.load(load_path)
            print(f"TF-IDF XGBoost models loaded from {load_path}")
        else:
            print(f"Error: Model file not found at {load_path}. Initializing models as empty.")
            self.models = {}