subbunanepalli committed
Commit 4a9cc93 · verified · 1 Parent(s): 6df3606

Update train_utils.py

Files changed (1)
  1. train_utils.py +242 -93
train_utils.py CHANGED
@@ -1,78 +1,202 @@
+import torch
+import torch.nn as nn
+from torch.optim import AdamW
+from sklearn.metrics import classification_report
+from sklearn.utils.class_weight import compute_class_weight
+import numpy as np
+from tqdm import tqdm
+import pandas as pd
 import os
 import joblib
-import pandas as pd
-import numpy as np
-from sklearn.metrics import classification_report
-from sklearn.preprocessing import LabelEncoder
-from xgboost import XGBClassifier
-from config import LABEL_COLUMNS, MODEL_SAVE_DIR

-def train_xgb_models(X_train, y_train, label_encoders):
+from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE, MODEL_SAVE_DIR
+
+def get_class_weights(data_df, field, label_encoder):
+    """
+    Computes balanced class weights for a given target field.
+    These weights can be used in the loss function to mitigate class imbalance.
+    Args:
+        data_df (pd.DataFrame): The DataFrame containing the original (unencoded) label data.
+        field (str): The name of the label column for which to compute weights.
+        label_encoder (sklearn.preprocessing.LabelEncoder): The label encoder fitted for this field.
+    Returns:
+        torch.Tensor: A tensor of class weights for the specified field.
+    """
+    # Get the original labels for the specified field
+    y = data_df[field].values
+    # label_encoder.transform raises ValueError on unseen labels; fall back to seen labels below
+    try:
+        y_encoded = label_encoder.transform(y)
+    except ValueError as e:
+        print(f"Warning: {e}")
+        print("Using only seen labels for class weights calculation")
+        # Filter out unseen labels
+        seen_labels = set(label_encoder.classes_)
+        y_filtered = [label for label in y if label in seen_labels]
+        y_encoded = label_encoder.transform(y_filtered)
+
+    # Ensure y_encoded is integer type
+    y_encoded = y_encoded.astype(int)
+
+    # Initialize counts for all possible classes
+    n_classes = len(label_encoder.classes_)
+    class_counts = np.zeros(n_classes, dtype=int)
+
+    # Count occurrences of each class
+    for i in range(n_classes):
+        class_counts[i] = np.sum(y_encoded == i)
+
+    # Calculate weights for all classes
+    total_samples = len(y_encoded)
+    class_weights = np.ones(n_classes)  # Default weight of 1 for unseen classes
+    seen_classes = class_counts > 0
+    if np.any(seen_classes):
+        class_weights[seen_classes] = total_samples / (np.sum(seen_classes) * class_counts[seen_classes])
+
+    return torch.tensor(class_weights, dtype=torch.float)
+
+def initialize_criterions(data_df, label_encoders):
     """
-    Trains a separate XGBoost model for each label in a multi-output classification setup.
+    Initializes CrossEntropyLoss criteria for each label column, applying class weights.
     Args:
-        X_train (array): Feature matrix (TF-IDF).
-        y_train (DataFrame): DataFrame with one column per label.
-        label_encoders (dict): LabelEncoders for each label column.
+        data_df (pd.DataFrame): The original (unencoded) DataFrame. Used to compute class weights.
+        label_encoders (dict): Dictionary of LabelEncoder objects.
     Returns:
-        dict: Trained XGBoost models for each label.
+        dict: A dictionary where keys are label column names and values are
+              initialized `torch.nn.CrossEntropyLoss` objects.
     """
-    models = {}
-    for label in LABEL_COLUMNS:
-        y_encoded = label_encoders[label].transform(y_train[label])
-        model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', verbosity=0)
-        model.fit(X_train, y_encoded)
-        models[label] = model
-        print(f"Trained XGBoost model for {label}")
-    return models
+    field_criterions = {}
+    for field in LABEL_COLUMNS:
+        # Get class weights for the current field
+        weights = get_class_weights(data_df, field, label_encoders[field])
+        # Initialize CrossEntropyLoss with the computed weights and move to the device
+        field_criterions[field] = torch.nn.CrossEntropyLoss(weight=weights.to(DEVICE))
+    return field_criterions

-def evaluate_xgb_models(models, X_test, y_test, label_encoders):
+def train_model(model, loader, optimizer, field_criterions, epoch):
     """
-    Evaluates XGBoost models on the test data.
+    Trains the given PyTorch model for one epoch.
     Args:
-        models (dict): Trained models.
-        X_test (array): TF-IDF features.
-        y_test (DataFrame): Ground truth.
-        label_encoders (dict): Label encoders used to encode labels.
+        model (torch.nn.Module): The model to train.
+        loader (torch.utils.data.DataLoader): DataLoader for training data.
+        optimizer (torch.optim.Optimizer): Optimizer for model parameters.
+        field_criterions (dict): Dictionary of loss functions for each label.
+        epoch (int): Current epoch number (for progress bar description).
     Returns:
-        tuple: classification reports, ground truths, predictions
+        float: Average training loss for the epoch.
     """
-    reports = {}
-    predictions = []
-    truths = []
+    model.train()  # Set the model to training mode
+    total_loss = 0
+    # Use tqdm for a progress bar during training
+    tqdm_loader = tqdm(loader, desc=f"Epoch {epoch + 1} Training")
+
+    for batch in tqdm_loader:
+        # Unpack batch based on whether it contains metadata
+        if len(batch) == 2:  # Text-only models (inputs, labels)
+            inputs, labels = batch
+            input_ids = inputs['input_ids'].to(DEVICE)
+            attention_mask = inputs['attention_mask'].to(DEVICE)
+            labels = labels.to(DEVICE)
+            # Forward pass through the model
+            outputs = model(input_ids, attention_mask)
+        elif len(batch) == 3:  # Text + metadata models (inputs, metadata, labels)
+            inputs, metadata, labels = batch
+            input_ids = inputs['input_ids'].to(DEVICE)
+            attention_mask = inputs['attention_mask'].to(DEVICE)
+            metadata = metadata.to(DEVICE)
+            labels = labels.to(DEVICE)
+            # Forward pass through the hybrid model
+            outputs = model(input_ids, attention_mask, metadata)
+        else:
+            raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

-    for label in LABEL_COLUMNS:
-        model = models[label]
-        y_true = label_encoders[label].transform(y_test[label])
-        y_pred = model.predict(X_test)
+        loss = 0
+        # Calculate total loss by summing loss for each label column
+        # `outputs` is a list of logits, one for each label column
+        for i, output_logits in enumerate(outputs):
+            # `labels[:, i]` gets the true labels for the i-th label column
+            # `field_criterions[LABEL_COLUMNS[i]]` selects the appropriate loss function
+            loss += field_criterions[LABEL_COLUMNS[i]](output_logits, labels[:, i])

-        truths.append(y_true)
-        predictions.append(y_pred)
+        optimizer.zero_grad()  # Clear previous gradients
+        loss.backward()  # Backpropagation
+        optimizer.step()  # Update model parameters
+        total_loss += loss.item()  # Accumulate loss
+        tqdm_loader.set_postfix(loss=loss.item())  # Update progress bar with current batch loss

+    return total_loss / len(loader)  # Return average loss for the epoch
+
+def evaluate_model(model, loader):
+    """
+    Evaluates the given PyTorch model on a validation/test set.
+    Args:
+        model (torch.nn.Module): The model to evaluate.
+        loader (torch.utils.data.DataLoader): DataLoader for evaluation data.
+    Returns:
+        tuple: A tuple containing:
+            - reports (dict): Classification reports (dict format) for each label column.
+            - truths (list): List of true label arrays for each label column.
+            - predictions (list): List of predicted label arrays for each label column.
+    """
+    model.eval()  # Set the model to evaluation mode (disables dropout, batch norm updates, etc.)
+    # Initialize lists to store predictions and true labels for each output head
+    predictions = [[] for _ in range(len(LABEL_COLUMNS))]
+    truths = [[] for _ in range(len(LABEL_COLUMNS))]
+
+    with torch.no_grad():  # Disable gradient calculations during evaluation for efficiency
+        for batch in tqdm(loader, desc="Evaluation"):
+            if len(batch) == 2:
+                inputs, labels = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                labels = labels.to(DEVICE)
+                outputs = model(input_ids, attention_mask)
+            elif len(batch) == 3:
+                inputs, metadata, labels = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                metadata = metadata.to(DEVICE)
+                labels = labels.to(DEVICE)
+                outputs = model(input_ids, attention_mask, metadata)
+            else:
+                raise ValueError("Unsupported batch format.")
+
+            for i, output_logits in enumerate(outputs):
+                # Get the predicted class by taking the argmax of the logits
+                preds = torch.argmax(output_logits, dim=1).cpu().numpy()
+                predictions[i].extend(preds)
+                # Get the true labels for the current output head
+                truths[i].extend(labels[:, i].cpu().numpy())
+
+    reports = {}
+    # Generate classification report for each label column
+    for i, col in enumerate(LABEL_COLUMNS):
         try:
-            reports[label] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
+            # `zero_division=0` handles cases where a class might have no true or predicted samples
+            reports[col] = classification_report(truths[i], predictions[i], output_dict=True, zero_division=0)
         except ValueError:
-            print(f"Skipping classification report for {label} due to error.")
-            reports[label] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
-
+            # Handle cases where a label might not appear in the validation set,
+            # which could cause classification_report to fail.
+            print(f"Warning: Could not generate classification report for {col}. Skipping.")
+            reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
     return reports, truths, predictions

 def summarize_metrics(metrics):
     """
     Summarizes classification reports into a readable Pandas DataFrame.
     Args:
-        metrics (dict): Dictionary of classification reports.
+        metrics (dict): Dictionary of classification reports, as returned by `evaluate_model`.
     Returns:
-        pd.DataFrame: Summary metrics per label.
+        pd.DataFrame: A DataFrame summarizing precision, recall, f1-score, accuracy, and support for each field.
     """
     summary = []
     for field, report in metrics.items():
-        precision = report['weighted avg']['precision']
-        recall = report['weighted avg']['recall']
-        f1 = report['weighted avg']['f1-score']
-        support = report['weighted avg']['support']
-        accuracy = report.get('accuracy', 0)
-
+        # Safely get metrics, defaulting to 0 if not present (e.g., for empty reports)
+        precision = report['weighted avg']['precision'] if 'weighted avg' in report else 0
+        recall = report['weighted avg']['recall'] if 'weighted avg' in report else 0
+        f1 = report['weighted avg']['f1-score'] if 'weighted avg' in report else 0
+        support = report['weighted avg']['support'] if 'weighted avg' in report else 0
+        accuracy = report['accuracy'] if 'accuracy' in report else 0  # Accuracy is usually top-level
         summary.append({
             "Field": field,
             "Precision": precision,
@@ -83,62 +207,87 @@ def summarize_metrics(metrics):
         })
     return pd.DataFrame(summary)

-def save_xgb_models(models):
+def save_model(model, model_name, save_format='pth'):
     """
-    Saves each XGBoost model to disk using joblib.
+    Saves the state dictionary of a PyTorch model.
     Args:
-        models (dict): Dictionary of trained models.
+        model (torch.nn.Module): The trained PyTorch model.
+        model_name (str): A descriptive name for the model (used for filename).
+        save_format (str): Format to save the model in ('pth' for PyTorch models, 'pickle' for traditional ML models).
     """
-    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
-    for label, model in models.items():
-        model_path = os.path.join(MODEL_SAVE_DIR, f"{label}_xgb_model.pkl")
+    # Construct the save path dynamically relative to the project root
+    if save_format == 'pth':
+        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
+        torch.save(model.state_dict(), model_path)
+    elif save_format == 'pickle':
+        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
         joblib.dump(model, model_path)
-        print(f"Saved model for {label} to {model_path}")
+    else:
+        raise ValueError(f"Unsupported save format: {save_format}")
+
+    print(f"Model saved to {model_path}")

-def load_xgb_models():
+def load_model_state(model, model_name, model_class, num_labels, metadata_dim=0):
     """
-    Loads XGBoost models from disk.
+    Loads the state dictionary into a PyTorch model.
+    Args:
+        model (torch.nn.Module): An initialized model instance (architecture).
+        model_name (str): The name of the model to load.
+        model_class (class): The class of the model (e.g., BertMultiOutputModel).
+        num_labels (list): List of number of classes for each label.
+        metadata_dim (int): Dimensionality of metadata features, if applicable (default 0 for text-only).
     Returns:
-        dict: Loaded models.
-    """
-    models = {}
-    for label in LABEL_COLUMNS:
-        model_path = os.path.join(MODEL_SAVE_DIR, f"{label}_xgb_model.pkl")
-        if os.path.exists(model_path):
-            models[label] = joblib.load(model_path)
-            print(f"Loaded model for {label}")
+        torch.nn.Module: The model with loaded state_dict, moved to the correct device, and set to eval mode.
+    """
+    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
+    if not os.path.exists(model_path):
+        print(f"Warning: Model file not found at {model_path}. Returning a newly initialized model instance.")
+        # Re-initialize the model if not found, to ensure it has the correct architecture
+        if metadata_dim > 0:
+            return model_class(num_labels, metadata_dim=metadata_dim).to(DEVICE)
         else:
-            print(f"Model not found: {model_path}")
-    return models
+            return model_class(num_labels).to(DEVICE)
+
+    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+    model.to(DEVICE)
+    model.eval()  # Set to evaluation mode after loading
+    print(f"Model loaded from {model_path}")
+    return model

-def predict_xgb(models, X):
+def predict_probabilities(model, loader):
     """
-    Predicts classes for each label using trained models.
+    Generates prediction probabilities for each label for a given model.
+    This is used for confidence scoring and feeding into a voting ensemble.
     Args:
-        models (dict): Dictionary of models.
-        X (array): TF-IDF input features.
+        model (torch.nn.Module): The trained PyTorch model.
+        loader (torch.utils.data.DataLoader): DataLoader for the data to predict on.
     Returns:
-        dict: Predictions per label.
+        list: A list of lists of numpy arrays. Each inner list corresponds to a label column,
+              containing the softmax probabilities for each sample for that label.
     """
-    predictions = {}
-    for label in LABEL_COLUMNS:
-        model = models[label]
-        preds = model.predict(X)
-        predictions[label] = preds
-    return predictions
+    model.eval()  # Set to evaluation mode
+    # List to store probabilities for each output head
+    all_probabilities = [[] for _ in range(len(LABEL_COLUMNS))]

-def predict_xgb_proba(models, X):
-    """
-    Predicts probabilities for each label using trained models.
-    Args:
-        models (dict): Dictionary of models.
-        X (array): TF-IDF features.
-    Returns:
-        dict: Predicted probabilities per label.
-    """
-    probabilities = {}
-    for label in LABEL_COLUMNS:
-        model = models[label]
-        probs = model.predict_proba(X)
-        probabilities[label] = probs
-    return probabilities
+    with torch.no_grad():
+        for batch in tqdm(loader, desc="Predicting Probabilities"):
+            # Unpack batch, ignoring labels as we only need inputs
+            if len(batch) == 2:
+                inputs, _ = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                outputs = model(input_ids, attention_mask)
+            elif len(batch) == 3:
+                inputs, metadata, _ = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                metadata = metadata.to(DEVICE)
+                outputs = model(input_ids, attention_mask, metadata)
+            else:
+                raise ValueError("Unsupported batch format.")
+
+            for i, out_logits in enumerate(outputs):
+                # Apply softmax to logits to get probabilities
+                probs = torch.softmax(out_logits, dim=1).cpu().numpy()
+                all_probabilities[i].extend(probs)
+    return all_probabilities
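
For orientation, here is a minimal usage sketch of the new utilities (illustrative, not part of the commit). It assumes a `BertMultiOutputModel` class (the name the `load_model_state` docstring uses as an example) plus prebuilt `data_df`, `label_encoders`, `train_loader`, and `val_loader` from the rest of the repo:

    import torch
    from torch.optim import AdamW
    from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE
    from train_utils import (initialize_criterions, train_model, evaluate_model,
                             summarize_metrics, save_model)

    # One output head per label column; head sizes come from the fitted encoders.
    num_labels = [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]
    model = BertMultiOutputModel(num_labels).to(DEVICE)  # hypothetical model class
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    field_criterions = initialize_criterions(data_df, label_encoders)

    for epoch in range(NUM_EPOCHS):
        avg_loss = train_model(model, train_loader, optimizer, field_criterions, epoch)
        print(f"Epoch {epoch + 1}: average training loss {avg_loss:.4f}")

    reports, truths, predictions = evaluate_model(model, val_loader)
    print(summarize_metrics(reports))
    save_model(model, "bert_multi_output")  # writes bert_multi_output_model.pth

The weighting scheme in `get_class_weights` gives each seen class the weight total_samples / (num_seen_classes * class_count), which matches scikit-learn's "balanced" heuristic over the seen classes, and assigns 1.0 to classes the encoder knows but the data lacks. A toy check under those assumptions:

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder
    from train_utils import get_class_weights

    df = pd.DataFrame({"priority": ["high", "low", "low", "low"]})
    le = LabelEncoder().fit(df["priority"])
    # 4 samples, 2 seen classes -> weights 4/(2*1) = 2.0 and 4/(2*3) ≈ 0.667
    print(get_class_weights(df, "priority", le))  # tensor([2.0000, 0.6667])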