import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from tqdm import tqdm
import pandas as pd
import os
import joblib

from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE, MODEL_SAVE_DIR


def get_class_weights(data_df, field, label_encoder):
    """
    Computes balanced class weights for a given target field.
    These weights can be used in the loss function to mitigate class imbalance.

    Args:
        data_df (pd.DataFrame): The DataFrame containing the original (unencoded) label data.
        field (str): The name of the label column for which to compute weights.
        label_encoder (sklearn.preprocessing.LabelEncoder): The label encoder fitted for this field.

    Returns:
        torch.Tensor: A tensor of class weights for the specified field.
    """
    y = data_df[field].values

    try:
        y_encoded = label_encoder.transform(y)
    except ValueError as e:
        # The data contains labels the encoder never saw during fitting; fall back to the seen subset.
        print(f"Warning: {e}")
        print("Using only seen labels for the class weight calculation.")
        seen_labels = set(label_encoder.classes_)
        y_filtered = [label for label in y if label in seen_labels]
        y_encoded = label_encoder.transform(y_filtered)

    y_encoded = y_encoded.astype(int)

    # Count how many samples fall into each encoded class.
    n_classes = len(label_encoder.classes_)
    class_counts = np.zeros(n_classes, dtype=int)
    for i in range(n_classes):
        class_counts[i] = np.sum(y_encoded == i)

    # "Balanced" weighting: weight_c = total_samples / (n_seen_classes * count_c).
    # Classes with no samples keep a neutral weight of 1.
    total_samples = len(y_encoded)
    class_weights = np.ones(n_classes)
    seen_classes = class_counts > 0
    if np.any(seen_classes):
        class_weights[seen_classes] = total_samples / (np.sum(seen_classes) * class_counts[seen_classes])

    return torch.tensor(class_weights, dtype=torch.float)
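
# Worked example of the weighting above (illustrative numbers only): with 10 samples
# split 8/2 across two seen classes, weight = total / (n_seen_classes * count) gives
# 10 / (2 * 8) = 0.625 for the majority class and 10 / (2 * 2) = 2.5 for the minority
# class, so under-represented classes contribute proportionally more to the loss.
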
def initialize_criterions(data_df, label_encoders):
    """
    Initializes CrossEntropyLoss criteria for each label column, applying class weights.

    Args:
        data_df (pd.DataFrame): The original (unencoded) DataFrame, used to compute class weights.
        label_encoders (dict): Dictionary of fitted LabelEncoder objects, keyed by label column.

    Returns:
        dict: A dictionary mapping each label column name to an initialized
            `torch.nn.CrossEntropyLoss` with per-class weights.
    """
    field_criterions = {}
    for field in LABEL_COLUMNS:
        weights = get_class_weights(data_df, field, label_encoders[field])
        field_criterions[field] = nn.CrossEntropyLoss(weight=weights.to(DEVICE))
    return field_criterions


def train_model(model, loader, optimizer, field_criterions, epoch):
    """
    Trains the given PyTorch model for one epoch.

    Args:
        model (torch.nn.Module): The model to train.
        loader (torch.utils.data.DataLoader): DataLoader for training data.
        optimizer (torch.optim.Optimizer): Optimizer for model parameters.
        field_criterions (dict): Dictionary of loss functions, one per label column.
        epoch (int): Current epoch number (used in the progress bar description).

    Returns:
        float: Average training loss for the epoch.
    """
    model.train()
    total_loss = 0

    tqdm_loader = tqdm(loader, desc=f"Epoch {epoch + 1} Training")

    for batch in tqdm_loader:
        # Batches are either (inputs, labels) for text-only models or
        # (inputs, metadata, labels) for models that also consume metadata features.
        if len(batch) == 2:
            inputs, labels = batch
            input_ids = inputs['input_ids'].to(DEVICE)
            attention_mask = inputs['attention_mask'].to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(input_ids, attention_mask)
        elif len(batch) == 3:
            inputs, metadata, labels = batch
            input_ids = inputs['input_ids'].to(DEVICE)
            attention_mask = inputs['attention_mask'].to(DEVICE)
            metadata = metadata.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(input_ids, attention_mask, metadata)
        else:
            raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

        # Sum the per-field losses: outputs[i] holds the logits for LABEL_COLUMNS[i],
        # and labels[:, i] holds the corresponding encoded targets.
        loss = 0
        for i, output_logits in enumerate(outputs):
            loss += field_criterions[LABEL_COLUMNS[i]](output_logits, labels[:, i])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        tqdm_loader.set_postfix(loss=loss.item())

    return total_loss / len(loader)
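
# Illustrative usage sketch (assumed setup, not part of the original pipeline): a full
# training run built from the helpers above. `model`, `train_loader`, the original
# `train_df`, and the fitted `label_encoders` are assumed to be created elsewhere in
# this project; NUM_EPOCHS and LEARNING_RATE come from config.
def _example_training_run(model, train_loader, train_df, label_encoders):
    """Hypothetical end-to-end training loop using initialize_criterions and train_model."""
    model.to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    field_criterions = initialize_criterions(train_df, label_encoders)
    for epoch in range(NUM_EPOCHS):
        avg_loss = train_model(model, train_loader, optimizer, field_criterions, epoch)
        print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - average training loss: {avg_loss:.4f}")
    return model
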
def evaluate_model(model, loader):
    """
    Evaluates the given PyTorch model on a validation/test set.

    Args:
        model (torch.nn.Module): The model to evaluate.
        loader (torch.utils.data.DataLoader): DataLoader for evaluation data.

    Returns:
        tuple: A tuple containing:
            - reports (dict): Classification reports (dict format) for each label column.
            - truths (list): List of true label arrays for each label column.
            - predictions (list): List of predicted label arrays for each label column.
    """
    model.eval()

    predictions = [[] for _ in range(len(LABEL_COLUMNS))]
    truths = [[] for _ in range(len(LABEL_COLUMNS))]

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluation"):
            if len(batch) == 2:
                inputs, labels = batch
                input_ids = inputs['input_ids'].to(DEVICE)
                attention_mask = inputs['attention_mask'].to(DEVICE)
                labels = labels.to(DEVICE)
                outputs = model(input_ids, attention_mask)
            elif len(batch) == 3:
                inputs, metadata, labels = batch
                input_ids = inputs['input_ids'].to(DEVICE)
                attention_mask = inputs['attention_mask'].to(DEVICE)
                metadata = metadata.to(DEVICE)
                labels = labels.to(DEVICE)
                outputs = model(input_ids, attention_mask, metadata)
            else:
                raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

            # Collect the argmax prediction and the true label for every label column.
            for i, output_logits in enumerate(outputs):
                preds = torch.argmax(output_logits, dim=1).cpu().numpy()
                predictions[i].extend(preds)
                truths[i].extend(labels[:, i].cpu().numpy())

    # Build one classification report per label column.
    reports = {}
    for i, col in enumerate(LABEL_COLUMNS):
        try:
            reports[col] = classification_report(truths[i], predictions[i], output_dict=True, zero_division=0)
        except ValueError:
            print(f"Warning: Could not generate classification report for {col}. Skipping.")
            reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}

    return reports, truths, predictions


def summarize_metrics(metrics):
    """
    Summarizes classification reports into a readable Pandas DataFrame.

    Args:
        metrics (dict): Dictionary of classification reports, as returned by `evaluate_model`.

    Returns:
        pd.DataFrame: A DataFrame summarizing precision, recall, F1-score, accuracy, and support for each field.
    """
    summary = []
    for field, report in metrics.items():
        # Fall back to 0 when a report is missing the weighted average or accuracy entries.
        weighted_avg = report.get('weighted avg', {})
        summary.append({
            "Field": field,
            "Precision": weighted_avg.get('precision', 0),
            "Recall": weighted_avg.get('recall', 0),
            "F1-Score": weighted_avg.get('f1-score', 0),
            "Accuracy": report.get('accuracy', 0),
            "Support": weighted_avg.get('support', 0)
        })
    return pd.DataFrame(summary)
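
# Illustrative usage sketch (assumed names): evaluating a trained model on a validation
# DataLoader and printing the per-field summary. `val_loader` is a placeholder for a
# DataLoader built elsewhere in this project.
def _example_validation_report(model, val_loader):
    """Hypothetical evaluation step: run evaluate_model, then summarize the reports."""
    reports, _, _ = evaluate_model(model, val_loader)
    summary_df = summarize_metrics(reports)
    print(summary_df.to_string(index=False))
    return summary_df
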
def save_model(model, model_name, save_format='pth'):
    """
    Saves a trained model to MODEL_SAVE_DIR.

    Args:
        model (torch.nn.Module): The trained PyTorch model.
        model_name (str): A descriptive name for the model (used for the filename).
        save_format (str): Format to save the model in ('pth' for PyTorch state dicts,
            'pickle' for traditional ML models saved via joblib).
    """
    # Make sure the target directory exists before writing.
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    if save_format == 'pth':
        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
        torch.save(model.state_dict(), model_path)
    elif save_format == 'pickle':
        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(model, model_path)
    else:
        raise ValueError(f"Unsupported save format: {save_format}")

    print(f"Model saved to {model_path}")


def load_model_state(model, model_name, model_class, num_labels, metadata_dim=0):
    """
    Loads a saved state dictionary into a PyTorch model.

    Args:
        model (torch.nn.Module): An initialized model instance (architecture).
        model_name (str): The name of the model to load.
        model_class (class): The class of the model (e.g., BertMultiOutputModel).
        num_labels (list): Number of classes for each label column.
        metadata_dim (int): Dimensionality of metadata features, if applicable (default 0 for text-only).

    Returns:
        torch.nn.Module: The model with the loaded state_dict, moved to the correct device
        and set to eval mode.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
    if not os.path.exists(model_path):
        # No saved weights found: return a freshly initialized model instead of failing.
        print(f"Warning: Model file not found at {model_path}. Returning a newly initialized model instance.")
        if metadata_dim > 0:
            return model_class(num_labels, metadata_dim=metadata_dim).to(DEVICE)
        return model_class(num_labels).to(DEVICE)

    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()
    print(f"Model loaded from {model_path}")
    return model
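
# Illustrative usage sketch (assumed names, not part of the original pipeline): saving a
# fine-tuned model and reloading its weights for inference. The model class and the
# per-field label counts are passed in rather than assumed.
def _example_save_and_reload(trained_model, model_name, model_class, num_labels, metadata_dim=0):
    """Hypothetical save/reload round trip using save_model and load_model_state."""
    save_model(trained_model, model_name)
    if metadata_dim > 0:
        fresh_model = model_class(num_labels, metadata_dim=metadata_dim)
    else:
        fresh_model = model_class(num_labels)
    return load_model_state(fresh_model, model_name, model_class, num_labels, metadata_dim)
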
def predict_probabilities(model, loader):
    """
    Generates prediction probabilities for each label for a given model.
    These are used for confidence scoring and for feeding into a voting ensemble.

    Args:
        model (torch.nn.Module): The trained PyTorch model.
        loader (torch.utils.data.DataLoader): DataLoader for the data to predict on.

    Returns:
        list: A list of lists of numpy arrays. Each inner list corresponds to a label column
        and contains the softmax probabilities for each sample for that label.
    """
    model.eval()

    all_probabilities = [[] for _ in range(len(LABEL_COLUMNS))]

    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting Probabilities"):
            # Labels are ignored here; only the inputs (and optional metadata) are used.
            if len(batch) == 2:
                inputs, _ = batch
                input_ids = inputs['input_ids'].to(DEVICE)
                attention_mask = inputs['attention_mask'].to(DEVICE)
                outputs = model(input_ids, attention_mask)
            elif len(batch) == 3:
                inputs, metadata, _ = batch
                input_ids = inputs['input_ids'].to(DEVICE)
                attention_mask = inputs['attention_mask'].to(DEVICE)
                metadata = metadata.to(DEVICE)
                outputs = model(input_ids, attention_mask, metadata)
            else:
                raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

            # Convert per-field logits to softmax probabilities.
            for i, output_logits in enumerate(outputs):
                probs = torch.softmax(output_logits, dim=1).cpu().numpy()
                all_probabilities[i].extend(probs)

    return all_probabilities
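
# Illustrative soft-voting sketch (assumed setup): average the per-label probability
# matrices from two trained models and take the argmax per sample. The models and the
# DataLoader are assumed to come from elsewhere in this project.
def _example_soft_voting(model_a, model_b, loader):
    """Hypothetical ensemble: averages the softmax outputs of two models per label column."""
    probs_a = predict_probabilities(model_a, loader)
    probs_b = predict_probabilities(model_b, loader)
    ensembled_predictions = []
    for i in range(len(LABEL_COLUMNS)):
        avg = (np.asarray(probs_a[i]) + np.asarray(probs_b[i])) / 2.0
        ensembled_predictions.append(np.argmax(avg, axis=1))
    return ensembled_predictions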