import torch
import torch.nn as nn
from torch.optim import AdamW
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from tqdm import tqdm
import pandas as pd
import os
import joblib

from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE, MODEL_SAVE_DIR


def get_class_weights(data_df, field, label_encoder):
    """
    Computes balanced class weights for a given target field.
    These weights can be used in the loss function to mitigate class imbalance.

    Args:
        data_df (pd.DataFrame): The DataFrame containing the original (unencoded) label data.
        field (str): The name of the label column for which to compute weights.
        label_encoder (sklearn.preprocessing.LabelEncoder): The label encoder fitted for this field.

    Returns:
        torch.Tensor: A tensor of class weights for the specified field.
    """
    y = data_df[field].values

    try:
        y_encoded = label_encoder.transform(y)
    except ValueError as e:
        # The data contains labels the encoder never saw during fitting; fall back to the seen subset.
        print(f"Warning: {e}")
        print("Using only seen labels for the class weight calculation.")
        seen_labels = set(label_encoder.classes_)
        y_filtered = [label for label in y if label in seen_labels]
        y_encoded = label_encoder.transform(y_filtered)

    y_encoded = y_encoded.astype(int)

    # Count how many samples fall into each encoded class.
    n_classes = len(label_encoder.classes_)
    class_counts = np.zeros(n_classes, dtype=int)
    for i in range(n_classes):
        class_counts[i] = np.sum(y_encoded == i)

    # "Balanced" weighting: weight_c = total_samples / (n_seen_classes * count_c).
    # Classes with no samples keep a neutral weight of 1.
    total_samples = len(y_encoded)
    class_weights = np.ones(n_classes)
    seen_classes = class_counts > 0
    if np.any(seen_classes):
        class_weights[seen_classes] = total_samples / (np.sum(seen_classes) * class_counts[seen_classes])

    return torch.tensor(class_weights, dtype=torch.float)
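
# Worked example of the weighting above (illustrative numbers only): with 10 samples
# split 8/2 across two seen classes, weight = total / (n_seen_classes * count) gives
# 10 / (2 * 8) = 0.625 for the majority class and 10 / (2 * 2) = 2.5 for the minority
# class, so under-represented classes contribute proportionally more to the loss.
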
def initialize_criterions(data_df, label_encoders):
    """
    Initializes CrossEntropyLoss criteria for each label column, applying class weights.

    Args:
        data_df (pd.DataFrame): The original (unencoded) DataFrame, used to compute class weights.
        label_encoders (dict): Dictionary of fitted LabelEncoder objects, keyed by label column.

    Returns:
        dict: A dictionary mapping each label column name to an initialized
            `torch.nn.CrossEntropyLoss` with per-class weights.
    """
    field_criterions = {}
    for field in LABEL_COLUMNS:
        weights = get_class_weights(data_df, field, label_encoders[field])
        field_criterions[field] = nn.CrossEntropyLoss(weight=weights.to(DEVICE))
    return field_criterions


def train_model(model, loader, optimizer, field_criterions, epoch):
    """
    Trains the given PyTorch model for one epoch.

    Args:
        model (torch.nn.Module): The model to train.
        loader (torch.utils.data.DataLoader): DataLoader for training data.
        optimizer (torch.optim.Optimizer): Optimizer for model parameters.
        field_criterions (dict): Dictionary of loss functions, one per label column.
        epoch (int): Current epoch number (used in the progress bar description).

    Returns:
        float: Average training loss for the epoch.
    """
    model.train()
    total_loss = 0

    tqdm_loader = tqdm(loader, desc=f"Epoch {epoch + 1} Training")

    for batch in tqdm_loader:
        # Batches are either (inputs, labels) for text-only models or
        # (inputs, metadata, labels) for models that also consume metadata features.
        if len(batch) == 2:
            inputs, labels = batch
            input_ids = inputs['input_ids'].to(DEVICE)
            attention_mask = inputs['attention_mask'].to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(input_ids, attention_mask)
        elif len(batch) == 3:
            inputs, metadata, labels = batch
            input_ids = inputs['input_ids'].to(DEVICE)
            attention_mask = inputs['attention_mask'].to(DEVICE)
            metadata = metadata.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(input_ids, attention_mask, metadata)
        else:
            raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

        # Sum the per-field losses: outputs[i] holds the logits for LABEL_COLUMNS[i],
        # and labels[:, i] holds the corresponding encoded targets.
        loss = 0
        for i, output_logits in enumerate(outputs):
            loss += field_criterions[LABEL_COLUMNS[i]](output_logits, labels[:, i])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        tqdm_loader.set_postfix(loss=loss.item())

    return total_loss / len(loader)
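
# Illustrative usage sketch (assumed setup, not part of the original pipeline): a full
# training run built from the helpers above. `model`, `train_loader`, the original
# `train_df`, and the fitted `label_encoders` are assumed to be created elsewhere in
# this project; NUM_EPOCHS and LEARNING_RATE come from config.
def _example_training_run(model, train_loader, train_df, label_encoders):
    """Hypothetical end-to-end training loop using initialize_criterions and train_model."""
    model.to(DEVICE)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    field_criterions = initialize_criterions(train_df, label_encoders)
    for epoch in range(NUM_EPOCHS):
        avg_loss = train_model(model, train_loader, optimizer, field_criterions, epoch)
        print(f"Epoch {epoch + 1}/{NUM_EPOCHS} - average training loss: {avg_loss:.4f}")
    return model
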
def evaluate_model(model, loader):
    """
    Evaluates the given PyTorch model on a validation/test set.

    Args:
        model (torch.nn.Module): The model to evaluate.
        loader (torch.utils.data.DataLoader): DataLoader for evaluation data.

    Returns:
        tuple: A tuple containing:
            - reports (dict): Classification reports (dict format) for each label column.
            - truths (list): List of true label arrays for each label column.
            - predictions (list): List of predicted label arrays for each label column.
    """
    model.eval()

    predictions = [[] for _ in range(len(LABEL_COLUMNS))]
    truths = [[] for _ in range(len(LABEL_COLUMNS))]

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluation"):
            if len(batch) == 2:
                inputs, labels = batch
                input_ids = inputs['input_ids'].to(DEVICE)
                attention_mask = inputs['attention_mask'].to(DEVICE)
                labels = labels.to(DEVICE)
                outputs = model(input_ids, attention_mask)
            elif len(batch) == 3:
                inputs, metadata, labels = batch
                input_ids = inputs['input_ids'].to(DEVICE)
                attention_mask = inputs['attention_mask'].to(DEVICE)
                metadata = metadata.to(DEVICE)
                labels = labels.to(DEVICE)
                outputs = model(input_ids, attention_mask, metadata)
            else:
                raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

            # Collect the argmax prediction and the true label for every label column.
            for i, output_logits in enumerate(outputs):
                preds = torch.argmax(output_logits, dim=1).cpu().numpy()
                predictions[i].extend(preds)
                truths[i].extend(labels[:, i].cpu().numpy())

    # Build one classification report per label column.
    reports = {}
    for i, col in enumerate(LABEL_COLUMNS):
        try:
            reports[col] = classification_report(truths[i], predictions[i], output_dict=True, zero_division=0)
        except ValueError:
            print(f"Warning: Could not generate classification report for {col}. Skipping.")
            reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}

    return reports, truths, predictions


def summarize_metrics(metrics):
    """
    Summarizes classification reports into a readable Pandas DataFrame.

    Args:
        metrics (dict): Dictionary of classification reports, as returned by `evaluate_model`.

    Returns:
        pd.DataFrame: A DataFrame summarizing precision, recall, F1-score, accuracy, and support for each field.
    """
    summary = []
    for field, report in metrics.items():
        # Fall back to 0 when a report is missing the weighted average or accuracy entries.
        weighted_avg = report.get('weighted avg', {})
        summary.append({
            "Field": field,
            "Precision": weighted_avg.get('precision', 0),
            "Recall": weighted_avg.get('recall', 0),
            "F1-Score": weighted_avg.get('f1-score', 0),
            "Accuracy": report.get('accuracy', 0),
            "Support": weighted_avg.get('support', 0)
        })
    return pd.DataFrame(summary)
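
# Illustrative usage sketch (assumed names): evaluating a trained model on a validation
# DataLoader and printing the per-field summary. `val_loader` is a placeholder for a
# DataLoader built elsewhere in this project.
def _example_validation_report(model, val_loader):
    """Hypothetical evaluation step: run evaluate_model, then summarize the reports."""
    reports, _, _ = evaluate_model(model, val_loader)
    summary_df = summarize_metrics(reports)
    print(summary_df.to_string(index=False))
    return summary_df
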
def save_model(model, model_name, save_format='pth'):
    """
    Saves a trained model to MODEL_SAVE_DIR.

    Args:
        model (torch.nn.Module): The trained PyTorch model.
        model_name (str): A descriptive name for the model (used for the filename).
        save_format (str): Format to save the model in ('pth' for PyTorch state dicts,
            'pickle' for traditional ML models saved via joblib).
    """
    # Make sure the target directory exists before writing.
    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    if save_format == 'pth':
        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
        torch.save(model.state_dict(), model_path)
    elif save_format == 'pickle':
        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
        joblib.dump(model, model_path)
    else:
        raise ValueError(f"Unsupported save format: {save_format}")

    print(f"Model saved to {model_path}")


def load_model_state(model, model_name, model_class, num_labels, metadata_dim=0):
    """
    Loads a saved state dictionary into a PyTorch model.

    Args:
        model (torch.nn.Module): An initialized model instance (architecture).
        model_name (str): The name of the model to load.
        model_class (class): The class of the model (e.g., BertMultiOutputModel).
        num_labels (list): Number of classes for each label column.
        metadata_dim (int): Dimensionality of metadata features, if applicable (default 0 for text-only).

    Returns:
        torch.nn.Module: The model with the loaded state_dict, moved to the correct device
        and set to eval mode.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
    if not os.path.exists(model_path):
        # No saved weights found: return a freshly initialized model instead of failing.
        print(f"Warning: Model file not found at {model_path}. Returning a newly initialized model instance.")
        if metadata_dim > 0:
            return model_class(num_labels, metadata_dim=metadata_dim).to(DEVICE)
        return model_class(num_labels).to(DEVICE)

    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()
    print(f"Model loaded from {model_path}")
    return model
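
# Illustrative usage sketch (assumed names, not part of the original pipeline): saving a
# fine-tuned model and reloading its weights for inference. The model class and the
# per-field label counts are passed in rather than assumed.
def _example_save_and_reload(trained_model, model_name, model_class, num_labels, metadata_dim=0):
    """Hypothetical save/reload round trip using save_model and load_model_state."""
    save_model(trained_model, model_name)
    if metadata_dim > 0:
        fresh_model = model_class(num_labels, metadata_dim=metadata_dim)
    else:
        fresh_model = model_class(num_labels)
    return load_model_state(fresh_model, model_name, model_class, num_labels, metadata_dim)
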
def predict_probabilities(model, loader):
    """
    Generates prediction probabilities for each label for a given model.
    These are used for confidence scoring and for feeding into a voting ensemble.

    Args:
        model (torch.nn.Module): The trained PyTorch model.
        loader (torch.utils.data.DataLoader): DataLoader for the data to predict on.

    Returns:
        list: A list of lists of numpy arrays. Each inner list corresponds to a label column
        and contains the softmax probabilities for each sample for that label.
    """
    model.eval()

    all_probabilities = [[] for _ in range(len(LABEL_COLUMNS))]

    with torch.no_grad():
        for batch in tqdm(loader, desc="Predicting Probabilities"):
            # Labels are ignored here; only the inputs (and optional metadata) are used.
            if len(batch) == 2:
                inputs, _ = batch
                input_ids = inputs['input_ids'].to(DEVICE)
                attention_mask = inputs['attention_mask'].to(DEVICE)
                outputs = model(input_ids, attention_mask)
            elif len(batch) == 3:
                inputs, metadata, _ = batch
                input_ids = inputs['input_ids'].to(DEVICE)
                attention_mask = inputs['attention_mask'].to(DEVICE)
                metadata = metadata.to(DEVICE)
                outputs = model(input_ids, attention_mask, metadata)
            else:
                raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

            # Convert per-field logits to softmax probabilities.
            for i, output_logits in enumerate(outputs):
                probs = torch.softmax(output_logits, dim=1).cpu().numpy()
                all_probabilities[i].extend(probs)

    return all_probabilities
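
# Illustrative soft-voting sketch (assumed setup): average the per-label probability
# matrices from two trained models and take the argmax per sample. The models and the
# DataLoader are assumed to come from elsewhere in this project.
def _example_soft_voting(model_a, model_b, loader):
    """Hypothetical ensemble: averages the softmax outputs of two models per label column."""
    probs_a = predict_probabilities(model_a, loader)
    probs_b = predict_probabilities(model_b, loader)
    ensembled_predictions = []
    for i in range(len(LABEL_COLUMNS)):
        avg = (np.asarray(probs_a[i]) + np.asarray(probs_b[i])) / 2.0
        ensembled_predictions.append(np.argmax(avg, axis=1))
    return ensembled_predictions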