subbunanepalli committed
Commit 4a9cc93 · verified · 1 Parent(s): 6df3606

Update train_utils.py

Files changed (1)
  1. train_utils.py +242 -93
train_utils.py CHANGED
@@ -1,78 +1,202 @@
+import torch
+import torch.nn as nn
+from torch.optim import AdamW
+from sklearn.metrics import classification_report
+from sklearn.utils.class_weight import compute_class_weight
+import numpy as np
+from tqdm import tqdm
+import pandas as pd
 import os
 import joblib
-import pandas as pd
-import numpy as np
-from sklearn.metrics import classification_report
-from sklearn.preprocessing import LabelEncoder
-from xgboost import XGBClassifier
-from config import LABEL_COLUMNS, MODEL_SAVE_DIR

-def train_xgb_models(X_train, y_train, label_encoders):
+from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE, MODEL_SAVE_DIR
+
+def get_class_weights(data_df, field, label_encoder):
+    """
+    Computes balanced class weights for a given target field.
+    These weights can be used in the loss function to mitigate class imbalance.
+    Args:
+        data_df (pd.DataFrame): The DataFrame containing the original (unencoded) label data.
+        field (str): The name of the label column for which to compute weights.
+        label_encoder (sklearn.preprocessing.LabelEncoder): The label encoder fitted for this field.
+    Returns:
+        torch.Tensor: A tensor of class weights for the specified field.
+    """
+    # Get the original labels for the specified field
+    y = data_df[field].values
+    # label_encoder.transform raises ValueError on unseen labels; fall back to seen labels below
+    try:
+        y_encoded = label_encoder.transform(y)
+    except ValueError as e:
+        print(f"Warning: {e}")
+        print("Using only seen labels for class weights calculation")
+        # Filter out unseen labels
+        seen_labels = set(label_encoder.classes_)
+        y_filtered = [label for label in y if label in seen_labels]
+        y_encoded = label_encoder.transform(y_filtered)
+
+    # Ensure y_encoded is integer type
+    y_encoded = y_encoded.astype(int)
+
+    # Initialize counts for all possible classes
+    n_classes = len(label_encoder.classes_)
+    class_counts = np.zeros(n_classes, dtype=int)
+
+    # Count occurrences of each class
+    for i in range(n_classes):
+        class_counts[i] = np.sum(y_encoded == i)
+
+    # Calculate weights for all classes
+    total_samples = len(y_encoded)
+    class_weights = np.ones(n_classes)  # Default weight of 1 for unseen classes
+    seen_classes = class_counts > 0
+    if np.any(seen_classes):
+        class_weights[seen_classes] = total_samples / (np.sum(seen_classes) * class_counts[seen_classes])
+
+    return torch.tensor(class_weights, dtype=torch.float)
+
+def initialize_criterions(data_df, label_encoders):
     """
-    Trains a separate XGBoost model for each label in a multi-output classification setup.
+    Initializes CrossEntropyLoss criteria for each label column, applying class weights.
     Args:
-        X_train (array): Feature matrix (TF-IDF).
-        y_train (DataFrame): DataFrame with one column per label.
-        label_encoders (dict): LabelEncoders for each label column.
+        data_df (pd.DataFrame): The original (unencoded) DataFrame. Used to compute class weights.
+        label_encoders (dict): Dictionary of LabelEncoder objects.
     Returns:
-        dict: Trained XGBoost models for each label.
+        dict: A dictionary where keys are label column names and values are
+              initialized `torch.nn.CrossEntropyLoss` objects.
     """
-    models = {}
-    for label in LABEL_COLUMNS:
-        y_encoded = label_encoders[label].transform(y_train[label])
-        model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', verbosity=0)
-        model.fit(X_train, y_encoded)
-        models[label] = model
-        print(f"Trained XGBoost model for {label}")
-    return models
+    field_criterions = {}
+    for field in LABEL_COLUMNS:
+        # Get class weights for the current field
+        weights = get_class_weights(data_df, field, label_encoders[field])
+        # Initialize CrossEntropyLoss with the computed weights and move to the device
+        field_criterions[field] = torch.nn.CrossEntropyLoss(weight=weights.to(DEVICE))
+    return field_criterions

-def evaluate_xgb_models(models, X_test, y_test, label_encoders):
+def train_model(model, loader, optimizer, field_criterions, epoch):
     """
-    Evaluates XGBoost models on the test data.
+    Trains the given PyTorch model for one epoch.
     Args:
-        models (dict): Trained models.
-        X_test (array): TF-IDF features.
-        y_test (DataFrame): Ground truth.
-        label_encoders (dict): Label encoders used to encode labels.
+        model (torch.nn.Module): The model to train.
+        loader (torch.utils.data.DataLoader): DataLoader for training data.
+        optimizer (torch.optim.Optimizer): Optimizer for model parameters.
+        field_criterions (dict): Dictionary of loss functions for each label.
+        epoch (int): Current epoch number (for progress bar description).
     Returns:
-        tuple: classification reports, ground truths, predictions
+        float: Average training loss for the epoch.
     """
-    reports = {}
-    predictions = []
-    truths = []
+    model.train()  # Set the model to training mode
+    total_loss = 0
+    # Use tqdm for a progress bar during training
+    tqdm_loader = tqdm(loader, desc=f"Epoch {epoch + 1} Training")
+
+    for batch in tqdm_loader:
+        # Unpack batch based on whether it contains metadata
+        if len(batch) == 2:  # Text-only models (inputs, labels)
+            inputs, labels = batch
+            input_ids = inputs['input_ids'].to(DEVICE)
+            attention_mask = inputs['attention_mask'].to(DEVICE)
+            labels = labels.to(DEVICE)
+            # Forward pass through the model
+            outputs = model(input_ids, attention_mask)
+        elif len(batch) == 3:  # Text + metadata models (inputs, metadata, labels)
+            inputs, metadata, labels = batch
+            input_ids = inputs['input_ids'].to(DEVICE)
+            attention_mask = inputs['attention_mask'].to(DEVICE)
+            metadata = metadata.to(DEVICE)
+            labels = labels.to(DEVICE)
+            # Forward pass through the hybrid model
+            outputs = model(input_ids, attention_mask, metadata)
+        else:
+            raise ValueError("Unsupported batch format. Expected 2 or 3 items in batch.")

-    for label in LABEL_COLUMNS:
-        model = models[label]
-        y_true = label_encoders[label].transform(y_test[label])
-        y_pred = model.predict(X_test)
+        loss = 0
+        # Calculate total loss by summing loss for each label column
+        # `outputs` is a list of logits, one for each label column
+        for i, output_logits in enumerate(outputs):
+            # `labels[:, i]` gets the true labels for the i-th label column
+            # `field_criterions[LABEL_COLUMNS[i]]` selects the appropriate loss function
+            loss += field_criterions[LABEL_COLUMNS[i]](output_logits, labels[:, i])

-        truths.append(y_true)
-        predictions.append(y_pred)
+        optimizer.zero_grad()  # Clear previous gradients
+        loss.backward()  # Backpropagation
+        optimizer.step()  # Update model parameters
+        total_loss += loss.item()  # Accumulate loss
+        tqdm_loader.set_postfix(loss=loss.item())  # Update progress bar with current batch loss

+    return total_loss / len(loader)  # Return average loss for the epoch
+
+def evaluate_model(model, loader):
+    """
+    Evaluates the given PyTorch model on a validation/test set.
+    Args:
+        model (torch.nn.Module): The model to evaluate.
+        loader (torch.utils.data.DataLoader): DataLoader for evaluation data.
+    Returns:
+        tuple: A tuple containing:
+            - reports (dict): Classification reports (dict format) for each label column.
+            - truths (list): List of true label arrays for each label column.
+            - predictions (list): List of predicted label arrays for each label column.
+    """
+    model.eval()  # Set the model to evaluation mode (disables dropout, batch norm updates, etc.)
+    # Initialize lists to store predictions and true labels for each output head
+    predictions = [[] for _ in range(len(LABEL_COLUMNS))]
+    truths = [[] for _ in range(len(LABEL_COLUMNS))]
+
+    with torch.no_grad():  # Disable gradient calculations during evaluation for efficiency
+        for batch in tqdm(loader, desc="Evaluation"):
+            if len(batch) == 2:
+                inputs, labels = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                labels = labels.to(DEVICE)
+                outputs = model(input_ids, attention_mask)
+            elif len(batch) == 3:
+                inputs, metadata, labels = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                metadata = metadata.to(DEVICE)
+                labels = labels.to(DEVICE)
+                outputs = model(input_ids, attention_mask, metadata)
+            else:
+                raise ValueError("Unsupported batch format.")
+
+            for i, output_logits in enumerate(outputs):
+                # Get the predicted class by taking the argmax of the logits
+                preds = torch.argmax(output_logits, dim=1).cpu().numpy()
+                predictions[i].extend(preds)
+                # Get the true labels for the current output head
+                truths[i].extend(labels[:, i].cpu().numpy())
+
+    reports = {}
+    # Generate classification report for each label column
+    for i, col in enumerate(LABEL_COLUMNS):
         try:
-            reports[label] = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
+            # `zero_division=0` handles cases where a class might have no true or predicted samples
+            reports[col] = classification_report(truths[i], predictions[i], output_dict=True, zero_division=0)
         except ValueError:
-            print(f"Skipping classification report for {label} due to error.")
-            reports[label] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
-
+            # Handle cases where a label might not appear in the validation set,
+            # which could cause classification_report to fail.
+            print(f"Warning: Could not generate classification report for {col}. Skipping.")
+            reports[col] = {'accuracy': 0, 'weighted avg': {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}}
     return reports, truths, predictions

 def summarize_metrics(metrics):
     """
     Summarizes classification reports into a readable Pandas DataFrame.
     Args:
-        metrics (dict): Dictionary of classification reports.
+        metrics (dict): Dictionary of classification reports, as returned by `evaluate_model`.
     Returns:
-        pd.DataFrame: Summary metrics per label.
+        pd.DataFrame: A DataFrame summarizing precision, recall, f1-score, accuracy, and support for each field.
     """
     summary = []
     for field, report in metrics.items():
-        precision = report['weighted avg']['precision']
-        recall = report['weighted avg']['recall']
-        f1 = report['weighted avg']['f1-score']
-        support = report['weighted avg']['support']
-        accuracy = report.get('accuracy', 0)
-
+        # Safely get metrics, defaulting to 0 if not present (e.g., for empty reports)
+        precision = report['weighted avg']['precision'] if 'weighted avg' in report else 0
+        recall = report['weighted avg']['recall'] if 'weighted avg' in report else 0
+        f1 = report['weighted avg']['f1-score'] if 'weighted avg' in report else 0
+        support = report['weighted avg']['support'] if 'weighted avg' in report else 0
+        accuracy = report['accuracy'] if 'accuracy' in report else 0  # Accuracy is usually top-level
         summary.append({
             "Field": field,
             "Precision": precision,
@@ -83,62 +207,87 @@ def summarize_metrics(metrics):
         })
     return pd.DataFrame(summary)

-def save_xgb_models(models):
+def save_model(model, model_name, save_format='pth'):
     """
-    Saves each XGBoost model to disk using joblib.
+    Saves the state dictionary of a PyTorch model.
     Args:
-        models (dict): Dictionary of trained models.
+        model (torch.nn.Module): The trained PyTorch model.
+        model_name (str): A descriptive name for the model (used for filename).
+        save_format (str): Format to save the model in ('pth' for PyTorch models, 'pickle' for traditional ML models).
     """
-    os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
-    for label, model in models.items():
-        model_path = os.path.join(MODEL_SAVE_DIR, f"{label}_xgb_model.pkl")
+    # Construct the save path dynamically relative to the project root
+    if save_format == 'pth':
+        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
+        torch.save(model.state_dict(), model_path)
+    elif save_format == 'pickle':
+        model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}.pkl")
         joblib.dump(model, model_path)
-        print(f"Saved model for {label} to {model_path}")
+    else:
+        raise ValueError(f"Unsupported save format: {save_format}")
+
+    print(f"Model saved to {model_path}")

-def load_xgb_models():
+def load_model_state(model, model_name, model_class, num_labels, metadata_dim=0):
     """
-    Loads XGBoost models from disk.
+    Loads the state dictionary into a PyTorch model.
+    Args:
+        model (torch.nn.Module): An initialized model instance (architecture).
+        model_name (str): The name of the model to load.
+        model_class (class): The class of the model (e.g., BertMultiOutputModel).
+        num_labels (list): List of number of classes for each label.
+        metadata_dim (int): Dimensionality of metadata features, if applicable (default 0 for text-only).
     Returns:
-        dict: Loaded models.
-    """
-    models = {}
-    for label in LABEL_COLUMNS:
-        model_path = os.path.join(MODEL_SAVE_DIR, f"{label}_xgb_model.pkl")
-        if os.path.exists(model_path):
-            models[label] = joblib.load(model_path)
-            print(f"Loaded model for {label}")
+        torch.nn.Module: The model with loaded state_dict, moved to the correct device, and set to eval mode.
+    """
+    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")
+    if not os.path.exists(model_path):
+        print(f"Warning: Model file not found at {model_path}. Returning a newly initialized model instance.")
+        # Re-initialize the model if not found, to ensure it has the correct architecture
+        if metadata_dim > 0:
+            return model_class(num_labels, metadata_dim=metadata_dim).to(DEVICE)
         else:
-            print(f"Model not found: {model_path}")
-    return models
+            return model_class(num_labels).to(DEVICE)
+
+    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+    model.to(DEVICE)
+    model.eval()  # Set to evaluation mode after loading
+    print(f"Model loaded from {model_path}")
+    return model

-def predict_xgb(models, X):
+def predict_probabilities(model, loader):
     """
-    Predicts classes for each label using trained models.
+    Generates prediction probabilities for each label for a given model.
+    This is used for confidence scoring and feeding into a voting ensemble.
     Args:
-        models (dict): Dictionary of models.
-        X (array): TF-IDF input features.
+        model (torch.nn.Module): The trained PyTorch model.
+        loader (torch.utils.data.DataLoader): DataLoader for the data to predict on.
     Returns:
-        dict: Predictions per label.
+        list: A list of lists of numpy arrays. Each inner list corresponds to a label column,
+              containing the softmax probabilities for each sample for that label.
     """
-    predictions = {}
-    for label in LABEL_COLUMNS:
-        model = models[label]
-        preds = model.predict(X)
-        predictions[label] = preds
-    return predictions
+    model.eval()  # Set to evaluation mode
+    # List to store probabilities for each output head
+    all_probabilities = [[] for _ in range(len(LABEL_COLUMNS))]

-def predict_xgb_proba(models, X):
-    """
-    Predicts probabilities for each label using trained models.
-    Args:
-        models (dict): Dictionary of models.
-        X (array): TF-IDF features.
-    Returns:
-        dict: Predicted probabilities per label.
-    """
-    probabilities = {}
-    for label in LABEL_COLUMNS:
-        model = models[label]
-        probs = model.predict_proba(X)
-        probabilities[label] = probs
-    return probabilities
+    with torch.no_grad():
+        for batch in tqdm(loader, desc="Predicting Probabilities"):
+            # Unpack batch, ignoring labels as we only need inputs
+            if len(batch) == 2:
+                inputs, _ = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                outputs = model(input_ids, attention_mask)
+            elif len(batch) == 3:
+                inputs, metadata, _ = batch
+                input_ids = inputs['input_ids'].to(DEVICE)
+                attention_mask = inputs['attention_mask'].to(DEVICE)
+                metadata = metadata.to(DEVICE)
+                outputs = model(input_ids, attention_mask, metadata)
+            else:
+                raise ValueError("Unsupported batch format.")
+
+            for i, out_logits in enumerate(outputs):
+                # Apply softmax to logits to get probabilities
+                probs = torch.softmax(out_logits, dim=1).cpu().numpy()
+                all_probabilities[i].extend(probs)
+    return all_probabilities
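
For orientation, here is a minimal usage sketch of the new utilities (illustrative, not part of the commit). It assumes a `BertMultiOutputModel` class (the name the `load_model_state` docstring uses as an example) plus prebuilt `data_df`, `label_encoders`, `train_loader`, and `val_loader` from the rest of the repo:

    import torch
    from torch.optim import AdamW
    from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE
    from train_utils import (initialize_criterions, train_model, evaluate_model,
                             summarize_metrics, save_model)

    # One output head per label column; head sizes come from the fitted encoders.
    num_labels = [len(label_encoders[col].classes_) for col in LABEL_COLUMNS]
    model = BertMultiOutputModel(num_labels).to(DEVICE)  # hypothetical model class
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    field_criterions = initialize_criterions(data_df, label_encoders)

    for epoch in range(NUM_EPOCHS):
        avg_loss = train_model(model, train_loader, optimizer, field_criterions, epoch)
        print(f"Epoch {epoch + 1}: average training loss {avg_loss:.4f}")

    reports, truths, predictions = evaluate_model(model, val_loader)
    print(summarize_metrics(reports))
    save_model(model, "bert_multi_output")  # writes bert_multi_output_model.pth

The weighting scheme in `get_class_weights` gives each seen class the weight total_samples / (num_seen_classes * class_count), which matches scikit-learn's "balanced" heuristic over the seen classes, and assigns 1.0 to classes the encoder knows but the data lacks. A toy check under those assumptions:

    import pandas as pd
    from sklearn.preprocessing import LabelEncoder
    from train_utils import get_class_weights

    df = pd.DataFrame({"priority": ["high", "low", "low", "low"]})
    le = LabelEncoder().fit(df["priority"])
    # 4 samples, 2 seen classes -> weights 4/(2*1) = 2.0 and 4/(2*3) ≈ 0.667
    print(get_class_weights(df, "priority", le))  # tensor([2.0000, 0.6667])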