subbunanepalli committed on
Commit
80abfe1
·
verified ·
1 Parent(s): 31f3076

Create train_utils.py

Browse files
Files changed (1) hide show
  1. train_utils.py +210 -0
train_utils.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from torch.optim import AdamW
4
+ from sklearn.metrics import classification_report
5
+ from sklearn.utils.class_weight import compute_class_weight
6
+ import numpy as np
7
+ from tqdm import tqdm
8
+ import pandas as pd
9
+ import os
10
+ import joblib
11
+
12
+ from config import DEVICE, LABEL_COLUMNS, NUM_EPOCHS, LEARNING_RATE, MODEL_SAVE_DIR
13
+
14
def get_class_weights(data_df, field, label_encoder):
    """
    Build per-class loss weights for a single label column.

    Classes that occur in the data receive the "balanced" weight
    ``n_samples / (n_present_classes * class_count)``. Classes known to the
    encoder but absent from the data keep a neutral weight of 1.0.

    Args:
        data_df: Column-addressable table; ``data_df[field].values`` supplies
            the raw labels.
        field: Name of the label column to weight.
        label_encoder: Fitted encoder exposing ``classes_`` and ``transform``.

    Returns:
        A 1-D ``torch.float`` tensor holding one weight per encoder class.
    """
    raw_labels = data_df[field].values
    try:
        encoded = label_encoder.transform(raw_labels)
    except ValueError as e:
        # A ValueError is treated as "some labels are unknown to the encoder":
        # report it, then retry with only the labels the encoder knows.
        print(f"Warning: {e}")
        print("Using only seen labels for class weights calculation")
        known = set(label_encoder.classes_)
        encoded = label_encoder.transform(
            [lbl for lbl in raw_labels if lbl in known]
        )

    encoded = encoded.astype(int)
    num_classes = len(label_encoder.classes_)

    # Occurrences of each class index among the encoded labels.
    counts = np.array(
        [np.sum(encoded == cls) for cls in range(num_classes)], dtype=int
    )

    present = counts > 0
    weights = np.ones(num_classes)
    if present.any():
        weights[present] = len(encoded) / (present.sum() * counts[present])

    return torch.tensor(weights, dtype=torch.float)
43
+
44
def initialize_criterions(data_df, label_encoders):
    """
    Create one class-weighted cross-entropy loss per label column.

    Args:
        data_df: Data passed through to ``get_class_weights`` for each column.
        label_encoders: Mapping from each name in ``LABEL_COLUMNS`` to its
            label encoder.

    Returns:
        Dict mapping each name in ``LABEL_COLUMNS`` to a
        ``torch.nn.CrossEntropyLoss`` whose weight tensor lives on ``DEVICE``.
    """
    return {
        field: torch.nn.CrossEntropyLoss(
            weight=get_class_weights(data_df, field, label_encoders[field]).to(DEVICE)
        )
        for field in LABEL_COLUMNS
    }
53
+
54
def train_model(model, loader, optimizer, field_criterions, epoch):
    """
    Run one training pass over ``loader`` and return the mean batch loss.

    Each batch is either ``(inputs, labels)`` or ``(inputs, metadata, labels)``,
    where ``inputs`` provides ``'input_ids'`` and ``'attention_mask'``. The
    model's outputs are iterated in order; output ``i`` is scored against
    ``labels[:, i]`` with the criterion registered for ``LABEL_COLUMNS[i]``,
    and the per-output losses are added together.

    Args:
        model: Model called as ``model(input_ids, attention_mask[, metadata])``.
        loader: Iterable of batches; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        field_criterions: Mapping from label column name to loss function.
        epoch: Zero-based epoch index (only used in the progress-bar title).

    Returns:
        Sum of batch losses divided by ``len(loader)``.

    Raises:
        ValueError: If a batch has neither 2 nor 3 elements.
    """
    model.train()
    running_loss = 0
    progress = tqdm(loader, desc=f"RoBERTa Epoch {epoch + 1} Training")

    for batch in progress:
        batch_len = len(batch)
        if batch_len == 2:
            inputs, labels = batch
            metadata = None
        elif batch_len == 3:
            inputs, metadata, labels = batch
        else:
            raise ValueError("Unsupported batch format.")

        # Move everything the forward pass needs onto the target device.
        model_args = [
            inputs['input_ids'].to(DEVICE),
            inputs['attention_mask'].to(DEVICE),
        ]
        if batch_len == 3:
            model_args.append(metadata.to(DEVICE))
        labels = labels.to(DEVICE)
        outputs = model(*model_args)

        # Total loss is the plain sum of the per-output losses.
        loss = sum(
            field_criterions[LABEL_COLUMNS[head_idx]](logits, labels[:, head_idx])
            for head_idx, logits in enumerate(outputs)
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        progress.set_postfix(loss=batch_loss)

    return running_loss / len(loader)
90
+
91
def evaluate_model(model, loader):
    """
    Run the model over ``loader`` without gradients and score each label column.

    Each batch is either ``(inputs, labels)`` or ``(inputs, metadata, labels)``.
    For model output ``i`` the arg-max over ``dim=1`` is the prediction and
    ``labels[:, i]`` is the ground truth.

    Args:
        model: Model called as ``model(input_ids, attention_mask[, metadata])``.
        loader: Iterable of batches.

    Returns:
        Tuple ``(reports, truths, predictions)``:
        ``reports`` maps each name in ``LABEL_COLUMNS`` to the dict produced by
        ``classification_report(..., output_dict=True, zero_division=0)``, or to
        an all-zero placeholder if that call raises ``ValueError``;
        ``truths`` and ``predictions`` are lists with one list of values per
        label column.

    Raises:
        ValueError: If a batch has neither 2 nor 3 elements.
    """
    model.eval()
    num_heads = len(LABEL_COLUMNS)
    predictions = [[] for _ in range(num_heads)]
    truths = [[] for _ in range(num_heads)]

    with torch.no_grad():
        for batch in tqdm(loader, desc="RoBERTa Evaluation"):
            batch_len = len(batch)
            if batch_len == 2:
                inputs, labels = batch
                metadata = None
            elif batch_len == 3:
                inputs, metadata, labels = batch
            else:
                raise ValueError("Unsupported batch format.")

            model_args = [
                inputs['input_ids'].to(DEVICE),
                inputs['attention_mask'].to(DEVICE),
            ]
            if batch_len == 3:
                model_args.append(metadata.to(DEVICE))
            labels = labels.to(DEVICE)
            outputs = model(*model_args)

            for head_idx, logits in enumerate(outputs):
                predictions[head_idx].extend(
                    torch.argmax(logits, dim=1).cpu().numpy()
                )
                truths[head_idx].extend(labels[:, head_idx].cpu().numpy())

    reports = {}
    for col, y_true, y_pred in zip(LABEL_COLUMNS, truths, predictions):
        try:
            reports[col] = classification_report(
                y_true, y_pred, output_dict=True, zero_division=0
            )
        except ValueError:
            # Fall back to a zeroed report so downstream summaries still work.
            print(f"Warning: Classification report failed for {col}")
            reports[col] = {
                'accuracy': 0,
                'weighted avg': {
                    'precision': 0,
                    'recall': 0,
                    'f1-score': 0,
                    'support': 0,
                },
            }
    return reports, truths, predictions
130
+
131
def summarize_metrics(metrics):
    """
    Flatten per-field classification reports into a single DataFrame.

    Args:
        metrics: Mapping from field name to a report dict. The
            ``'weighted avg'`` entry (with ``'precision'``, ``'recall'``,
            ``'f1-score'`` and ``'support'``) and the ``'accuracy'`` entry are
            read; whichever of the two is missing is reported as 0.

    Returns:
        ``pandas.DataFrame`` with one row per field and the columns
        ``Field``, ``Precision``, ``Recall``, ``F1-Score``, ``Accuracy`` and
        ``Support``.
    """
    rows = []
    for field, report in metrics.items():
        if 'weighted avg' in report:
            averaged = report['weighted avg']
            precision = averaged['precision']
            recall = averaged['recall']
            f1 = averaged['f1-score']
            support = averaged['support']
        else:
            precision = recall = f1 = support = 0

        rows.append({
            "Field": field,
            "Precision": precision,
            "Recall": recall,
            "F1-Score": f1,
            "Accuracy": report.get('accuracy', 0),
            "Support": support,
        })
    return pd.DataFrame(rows)
151
+
152
def save_model(model, model_name, save_format='pth'):
    """
    Persist a model under ``MODEL_SAVE_DIR``.

    Args:
        model: Model to save.
        model_name: Base name for the output file.
        save_format: ``'pth'`` writes ``model.state_dict()`` with
            ``torch.save`` to ``<model_name>_model.pth``; ``'pickle'`` dumps
            the whole model object with ``joblib`` to ``<model_name>.pkl``.

    Raises:
        ValueError: If ``save_format`` is neither ``'pth'`` nor ``'pickle'``.
    """
    # Validate first so an unsupported format never touches the filesystem.
    if save_format == 'pth':
        file_name = f"{model_name}_model.pth"
    elif save_format == 'pickle':
        file_name = f"{model_name}.pkl"
    else:
        raise ValueError(f"Unsupported save format: {save_format}")

    # The target directory may not exist yet (e.g. first run in a fresh
    # checkout); create it instead of failing inside the serializer.
    if MODEL_SAVE_DIR:
        os.makedirs(MODEL_SAVE_DIR, exist_ok=True)

    model_path = os.path.join(MODEL_SAVE_DIR, file_name)
    if save_format == 'pth':
        torch.save(model.state_dict(), model_path)
    else:
        joblib.dump(model, model_path)
    print(f"Model saved to {model_path}")
165
+
166
def load_model_state(model, model_name, model_class, num_labels, metadata_dim=0):
    """
    Load saved weights into ``model``, or build a fresh model if none exist.

    Looks for ``<model_name>_model.pth`` under ``MODEL_SAVE_DIR``.

    Args:
        model: Model instance that receives the saved state dict.
        model_name: Base name used when the weights were saved.
        model_class: Callable used to build a replacement model when the
            weights file is missing.
        num_labels: First positional argument for ``model_class``.
        metadata_dim: Passed to ``model_class`` as ``metadata_dim=`` only when
            greater than 0.

    Returns:
        ``model`` with the weights loaded, moved to ``DEVICE`` and switched to
        eval mode; or, when the file is missing, a newly constructed
        ``model_class`` instance moved to ``DEVICE``.
    """
    model_path = os.path.join(MODEL_SAVE_DIR, f"{model_name}_model.pth")

    if os.path.exists(model_path):
        # NOTE(review): torch.load may unpickle arbitrary objects depending on
        # the installed torch version; only load files from trusted sources.
        state = torch.load(model_path, map_location=DEVICE)
        model.load_state_dict(state)
        model.to(DEVICE)
        model.eval()
        print(f"RoBERTa model loaded from {model_path}")
        return model

    print(f"Warning: {model_path} not found. Returning a new model instance.")
    ctor_kwargs = {"metadata_dim": metadata_dim} if metadata_dim > 0 else {}
    return model_class(num_labels, **ctor_kwargs).to(DEVICE)
183
+
184
def predict_probabilities(model, loader):
    """
    Collect softmax probabilities for every model output over ``loader``.

    Each batch is either a 2-element or a 3-element sequence; the last element
    is ignored, the first provides ``'input_ids'`` and ``'attention_mask'``,
    and the middle element of a 3-element batch is passed to the model as
    metadata.

    Args:
        model: Model called as ``model(input_ids, attention_mask[, metadata])``.
        loader: Iterable of batches.

    Returns:
        List with one entry per name in ``LABEL_COLUMNS``; entry ``i`` is a
        list of per-sample probability arrays (softmax over ``dim=1`` of model
        output ``i``).

    Raises:
        ValueError: If a batch has neither 2 nor 3 elements.
    """
    model.eval()
    all_probabilities = [[] for _ in range(len(LABEL_COLUMNS))]

    with torch.no_grad():
        for batch in tqdm(loader, desc="RoBERTa Predicting Probabilities"):
            batch_len = len(batch)
            if batch_len == 2:
                inputs, _ = batch
                metadata = None
            elif batch_len == 3:
                inputs, metadata, _ = batch
            else:
                raise ValueError("Unsupported batch format.")

            model_args = [
                inputs['input_ids'].to(DEVICE),
                inputs['attention_mask'].to(DEVICE),
            ]
            if batch_len == 3:
                model_args.append(metadata.to(DEVICE))
            outputs = model(*model_args)

            for head_idx, logits in enumerate(outputs):
                all_probabilities[head_idx].extend(
                    torch.softmax(logits, dim=1).cpu().numpy()
                )
    return all_probabilities