import torch
from torch import nn
from transformers import AutoModel


class BiLSTMAttentionBERT(nn.Module):
    """BioBERT encoder followed by a bidirectional LSTM, single-head
    self-attention, mean pooling, and an MLP classification head."""

    def __init__(self,
                 hidden_dim=256,
                 num_classes=22,
                 num_layers=2,
                 dropout=0.1):
        super().__init__()

        # Pretrained BioBERT encoder; its hidden size drives the LSTM input size.
        self.bert_model = AutoModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
        bert_dim = self.bert_model.config.hidden_size

        self.dropout_bert = nn.Dropout(dropout)

        # Bidirectional LSTM over the BERT token embeddings.
        # Inter-layer dropout only applies when the LSTM is stacked.
        self.lstm = nn.LSTM(
            input_size=bert_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        # Single-head self-attention over the BiLSTM outputs (2 * hidden_dim).
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_dim * 2,
            num_heads=1,
            dropout=dropout,
            batch_first=True
        )

        self.dropout1 = nn.Dropout(dropout)  # defined but not applied in forward()
        self.dropout2 = nn.Dropout(dropout + 0.1)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)

        # MLP head mapping the pooled representation to class logits.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        # Contextual token embeddings: (batch, seq_len, bert_dim).
        bert_output = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        sequence_output = self.dropout_bert(bert_output.last_hidden_state)

        # BiLSTM + layer norm: (batch, seq_len, 2 * hidden_dim).
        lstm_out, _ = self.lstm(sequence_output)
        lstm_out = self.layer_norm(lstm_out)

        # Self-attention over the sequence; attention weights are not returned.
        attn_out, _ = self.attention(
            query=lstm_out,
            key=lstm_out,
            value=lstm_out,
            need_weights=False
        )

        # Mean pooling over the sequence dimension (padding positions included),
        # then batch norm and dropout before classification.
        pooled = torch.mean(attn_out, dim=1)
        pooled = self.batch_norm(pooled)
        pooled = self.dropout2(pooled)

        return self.classifier(pooled)
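

# --- Usage sketch (not part of the original module) ---
# Minimal example of driving the model: tokenize a small batch with the
# tokenizer matching the BioBERT checkpoint above, then run a forward pass.
# The example texts, batch size, and max_length are illustrative assumptions.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
    model = BiLSTMAttentionBERT(hidden_dim=256, num_classes=22)
    model.eval()  # BatchNorm layers need batches > 1 in training mode

    texts = [
        "Patient presents with acute chest pain.",
        "No evidence of pulmonary embolism.",
    ]
    batch = tokenizer(texts, padding=True, truncation=True,
                      max_length=128, return_tensors='pt')

    with torch.no_grad():
        logits = model(batch['input_ids'], batch['attention_mask'])
    print(logits.shape)  # expected: torch.Size([2, 22])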