import torch
from torch import nn
from transformers import AutoModel


class BiLSTMAttentionBERT(nn.Module):
    def __init__(self,
                 hidden_dim=256,
                 num_classes=22,   # Based on the label distribution
                 num_layers=2,     # Multiple LSTM layers
                 dropout=0.1):
        super().__init__()

        # Load BioBERT instead of BERT
        self.bert_model = AutoModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
        bert_dim = self.bert_model.config.hidden_size  # Still 768 for BioBERT base

        # Dropout for BERT outputs
        self.dropout_bert = nn.Dropout(dropout)

        # Multi-layer BiLSTM
        self.lstm = nn.LSTM(
            input_size=bert_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )

        # Self-attention over BiLSTM outputs (single head)
        self.attention = nn.MultiheadAttention(
            embed_dim=hidden_dim * 2,  # *2 for bidirectional
            num_heads=1,
            dropout=dropout,
            batch_first=True
        )

        # Regularization layers (dropout1 is defined but not used in forward)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout + 0.1)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)

        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.BatchNorm1d(hidden_dim),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        # BERT encoding
        bert_output = self.bert_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True
        )
        sequence_output = self.dropout_bert(bert_output.last_hidden_state)

        # BiLSTM processing
        lstm_out, _ = self.lstm(sequence_output)
        lstm_out = self.layer_norm(lstm_out)

        # Self-attention (note: padding positions are not masked out here)
        attn_out, _ = self.attention(
            query=lstm_out,
            key=lstm_out,
            value=lstm_out,
            need_weights=False
        )

        # Mean pooling over all time steps, then normalization and dropout
        pooled = torch.mean(attn_out, dim=1)
        pooled = self.batch_norm(pooled)
        pooled = self.dropout2(pooled)

        # Classification
        return self.classifier(pooled)
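

# --- Usage sketch (illustrative) ---
# A minimal example of how this model might be driven end to end: tokenize with
# the tokenizer matching the BioBERT checkpoint, then run a forward pass. The
# example text, batch construction, and max_length are assumptions for
# demonstration only, not part of the model definition above.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
    model = BiLSTMAttentionBERT(hidden_dim=256, num_classes=22)
    model.eval()  # eval mode so BatchNorm uses running statistics

    texts = ["Patient presents with chest pain and shortness of breath."]
    batch = tokenizer(texts, padding=True, truncation=True,
                      max_length=128, return_tensors='pt')

    with torch.no_grad():
        logits = model(input_ids=batch['input_ids'],
                       attention_mask=batch['attention_mask'])
    print(logits.shape)  # expected: torch.Size([1, 22])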