Implement sentence analysis functionality in Analysis page; add BiLSTM model and prediction utilities
Files changed:
- pages/Analysis.py +99 -11
- requirements.txt +4 -2
- utils/BiLSTM.py +79 -0
- utils/__init__.py +0 -0
- utils/prediction.py +122 -0
pages/Analysis.py CHANGED
@@ -1,15 +1,103 @@
 import streamlit as st
-
-
+import pandas as pd
+import re
+from utils.prediction import predict_sentence
 
-
+def split_sentences_regex(text):
+    # Clean the text
+    text = re.sub(r'[\n\r]', ' ', text)  # Remove newlines
+    text = re.sub(r'["\']', '', text)    # Remove quotes
+    text = re.sub(r'\s+', ' ', text)     # Normalize whitespace
+
+    # More aggressive pattern that looks for sentence endings
+    #pattern = r'[.!?]+[\s]+|[.!?]+$'
+    pattern = r'[.]'
+    # Split and clean resulting sentences
+    sentences = [s.strip() for s in re.split(pattern, text) if s]
+
+    # Filter out empty strings but keep sentences that don't start with capitals
+    return [s for s in sentences if len(s) > 0]
 
-
+def split_sentences_with_abbrev(text):
+    # Common abbreviations to ignore
+    abbreviations = {'mr.', 'mrs.', 'dr.', 'sr.', 'jr.', 'vs.', 'e.g.', 'i.e.', 'etc.'}
+
+    # Split initially by potential sentence endings
+    parts = text.split('. ')
+    sentences = []
+    current = parts[0]
+
+    for part in parts[1:]:
+        # Check if the previous part ends with an abbreviation
+        ends_with_abbrev = any(current.lower().endswith(abbr) for abbr in abbreviations)
+
+        if ends_with_abbrev:
+            current = current + '. ' + part
+        else:
+            sentences.append(current)
+            current = part
+
+    sentences.append(current)
+    return sentences
 
-
-
-
-
-
-
-
+def show_analysis():
+    st.title("Text Analysis")
+    st.write("Use this section to analyze the logical structure of your text.")
+
+    try:
+        if 'model' not in st.session_state:
+            st.error("Please initialize the model from the home page first.")
+            return
+
+        model = st.session_state.model
+        label_encoder = st.session_state.label_encoder
+        tokenizer = st.session_state.tokenizer
+
+        # Text input section
+        st.header("Analyze Your Text")
+        user_text = st.text_area("Enter your text here (multiple sentences allowed):", height=150)
+
+        if st.button("Analyze"):
+            if user_text:
+                # Split and analyze sentences
+                sentences = split_sentences_regex(user_text)
+
+                st.subheader("Analysis Results:")
+                for i, sentence in enumerate(sentences, 1):
+                    with st.container():
+                        label, confidence = predict_sentence(
+                            model, sentence, tokenizer, label_encoder
+                        )
+                        if label not in ("Unknown", "Error"):
+                            st.write("---")
+                            st.write(f"**Sentence:** {sentence}")
+                            st.write(f"**Predicted:** {label}")
+                            st.progress(confidence)
+            else:
+                st.warning("Please enter some text to analyze.")
+
+        # Example Analysis Section
+        st.header("Example Analysis")
+        show_examples = st.checkbox("Show example analysis", key='show_examples')
+
+        if show_examples:
+            try:
+                df = pd.read_csv('data/raw/history_01.csv')
+                for sentence in df['Sentence'].head(5):  # Limit to 5 examples
+                    with st.container():
+                        label, confidence = predict_sentence(
+                            model, sentence, tokenizer, label_encoder
+                        )
+                        if label not in ("Unknown", "Error"):
+                            st.write("---")
+                            st.write(f"**Sentence:** {sentence}")
+                            st.write(f"**Predicted:** {label}")
+                            st.progress(confidence)
+            except FileNotFoundError:
+                st.warning("Example file not found. Please check the data path.")
+
+    except Exception as e:
+        st.error(f"Error: {str(e)}")
+
+if __name__ == "__main__":
+    show_analysis()
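For context: show_analysis() only works once st.session_state.model, st.session_state.label_encoder, and st.session_state.tokenizer have been populated by the home page, which is not part of this commit. A minimal sketch of what that initialization could look like, assuming a hypothetical entry script named Home.py that reuses load_model_for_prediction from utils/prediction.py:

# Home.py -- hypothetical entry script; the real home page is not in this commit
import streamlit as st
from utils.prediction import load_model_for_prediction

st.title("Home")

# Load the model once and cache the pieces in session state so that
# pages/Analysis.py can pick them up without reloading.
if 'model' not in st.session_state:
    model, label_encoder, tokenizer = load_model_for_prediction()
    if model is not None:
        st.session_state.model = model
        st.session_state.label_encoder = label_encoder
        st.session_state.tokenizer = tokenizer
        st.success("Model initialized.")
    else:
        st.error("Model components failed to load; check the logs.")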
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 streamlit
-
-
+pandas
+numpy
+transformers
+torch
utils/BiLSTM.py ADDED
@@ -0,0 +1,79 @@
+import torch
+from torch import nn
+from transformers import AutoModel
+
+
+class BiLSTMAttentionBERT(nn.Module):
+    def __init__(self,
+                 hidden_dim=256,
+                 num_classes=22,  # Based on the label distribution
+                 num_layers=2,    # Multiple LSTM layers
+                 dropout=0.1):
+        super().__init__()
+
+        # Load BioBERT instead of BERT
+        self.bert_model = AutoModel.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
+        bert_dim = self.bert_model.config.hidden_size  # Still 768 for BioBERT base
+        # Dropout for BERT outputs
+        self.dropout_bert = nn.Dropout(dropout)
+        # Multi-layer BiLSTM
+        self.lstm = nn.LSTM(
+            input_size=bert_dim,
+            hidden_size=hidden_dim,
+            num_layers=num_layers,
+            bidirectional=True,
+            batch_first=True,
+            dropout=dropout if num_layers > 1 else 0
+        )
+
+        # Multi-head attention
+        self.attention = nn.MultiheadAttention(
+            embed_dim=hidden_dim * 2,  # *2 for bidirectional
+            num_heads=1,
+            dropout=dropout,
+            batch_first=True
+        )
+
+        # Regularization layers
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout + 0.1)
+        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
+        self.batch_norm = nn.BatchNorm1d(hidden_dim * 2)
+
+        # Classification head
+        self.classifier = nn.Sequential(
+            nn.Linear(hidden_dim * 2, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.BatchNorm1d(hidden_dim),
+            nn.Linear(hidden_dim, num_classes)
+        )
+
+    def forward(self, input_ids, attention_mask):
+        # BERT encoding
+        bert_output = self.bert_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            return_dict=True
+        )
+        sequence_output = self.dropout_bert(bert_output.last_hidden_state)
+
+        # BiLSTM processing
+        lstm_out, _ = self.lstm(sequence_output)
+        lstm_out = self.layer_norm(lstm_out)
+
+        # Self-attention
+        attn_out, _ = self.attention(
+            query=lstm_out,
+            key=lstm_out,
+            value=lstm_out,
+            need_weights=False
+        )
+
+        # Pooling and normalization
+        pooled = torch.mean(attn_out, dim=1)
+        pooled = self.batch_norm(pooled)
+        pooled = self.dropout2(pooled)
+
+        # Classification
+        return self.classifier(pooled)
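As a sanity check on the architecture above, a forward pass over random token ids should yield one logit per class. A minimal sketch, assuming the default constructor arguments (instantiating the class downloads BioBERT on first run):

import torch
from utils.BiLSTM import BiLSTMAttentionBERT

model = BiLSTMAttentionBERT()  # hidden_dim=256, num_classes=22
model.eval()  # BatchNorm1d layers need eval mode for single/small batches

batch_size, seq_len = 2, 16
input_ids = torch.randint(0, model.bert_model.config.vocab_size, (batch_size, seq_len))
attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long)

with torch.no_grad():
    logits = model(input_ids, attention_mask)

print(logits.shape)  # torch.Size([2, 22]) -- one logit per relation class

Mean pooling over the attended sequence keeps the classifier input at a fixed width of 2 * hidden_dim regardless of sentence length, which is what lets the same classification head serve variable-length inputs.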
utils/__init__.py ADDED
File without changes
utils/prediction.py ADDED
@@ -0,0 +1,122 @@
+import torch
+from transformers import AutoTokenizer
+from sklearn.preprocessing import LabelEncoder
+from utils.BiLSTM import BiLSTMAttentionBERT  # package-qualified so it resolves when the app runs from the repo root
+import numpy as np
+
+
+
+def load_model_for_prediction():
+    # Force CPU
+    device = torch.device('cpu')
+    torch.backends.mps.enabled = False
+
+    try:
+        # Load model from Hugging Face Hub
+        model = BiLSTMAttentionBERT.from_pretrained(
+            "joko333/BiLSTM_v01",
+            hidden_dim=128,
+            num_classes=22,
+            num_layers=2,
+            dropout=0.5
+        ).to(device)
+
+        model.eval()
+
+        # Initialize label encoder with predefined classes
+        label_encoder = LabelEncoder()
+        label_encoder.classes_ = np.array(['Addition', 'Causal', 'Cause and Effect',
+                                           'Clarification', 'Comparison', 'Concession',
+                                           'Conditional', 'Contrast', 'Contrastive Emphasis',
+                                           'Definition', 'Elaboration', 'Emphasis',
+                                           'Enumeration', 'Explanation', 'Generalization',
+                                           'Illustration', 'Inference', 'Problem Solution',
+                                           'Purpose', 'Sequential', 'Summary',
+                                           'Temporal Sequence'])
+
+        # Initialize tokenizer
+        tokenizer = AutoTokenizer.from_pretrained(
+            'dmis-lab/biobert-base-cased-v1.2'
+        )
+
+        return model, label_encoder, tokenizer
+
+    except Exception as e:
+        print(f"Error loading model components: {str(e)}")
+        return None, None, None
+
+def predict_sentence(model, sentence, tokenizer, label_encoder, device=None):
+    """
+    Make prediction for a single sentence with label validation.
+    """
+    device = torch.device('cpu')
+    model = model.to(device)
+    model.eval()
+
+    # Tokenize
+    encoding = tokenizer(
+        sentence,
+        add_special_tokens=True,
+        max_length=512,
+        padding='max_length',
+        truncation=True,
+        return_tensors='pt'
+    ).to(device)
+
+    try:
+        with torch.no_grad():
+            # Get model outputs
+            outputs = model(encoding['input_ids'], encoding['attention_mask'])
+            probabilities = torch.softmax(outputs, dim=1)
+
+            # Get prediction and probability
+            prob, pred_idx = torch.max(probabilities, dim=1)
+
+            # Validate prediction index
+            if pred_idx.item() >= len(label_encoder.classes_):
+                print(f"Warning: Model predicted invalid label index {pred_idx.item()}")
+                return "Unknown", 0.0
+
+            # Convert to label
+            try:
+                predicted_class = label_encoder.classes_[pred_idx.item()]
+                return predicted_class, prob.item()
+            except IndexError:
+                print(f"Warning: Invalid label index {pred_idx.item()}")
+                return "Unknown", 0.0
+
+    except Exception as e:
+        print(f"Prediction error: {str(e)}")
+        return "Error", 0.0
+
+def print_labels(label_encoder, show_counts=False):
+    """Print all labels and their corresponding indices"""
+    print("\nAvailable labels:")
+    print("-" * 40)
+    for idx, label in enumerate(label_encoder.classes_):
+        print(f"Index {idx}: {label}")
+    print("-" * 40)
+    print(f"Total number of classes: {len(label_encoder.classes_)}\n")
+
+def predict_sentence2(sentence, model, tokenizer, label_encoder):
+    # Tokenize the input
+    inputs = tokenizer(sentence,
+                       padding=True,
+                       truncation=True,
+                       return_tensors='pt',
+                       max_length=512)
+
+    # Move inputs to the same device as model
+    device = next(model.parameters()).device
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    # Make prediction
+    with torch.no_grad():
+        outputs = model(**inputs)
+        predictions = torch.argmax(outputs.logits, dim=1)
+
+    # Convert prediction to label
+    predicted_label = label_encoder.inverse_transform(predictions.cpu().numpy())[0]
+
+    return predicted_label
+
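End to end, the intended call pattern looks like the sketch below. One caveat: from_pretrained is not defined on a plain nn.Module, so BiLSTMAttentionBERT.from_pretrained presumably relies on a mixin such as huggingface_hub's PyTorchModelHubMixin being added to the class; if the call fails, load_model_for_prediction catches the exception and returns None for all three components. The sample sentence and printed output are illustrative only:

from utils.prediction import load_model_for_prediction, predict_sentence, print_labels

# Load the model, label encoder, and tokenizer (all None on failure)
model, label_encoder, tokenizer = load_model_for_prediction()

if model is not None:
    print_labels(label_encoder)  # prints the 22 class names with their indices
    label, confidence = predict_sentence(
        model,
        "Aspirin reduces fever because it inhibits prostaglandin synthesis.",
        tokenizer,
        label_encoder,
    )
    print(f"{label} ({confidence:.1%})")  # e.g. Causal (87.3%) -- illustrative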