# joko333's picture
# Implement sentence analysis functionality in Analysis page; add BiLSTM model and prediction utilities
# ca5c473
# raw
# history blame
# 3.99 kB
import streamlit as st
import pandas as pd
import re
from utils.prediction import predict_sentence
def split_sentences_regex(text):
    """Split *text* into sentence fragments at every period.

    The text is flattened first: newlines and carriage returns become
    spaces, single and double quote characters are removed, and runs of
    whitespace collapse to a single space. The cleaned text is then cut at
    each ``.``; the periods themselves are discarded. ``!`` and ``?`` are
    not treated as sentence endings.

    Returns the non-empty, whitespace-trimmed fragments in their original
    order.
    """
    flattened = re.sub(r'[\n\r]', ' ', text)
    unquoted = re.sub(r'["\']', '', flattened)
    normalized = re.sub(r'\s+', ' ', unquoted)

    fragments = []
    for piece in re.split(r'[.]', normalized):
        trimmed = piece.strip()
        # Consecutive or trailing periods yield empty pieces; drop them.
        if trimmed:
            fragments.append(trimmed)
    return fragments
def split_sentences_with_abbrev(text):
    """Split *text* into sentences on ``'. '`` without breaking abbreviations.

    The text is cut at every period followed by a space. A cut is undone
    (the two pieces are re-joined with ``'. '``) when the word immediately
    before the period is one of a small set of common abbreviations, so
    ``"Dr. Smith arrived. He sat."`` yields ``["Dr. Smith arrived", "He sat."]``.

    The separator period is dropped from every sentence except the last,
    which keeps whatever trailing punctuation the input had.

    Known limitation: an abbreviation that genuinely ends a sentence
    (``"... apples, etc. Then ..."``) is merged with the following sentence.

    Args:
        text: The text to split.

    Returns:
        A list of sentence strings; an empty list for empty input.
    """
    # Common abbreviations to ignore
    abbreviations = {'mr.', 'mrs.', 'dr.', 'sr.', 'jr.', 'vs.', 'e.g.', 'i.e.', 'etc.'}

    if not text:
        return []

    # Split initially by potential sentence endings
    parts = text.split('. ')
    sentences = []
    current = parts[0]
    for part in parts[1:]:
        # split('. ') has already removed the period, so it must be put back
        # before comparing. Matching the whole last word (not just a suffix)
        # keeps words such as "revs" from being mistaken for "vs.".
        words = current.split()
        last_word = words[-1].lstrip('("\'[').lower() if words else ''
        if last_word + '.' in abbreviations:
            current = current + '. ' + part
        else:
            sentences.append(current)
            current = part
    sentences.append(current)
    return sentences
def _render_prediction(model, sentence, tokenizer, label_encoder):
    """Classify one sentence and render its label and confidence bar."""
    with st.container():
        label, confidence = predict_sentence(
            model, sentence, tokenizer, label_encoder
        )
        # "Unknown" and "Error" labels are treated as non-answers and skipped.
        if label not in ("Unknown", "Error"):
            st.write("---")
            st.write(f"**Sentence:** {sentence}")
            st.write(f"**Predicted:** {label}")
            # NOTE(review): assumes predict_sentence returns a confidence that
            # st.progress accepts as-is — confirm against utils.prediction.
            st.progress(confidence)


def show_analysis():
    """Render the "Text Analysis" page.

    Reads the model, label encoder and tokenizer from ``st.session_state``;
    if any of them is missing an error is shown and nothing else is drawn.
    Otherwise the page offers:

    * a text area whose content is split with ``split_sentences_regex`` and
      classified sentence by sentence when "Analyze" is pressed;
    * an optional example section that classifies the first five entries of
      the ``Sentence`` column in ``data/raw/history_01.csv``.

    Any unexpected exception is reported on the page via ``st.error`` rather
    than propagated.
    """
    st.title("Text Analysis")
    st.write("Use this section to analyze the logical structure of your text.")

    try:
        # All three objects are read below, so all three must be present;
        # checking only 'model' would let a partial initialization fall
        # through to a cryptic generic error.
        required_keys = ('model', 'label_encoder', 'tokenizer')
        if any(key not in st.session_state for key in required_keys):
            st.error("Please initialize the model from the home page first.")
            return

        model = st.session_state.model
        label_encoder = st.session_state.label_encoder
        tokenizer = st.session_state.tokenizer

        # Text input section
        st.header("Analyze Your Text")
        user_text = st.text_area("Enter your text here (multiple sentences allowed):", height=150)

        if st.button("Analyze"):
            # Whitespace-only input contains no sentences; treat it as empty.
            if user_text and user_text.strip():
                # Split and analyze sentences
                sentences = split_sentences_regex(user_text)
                st.subheader("Analysis Results:")
                for sentence in sentences:
                    _render_prediction(model, sentence, tokenizer, label_encoder)
            else:
                st.warning("Please enter some text to analyze.")

        # Example Analysis Section
        st.header("Example Analysis")
        show_examples = st.checkbox("Show example analysis", key='show_examples')

        if show_examples:
            try:
                df = pd.read_csv('data/raw/history_01.csv')
                for sentence in df['Sentence'].head(5):  # Limit to 5 examples
                    _render_prediction(model, sentence, tokenizer, label_encoder)
            except FileNotFoundError:
                st.warning("Example file not found. Please check the data path.")
    except Exception as e:
        # Top-level UI boundary: surface the failure on the page instead of
        # crashing the Streamlit script run.
        st.error(f"Error: {str(e)}")
# Render the analysis page when this file is executed directly as a script
# (not when it is imported by another module).
if __name__ == "__main__":
    show_analysis()