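"""Streamlit page for sentence-level text analysis.

Splits the user's input into sentences and labels each one with the model,
tokenizer, and label encoder that the home page stores in st.session_state.
"""
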
import streamlit as st
import pandas as pd
import re
from utils.prediction import predict_sentence

def split_sentences_regex(text):
    # Clean the text
    text = re.sub(r'[\n\r]', ' ', text)  # Replace newlines with spaces
    text = re.sub(r'["\']', '', text)     # Remove quotes
    text = re.sub(r'\s+', ' ', text)      # Normalize whitespace
    
    # Split on periods only; the fuller pattern kept below for reference
    # would also treat '!' and '?' as sentence endings.
    # pattern = r'[.!?]+[\s]+|[.!?]+$'
    pattern = r'[.]'

    # Strip whitespace and drop empty fragments (no capitalization check is applied)
    return [s.strip() for s in re.split(pattern, text) if s.strip()]
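
# Example (assumed behavior):
#   split_sentences_regex("Dr. Smith arrived. He sat down.")
#   -> ['Dr', 'Smith arrived', 'He sat down']
# The plain period split does not special-case abbreviations;
# split_sentences_with_abbrev below handles the common ones.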

def split_sentences_with_abbrev(text):
    # Common abbreviations to ignore
    abbreviations = {'mr.', 'mrs.', 'dr.', 'sr.', 'jr.', 'vs.', 'e.g.', 'i.e.', 'etc.'}
    
    # Split initially by potential sentence endings
    parts = text.split('. ')
    sentences = []
    current = parts[0]
    
    for part in parts[1:]:
        # Check whether the accumulated part ends with an abbreviation.
        # split('. ') drops the trailing period, so compare against the
        # abbreviation without its final dot.
        ends_with_abbrev = any(current.lower().endswith(abbr[:-1]) for abbr in abbreviations)
        
        if ends_with_abbrev:
            current = current + '. ' + part
        else:
            sentences.append(current)
            current = part
            
    sentences.append(current)
    return sentences
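
# Example (assumed behavior, given the abbreviation check above):
#   split_sentences_with_abbrev("Mr. Smith left. He waved.")
#   -> ['Mr. Smith left', 'He waved.']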

def show_analysis():
    st.title("Text Analysis")
    st.write("Use this section to analyze the logical structure of your text.")
    
    try:
        if 'model' not in st.session_state:
            st.error("Please initialize the model from the home page first.")
            return
            
        model = st.session_state.model
        label_encoder = st.session_state.label_encoder
        tokenizer = st.session_state.tokenizer
        
        # Text input section
        st.header("Analyze Your Text")
        user_text = st.text_area("Enter your text here (multiple sentences allowed):", height=150)
        
        if st.button("Analyze"):
            if user_text:
                # Split and analyze sentences
                sentences = split_sentences_regex(user_text)
                
                st.subheader("Analysis Results:")
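                # predict_sentence is assumed to return (label, confidence)
                # with confidence in [0, 1], as st.progress expects below.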
                for i, sentence in enumerate(sentences, 1):
                    with st.container():
                        label, confidence = predict_sentence(
                            model, sentence, tokenizer, label_encoder
                        )
                        if label not in ("Unknown", "Error"):
                            st.write("---")
                            st.write(f"**Sentence:** {sentence}")
                            st.write(f"**Predicted:** {label}")
                            st.progress(confidence)
            else:
                st.warning("Please enter some text to analyze.")
        
        # Example Analysis Section
        st.header("Example Analysis")
        show_examples = st.checkbox("Show example analysis", key='show_examples')
        
        if show_examples:
            try:
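                # Expects a CSV with a 'Sentence' column at data/raw/history_01.csv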
                df = pd.read_csv('data/raw/history_01.csv')
                for sentence in df['Sentence'].head(5):  # Limit to 5 examples
                    with st.container():
                        label, confidence = predict_sentence(
                            model, sentence, tokenizer, label_encoder
                        )
                        if label not in ("Unknown", "Error"):
                            st.write("---")
                            st.write(f"**Sentence:** {sentence}")
                            st.write(f"**Predicted:** {label}")
                            st.progress(confidence)
            except FileNotFoundError:
                st.warning("Example file not found. Please check the data path.")
                
    except Exception as e:
        st.error(f"Error: {str(e)}")
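
# Run the page standalone for a quick preview, e.g. `streamlit run <this file>`
# (the exact path depends on the project layout).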

if __name__ == "__main__":
    show_analysis()