File size: 3,279 Bytes
4b141dc
ca5c473
 
 
4b141dc
ca5c473
 
 
 
 
 
 
 
 
 
 
 
 
 
4b141dc
ca5c473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b141dc
ca5c473
 
 
1e4dfde
 
 
 
 
ca5c473
1e4dfde
 
 
ca5c473
1e4dfde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca5c473
1e4dfde
ca5c473
1e4dfde
 
 
 
 
 
 
 
 
 
 
 
ca5c473
1e4dfde
 
ca5c473
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import streamlit as st
import pandas as pd
import re
from utils.prediction import predict_sentence

def split_sentences_regex(text):
    """Split *text* into sentences on terminal punctuation (., !, ?).

    Newlines are folded into spaces, single and double quotes are stripped,
    and runs of whitespace are collapsed before splitting.

    Returns a list of non-empty sentence strings with their trailing
    punctuation removed.
    """
    # Normalize: drop newlines, strip quote characters, collapse whitespace.
    text = re.sub(r'[\n\r]', ' ', text)
    text = re.sub(r'["\']', '', text)
    text = re.sub(r'\s+', ' ', text)

    # Split on runs of sentence-ending punctuation. The '+' keeps "?!" or
    # "..." from producing empty fragments between adjacent terminators.
    # (Previously only '.' was matched, so '!' and '?' never ended a
    # sentence — restored per the function's stated intent.)
    parts = (s.strip() for s in re.split(r'[.!?]+', text))

    # Single filtering pass: strip first, then drop empties.
    return [s for s in parts if s]

def split_sentences_with_abbrev(text):
    """Split *text* on '. ' boundaries while keeping common abbreviations intact.

    A boundary is NOT treated as a sentence end when the text before it ends
    with a known abbreviation (e.g. "Mr.", "e.g."); in that case the pieces
    are rejoined with '. '.

    Returns a list of sentence strings; all but possibly the last lose their
    trailing period (it is consumed by the '. ' split).
    """
    # Abbreviations that end with a period but do not end a sentence.
    abbreviations = {'mr.', 'mrs.', 'dr.', 'sr.', 'jr.', 'vs.', 'e.g.', 'i.e.', 'etc.'}
    # str.endswith accepts a tuple of suffixes — build it once, outside the loop.
    abbrev_suffixes = tuple(abbreviations)

    parts = text.split('. ')
    sentences = []
    current = parts[0]

    for part in parts[1:]:
        # BUGFIX: split('. ') consumed the period before this boundary, so
        # the original endswith(abbr) comparison (against dotted forms like
        # 'mr.') could never match. Re-append the '.' before comparing.
        if (current.lower() + '.').endswith(abbrev_suffixes):
            # False boundary inside an abbreviation — stitch the parts back.
            current = current + '. ' + part
        else:
            sentences.append(current)
            current = part

    sentences.append(current)
    return sentences

def show_analysis():
    """Render the Text Analysis page.

    Requires 'model', 'label_encoder' and 'tokenizer' to already be present
    (and non-None) in st.session_state — they are loaded on the Home page.
    Splits the user's input into sentences, classifies each one via
    predict_sentence, and shows the results as a dataframe.
    """
    st.title("Text Analysis")

    required_keys = ('model', 'label_encoder', 'tokenizer')

    # Guard: the Home page must have populated the session state first.
    if [k for k in required_keys if k not in st.session_state]:
        st.warning("Model components not found in session state")
        st.info("Please go to the Home page first to load the model")
        return

    # Guard: keys may exist but hold None if loading failed upstream.
    if any(st.session_state[k] is None for k in required_keys):
        st.error("One or more model components failed to load")
        return

    model = st.session_state.model
    label_encoder = st.session_state.label_encoder
    tokenizer = st.session_state.tokenizer

    # Input widgets.
    st.header("Analyze Your Text")
    user_text = st.text_area("Enter your text here (multiple sentences allowed):", height=150)

    if not st.button("Analyze"):
        return
    if not user_text:
        st.warning("Please enter some text to analyze")
        return

    with st.spinner("Analyzing text..."):
        try:
            # Classify sentence by sentence and collect one row per sentence.
            rows = []
            for sentence in split_sentences_regex(user_text):
                label, confidence = predict_sentence(model, sentence, tokenizer, label_encoder)
                rows.append({
                    "Sentence": sentence,
                    "Label": label,
                    "Confidence": f"{confidence:.2%}",
                })

            st.dataframe(pd.DataFrame(rows))

        except Exception as e:
            st.error(f"Analysis failed: {str(e)}")

# Allow running this page script directly (e.g. `streamlit run <file>`),
# not only through a multipage router that imports show_analysis.
if __name__ == "__main__":
    show_analysis()