File size: 3,279 Bytes
4b141dc
ca5c473
 
 
4b141dc
ca5c473
 
 
 
 
 
 
 
 
 
 
 
 
 
4b141dc
ca5c473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b141dc
ca5c473
 
 
1e4dfde
 
 
 
 
ca5c473
1e4dfde
 
 
ca5c473
1e4dfde
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca5c473
1e4dfde
ca5c473
1e4dfde
 
 
 
 
 
 
 
 
 
 
 
ca5c473
1e4dfde
 
ca5c473
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import streamlit as st
import pandas as pd
import re
from utils.prediction import predict_sentence

def split_sentences_regex(text):
    """Split *text* into sentences on terminal punctuation (., !, ?).

    Newlines are folded into spaces, single and double quotes are stripped,
    and runs of whitespace are collapsed before splitting.

    Returns a list of non-empty sentence strings with their trailing
    punctuation removed.
    """
    # Normalize: drop newlines, strip quote characters, collapse whitespace.
    text = re.sub(r'[\n\r]', ' ', text)
    text = re.sub(r'["\']', '', text)
    text = re.sub(r'\s+', ' ', text)

    # Split on runs of sentence-ending punctuation. The '+' keeps "?!" or
    # "..." from producing empty fragments between adjacent terminators.
    # (Previously only '.' was matched, so '!' and '?' never ended a
    # sentence — restored per the function's stated intent.)
    parts = (s.strip() for s in re.split(r'[.!?]+', text))

    # Single filtering pass: strip first, then drop empties.
    return [s for s in parts if s]

def split_sentences_with_abbrev(text):
    """Split *text* on '. ' boundaries while keeping common abbreviations intact.

    A boundary is NOT treated as a sentence end when the text before it ends
    with a known abbreviation (e.g. "Mr.", "e.g."); in that case the pieces
    are rejoined with '. '.

    Returns a list of sentence strings; all but possibly the last lose their
    trailing period (it is consumed by the '. ' split).
    """
    # Abbreviations that end with a period but do not end a sentence.
    abbreviations = {'mr.', 'mrs.', 'dr.', 'sr.', 'jr.', 'vs.', 'e.g.', 'i.e.', 'etc.'}
    # str.endswith accepts a tuple of suffixes — build it once, outside the loop.
    abbrev_suffixes = tuple(abbreviations)

    parts = text.split('. ')
    sentences = []
    current = parts[0]

    for part in parts[1:]:
        # BUGFIX: split('. ') consumed the period before this boundary, so
        # the original endswith(abbr) comparison (against dotted forms like
        # 'mr.') could never match. Re-append the '.' before comparing.
        if (current.lower() + '.').endswith(abbrev_suffixes):
            # False boundary inside an abbreviation — stitch the parts back.
            current = current + '. ' + part
        else:
            sentences.append(current)
            current = part

    sentences.append(current)
    return sentences

def show_analysis():
    """Render the Text Analysis page.

    Requires 'model', 'label_encoder' and 'tokenizer' to already be present
    (and non-None) in st.session_state — they are loaded on the Home page.
    Splits the user's input into sentences, classifies each one via
    predict_sentence, and shows the results as a dataframe.
    """
    st.title("Text Analysis")

    required_keys = ('model', 'label_encoder', 'tokenizer')

    # Guard: the Home page must have populated the session state first.
    if [k for k in required_keys if k not in st.session_state]:
        st.warning("Model components not found in session state")
        st.info("Please go to the Home page first to load the model")
        return

    # Guard: keys may exist but hold None if loading failed upstream.
    if any(st.session_state[k] is None for k in required_keys):
        st.error("One or more model components failed to load")
        return

    model = st.session_state.model
    label_encoder = st.session_state.label_encoder
    tokenizer = st.session_state.tokenizer

    # Input widgets.
    st.header("Analyze Your Text")
    user_text = st.text_area("Enter your text here (multiple sentences allowed):", height=150)

    if not st.button("Analyze"):
        return
    if not user_text:
        st.warning("Please enter some text to analyze")
        return

    with st.spinner("Analyzing text..."):
        try:
            # Classify sentence by sentence and collect one row per sentence.
            rows = []
            for sentence in split_sentences_regex(user_text):
                label, confidence = predict_sentence(model, sentence, tokenizer, label_encoder)
                rows.append({
                    "Sentence": sentence,
                    "Label": label,
                    "Confidence": f"{confidence:.2%}",
                })

            st.dataframe(pd.DataFrame(rows))

        except Exception as e:
            st.error(f"Analysis failed: {str(e)}")

# Allow running this page script directly (e.g. `streamlit run <file>`),
# not only through a multipage router that imports show_analysis.
if __name__ == "__main__":
    show_analysis()