# Streamlit page: sentence-level text analysis (scrape artifacts removed).
import streamlit as st
import pandas as pd
import re
from utils.prediction import predict_sentence
def split_sentences_regex(text):
    """Split *text* into sentences on terminal punctuation.

    The text is normalized first: newlines become spaces, straight
    single/double quotes are stripped, and whitespace runs collapse to a
    single space.  The split pattern is ``[.!?]+`` so questions and
    exclamations count as sentence boundaries too (the previous pattern
    ``[.]`` silently ignored '!' and '?'), and a run of marks such as
    "..." or "?!" forms one boundary instead of several empty fragments.

    Args:
        text: Raw input text; may contain newlines and quote characters.

    Returns:
        List of non-empty, whitespace-trimmed sentences with the
        terminal punctuation removed.
    """
    # Normalize: flatten newlines, drop quotes, collapse whitespace.
    text = re.sub(r'[\n\r]', ' ', text)
    text = re.sub(r'["\']', '', text)
    text = re.sub(r'\s+', ' ', text)
    # One or more sentence-ending marks form a single boundary.
    pattern = r'[.!?]+'
    # Trim each fragment and discard empty / whitespace-only pieces in
    # one pass (the old code needed a second filtering list).
    return [s.strip() for s in re.split(pattern, text) if s.strip()]
def split_sentences_with_abbrev(text):
    """Split *text* on '. ' boundaries while keeping abbreviations intact.

    ``str.split('. ')`` consumes the period, so the abbreviation test
    must re-append a '.' to the candidate fragment before comparing it
    against the abbreviation set.  (The original compared the fragment
    as-is, e.g. ``'mr'.endswith('mr.')``, which can never be true — the
    merge branch was unreachable and "Mr. Smith" was split apart.)

    Args:
        text: Input text whose sentences are separated by '. '.

    Returns:
        List of sentence fragments.  A fragment merged across an
        abbreviation keeps its internal '. '; otherwise the trailing
        period consumed by the split is not restored (unchanged from
        the original contract).
    """
    # Lower-cased abbreviations, each including its trailing period.
    abbreviations = {'mr.', 'mrs.', 'dr.', 'sr.', 'jr.', 'vs.', 'e.g.', 'i.e.', 'etc.'}
    parts = text.split('. ')
    sentences = []
    current = parts[0]
    for part in parts[1:]:
        # Restore the '.' the split removed before testing for an
        # abbreviation such as 'mr.' or 'e.g.' at the fragment's end.
        candidate = current.lower() + '.'
        if any(candidate.endswith(abbr) for abbr in abbreviations):
            # False boundary: re-join with the '. ' the split consumed.
            current = current + '. ' + part
        else:
            sentences.append(current)
            current = part
    sentences.append(current)
    return sentences
def _render_prediction(model, sentence, tokenizer, label_encoder):
    """Predict the label for one sentence and render its result card.

    Renders nothing when the prediction comes back as "Unknown" or
    "Error" (matching the original skip behavior).
    """
    with st.container():
        label, confidence = predict_sentence(
            model, sentence, tokenizer, label_encoder
        )
        if label not in ("Unknown", "Error"):
            st.write("---")
            st.write(f"**Sentence:** {sentence}")
            st.write(f"**Predicted:** {label}")
            # NOTE(review): st.progress expects a 0-1 float (or 0-100
            # int); assumes predict_sentence returns confidence in that
            # range — confirm against utils.prediction.
            st.progress(confidence)


def show_analysis():
    """Render the "Text Analysis" Streamlit page.

    Requires ``model``, ``label_encoder`` and ``tokenizer`` to be
    present in ``st.session_state`` (initialized on the home page);
    otherwise shows an error and returns early.  Offers a free-text
    analysis box and an optional example section fed from
    ``data/raw/history_01.csv``.
    """
    st.title("Text Analysis")
    st.write("Use this section to analyze the logical structure of your text.")
    try:
        # Guard: the model must be initialized elsewhere first.
        if 'model' not in st.session_state:
            st.error("Please initialize the model from the home page first.")
            return

        model = st.session_state.model
        label_encoder = st.session_state.label_encoder
        tokenizer = st.session_state.tokenizer

        # --- User text input section ---
        st.header("Analyze Your Text")
        user_text = st.text_area("Enter your text here (multiple sentences allowed):", height=150)
        if st.button("Analyze"):
            if user_text:
                sentences = split_sentences_regex(user_text)
                st.subheader("Analysis Results:")
                for sentence in sentences:
                    _render_prediction(model, sentence, tokenizer, label_encoder)
            else:
                st.warning("Please enter some text to analyze.")

        # --- Example analysis section ---
        st.header("Example Analysis")
        show_examples = st.checkbox("Show example analysis", key='show_examples')
        if show_examples:
            try:
                df = pd.read_csv('data/raw/history_01.csv')
                # Limit to 5 examples to keep the page responsive.
                for sentence in df['Sentence'].head(5):
                    _render_prediction(model, sentence, tokenizer, label_encoder)
            except FileNotFoundError:
                st.warning("Example file not found. Please check the data path.")
    except Exception as e:
        # Top-level UI boundary: surface unexpected failures to the user
        # instead of crashing the Streamlit page.
        st.error(f"Error: {str(e)}")
# Allow running this page directly (e.g. `streamlit run <this file>`)
# in addition to being imported as a multipage-app module.
if __name__ == "__main__":
    show_analysis()